Merge pull request #2821 from ducbueno/add-mswells

Reintroduced multisegment wells to OpenCL backend
Markus Blatt 2020-10-01 21:52:25 +02:00 committed by GitHub
commit e8c030be17
5 changed files with 62 additions and 45 deletions

opm/simulators/linalg/bda/WellContributions.cpp

@@ -43,13 +43,13 @@ WellContributions::WellContributions(std::string gpu_mode){
 WellContributions::~WellContributions()
 {
-#if HAVE_CUDA
     // delete MultisegmentWellContributions
     for (auto ms : multisegments) {
         delete ms;
     }
     multisegments.clear();
+#if HAVE_CUDA
     if(cuda_gpu){
         freeCudaMemory(); // should come before 'delete[] h_x'
     }
@@ -147,15 +147,4 @@ void WellContributions::addMultisegmentWellContribution(unsigned int dim_, unsig
     ++num_ms_wells;
 }
-
-void WellContributions::setReordering(int *toOrder_, bool reorder_)
-{
-    this->toOrder = toOrder_;
-    this->reorder = reorder_;
-
-    for (auto& ms : multisegments) {
-        ms->setReordering(toOrder_, reorder_);
-    }
-}
-
 } //namespace Opm
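The destructor hunk above moves the cleanup of the MultisegmentWellContribution pointers out of the #if HAVE_CUDA guard: the wells are owned by WellContributions in every GPU build, so an OpenCL-only build must free them too. A minimal sketch of the pattern, with Well and Contributions as hypothetical stand-ins for the real classes:

    #include <vector>

    struct Well {};  // stand-in for Opm::MultisegmentWellContribution

    class Contributions {
        std::vector<Well*> wells;  // raw owning pointers, as in WellContributions
    public:
        ~Contributions() {
            for (auto* w : wells) {  // always delete, independent of the backend
                delete w;
            }
            wells.clear();
    #if HAVE_CUDA
            // only the CUDA-specific buffers stay behind the compile-time guard
    #endif
        }
    };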

opm/simulators/linalg/bda/WellContributions.hpp

@@ -72,25 +72,21 @@ public:
     unsigned int dim; // number of columns in blocks in B and C, equal to StandardWell::numEq
     unsigned int dim_wells; // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq
+    std::vector<MultisegmentWellContribution*> multisegments;

 #if HAVE_OPENCL
     std::vector<double> h_Cnnzs_ocl, h_Dnnzs_ocl, h_Bnnzs_ocl;
     std::vector<int> h_Ccols_ocl, h_Bcols_ocl;
     std::vector<unsigned int> h_val_pointers_ocl;
-    std::vector<double> h_x_ocl, h_y_ocl;
-
-    int *toOrder = nullptr;
-    bool reorder = false;
 #endif

 private:
-    unsigned int num_ms_wells = 0; // number of MultisegmentWells in this object, must equal multisegments.size()
-    unsigned int N; // number of rows (not blockrows) in vectors x and y
-    std::vector<MultisegmentWellContribution*> multisegments;
     bool opencl_gpu = false;
     bool cuda_gpu = false;
+    unsigned int N; // number of rows (not blockrows) in vectors x and y
+    unsigned int num_ms_wells = 0; // number of MultisegmentWells in this object, must equal multisegments.size()

 #if HAVE_CUDA
     bool allocated = false;
     unsigned int num_blocks = 0; // total number of blocks in all wells
@@ -127,10 +123,6 @@ private:
 #endif

 public:
-    //#if HAVE_OPENCL
-    //    void applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y);
-    //#endif
-
 #if HAVE_CUDA
     /// Set a cudaStream to be used
     /// \param[in] stream the cudaStream that is used to launch the kernel in
@@ -194,12 +186,6 @@ public:
                                          unsigned int DnumBlocks, double *Dvalues,
                                          UMFPackIndex *DcolPointers, UMFPackIndex *DrowIndices,
                                          std::vector<double> &Cvalues);
-
-    /// If the rows of the matrix are reordered, the columnindices of the matrixdata are incorrect
-    /// Those indices need to be mapped via toOrder
-    /// \param[in] toOrder array with mappings
-    /// \param[in] reorder whether the columnindices need to be reordered or not
-    void setReordering(int *toOrder, bool reorder);
 };

 } //namespace Opm
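Making multisegments a public member (while dropping the toOrder/reorder state and setReordering()) is what lets the OpenCL container take the wells over directly, instead of routing reordering through WellContributions. A sketch of that hand-over, with hypothetical minimal types in place of Opm::WellContributions and bda::WellContributionsOCLContainer:

    #include <cstddef>
    #include <utility>
    #include <vector>

    struct MSWell {};  // stand-in for Opm::MultisegmentWellContribution

    struct Producer {                        // role of WellContributions
        std::vector<MSWell*> multisegments;  // public, so it can be adopted
    };

    struct Container {                       // role of WellContributionsOCLContainer
        std::vector<MSWell*> multisegments;
        std::size_t num_ms_wells = 0;

        void adopt(Producer& p) {
            if (!p.multisegments.empty()) {
                // steal the pointers; p.multisegments is left empty, so the
                // producer's destructor has nothing left to delete
                multisegments = std::move(p.multisegments);
                num_ms_wells = multisegments.size();
            }
        }
    };

Because std::move leaves the source vector empty, the unconditional delete loop in ~WellContributions() and the container's own destructor never free the same pointer twice.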

opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp

@@ -24,14 +24,14 @@
 #include <dune/common/timer.hh>
 #include <opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp>
-#include<iostream>

 namespace bda
 {
     using Opm::OpmLog;
     using Dune::Timer;

-    void WellContributionsOCLContainer::init(Opm::WellContributions &wellContribs, int Nb_){
+    void WellContributionsOCLContainer::init(Opm::WellContributions &wellContribs, int N_, int Nb_){
+        N = N_;
         Nb = Nb_;
         dim = wellContribs.dim;
         dim_wells = wellContribs.dim_wells;
@@ -48,9 +48,6 @@ namespace bda
             s.val_pointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size());
             s.toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
         }
-        else{
-            num_std_wells = 0;
-        }
     }

     void WellContributionsOCLContainer::reinit(Opm::WellContributions &wellContribs){
@@ -65,10 +62,10 @@ namespace bda
         s.val_pointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size());
     }

-    void WellContributionsOCLContainer::copy_to_gpu(Opm::WellContributions &wellContribs){
-        if(num_std_wells > 0){
-            toOrder.insert(toOrder.end(), wellContribs.toOrder, wellContribs.toOrder + Nb);
+    void WellContributionsOCLContainer::copy_to_gpu(Opm::WellContributions &wellContribs, int *toOrder_){
+        toOrder.insert(toOrder.end(), toOrder_, toOrder_ + Nb);
+        if(num_std_wells > 0){
             cl::Event event;
             std::vector<cl::Event> events(7);
             queue->enqueueWriteBuffer(s.Cnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Cnnzs_ocl.size(), wellContribs.h_Cnnzs_ocl.data(), nullptr, &events[0]);
@@ -80,6 +77,13 @@ namespace bda
             queue->enqueueWriteBuffer(s.toOrder, CL_FALSE, 0, sizeof(int) * toOrder.size(), toOrder.data(), nullptr, &events[6]);
             event.waitForEvents(events);
         }
+
+        if(!wellContribs.multisegments.empty()){
+            multisegments = std::move(wellContribs.multisegments);
+            num_ms_wells = multisegments.size();
+            x_msw.reserve(N);
+            y_msw.reserve(N);
+        }
     }

     void WellContributionsOCLContainer::update_on_gpu(Opm::WellContributions &wellContribs){
@@ -98,6 +102,10 @@ namespace bda
             queue->enqueueWriteBuffer(s.val_pointers, CL_FALSE, 0, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size(), wellContribs.h_val_pointers_ocl.data(), nullptr, &events[5]);
             event.waitForEvents(events);
         }
+
+        if(!wellContribs.multisegments.empty()){
+            multisegments = std::move(wellContribs.multisegments);
+        }
     }

     void WellContributionsOCLContainer::setOpenCLContext(cl::Context *context_){
@@ -127,13 +135,42 @@ namespace bda
                               cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
     }

+    void WellContributionsOCLContainer::applyMSWells(cl::Buffer& x, cl::Buffer& y) {
+        cl::Event event;
+        std::vector<cl::Event> events(2);
+
+        // copy vectors x and y from GPU to CPU
+        queue->enqueueReadBuffer(x, CL_FALSE, 0, sizeof(double) * N, x_msw.data(), nullptr, &events[0]);
+        queue->enqueueReadBuffer(y, CL_FALSE, 0, sizeof(double) * N, y_msw.data(), nullptr, &events[1]);
+        event.waitForEvents(events);
+
+        // actually apply MultisegmentWells
+        for(Opm::MultisegmentWellContribution *well: multisegments){
+            well->setReordering(toOrder.data(), true);
+            well->apply(x_msw.data(), y_msw.data());
+        }
+
+        // copy vector y from CPU to GPU
+        queue->enqueueWriteBuffer(y, CL_FALSE, 0, sizeof(double) * N, y_msw.data(), nullptr, &event);
+        event.wait();
+    }
+
     void WellContributionsOCLContainer::apply(cl::Buffer& x, cl::Buffer& y){
         if(num_std_wells > 0){
             applyStdWells(x, y);
         }
+
+        if(num_ms_wells > 0){
+            applyMSWells(x, y);
+        }
     }
+
+    WellContributionsOCLContainer::~WellContributionsOCLContainer(){
+        toOrder.clear();
+
+        if(num_ms_wells > 0){
+            for (auto ms : multisegments) {
+                delete ms;
+            }
+        }
+    }
 } // end namespace bda
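applyMSWells() is the heart of the reintroduction: there is no OpenCL kernel for multisegment wells yet, so x and y are staged through host memory, each well applies its contribution (roughly y -= C^T * D^-1 * B * x) on the CPU, and the updated y is written back. A condensed, blocking sketch of that round trip; the real code overlaps the two reads with events, the header name assumes the standard OpenCL C++ bindings, and apply_wells_on_host is a hypothetical stand-in for the loop over multisegments:

    #include <cstddef>
    #include <vector>
    #include <CL/cl2.hpp>

    // hypothetical placeholder for calling well->apply(x, y) on each well
    void apply_wells_on_host(std::vector<double>& x, std::vector<double>& y)
    {
        (void)x; (void)y;  // the real loop updates y from x well by well
    }

    void apply_ms_wells(cl::CommandQueue& queue, cl::Buffer& x, cl::Buffer& y,
                        std::size_t N)
    {
        std::vector<double> x_host(N), y_host(N);

        // GPU -> CPU: blocking reads for brevity
        queue.enqueueReadBuffer(x, CL_TRUE, 0, sizeof(double) * N, x_host.data());
        queue.enqueueReadBuffer(y, CL_TRUE, 0, sizeof(double) * N, y_host.data());

        // apply the multisegment well contributions on the CPU
        apply_wells_on_host(x_host, y_host);

        // CPU -> GPU: only y changed, so only y goes back
        queue.enqueueWriteBuffer(y, CL_TRUE, 0, sizeof(double) * N, y_host.data());
    }

One detail worth noting in copy_to_gpu() above: x_msw and y_msw are only reserve()d before their storage is filled through data(); a resize(N) would also make size() reflect the N doubles actually written.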

opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp

@@ -22,18 +22,22 @@
 #include <opm/simulators/linalg/bda/opencl.hpp>
 #include <opm/simulators/linalg/bda/WellContributions.hpp>
+#include <opm/simulators/linalg/bda/MultisegmentWellContribution.hpp>

 namespace bda
 {
     class WellContributionsOCLContainer
     {
     private:
+        int N, Nb;
         unsigned int dim, dim_wells;
         unsigned int num_blocks = 0;
         unsigned int num_std_wells = 0;
+        unsigned int num_ms_wells = 0; // number of MultisegmentWells in this object, must equal multisegments.size()
-        int Nb;
         std::vector<int> toOrder;
+        std::vector<double> x_msw, y_msw;
+        std::vector<Opm::MultisegmentWellContribution*> multisegments;

         typedef struct {
             cl::Buffer Cnnzs, Dnnzs, Bnnzs;
@@ -51,14 +55,16 @@ namespace bda
         void reinit(Opm::WellContributions &wellContribs);

         void applyStdWells(cl::Buffer& x, cl::Buffer& y);
+        void applyMSWells(cl::Buffer& x, cl::Buffer& y);

     public:
         WellContributionsOCLContainer() {};
+        ~WellContributionsOCLContainer();
         WellContributionsOCLContainer(const WellContributionsOCLContainer&) = delete;

         void apply(cl::Buffer& x, cl::Buffer& y);
-        void init(Opm::WellContributions &wellContribs, int Nb);
-        void copy_to_gpu(Opm::WellContributions &wellContribs);
+        void init(Opm::WellContributions &wellContribs, int N, int Nb);
+        void copy_to_gpu(Opm::WellContributions &wellContribs, int *toOrder_);
         void update_on_gpu(Opm::WellContributions &wellContribs);
         void setOpenCLContext(cl::Context *context);
         void setOpenCLQueue(cl::CommandQueue *queue);
opm/simulators/linalg/bda/openclSolverBackend.cpp

@@ -496,7 +496,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
     d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
     d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));

-    wcontainer->init(wellContribs, Nb);
+    wcontainer->init(wellContribs, N, Nb);

     // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
     // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
@@ -566,8 +566,7 @@ void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions &well
     queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
     event.wait();

-    wellContribs.setReordering(toOrder, true);
-    wcontainer->copy_to_gpu(wellContribs);
+    wcontainer->copy_to_gpu(wellContribs, toOrder);

    if (verbosity > 2) {
        std::ostringstream out;
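The toOrder array handed to copy_to_gpu() here is the row permutation that the removed WellContributions::setReordering() documented: after the matrix rows are reordered, indices stored by the wells still refer to the original numbering and must be mapped before indexing into the reordered vectors. A one-entry-wide sketch of that lookup; read_entry is illustrative, and the mapping direction is inferred from the removed doc comment:

    #include <vector>

    // map an index through the reordering before reading from the permuted
    // vector; with reorder == false the original numbering is used directly
    double read_entry(const std::vector<double>& x,
                      const std::vector<int>& toOrder,
                      int original_index, bool reorder)
    {
        const int idx = reorder ? toOrder[original_index] : original_index;
        return x[idx];
    }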