Merge pull request #2821 from ducbueno/add-mswells

Reintroduced multisegment wells to OpenCL backend
2025-02-25 18:55:30 -06:00 · 2020-10-01 21:52:25 +02:00 · 2020-10-01 21:52:25 +02:00 · e8c030be17
commit e8c030be17
parent 1109aeecfc fbbb21d482
5 changed files with 62 additions and 45 deletions
--- a/opm/simulators/linalg/bda/WellContributions.cpp
+++ b/opm/simulators/linalg/bda/WellContributions.cpp
@ -43,13 +43,13 @@ WellContributions::WellContributions(std::string gpu_mode){
 WellContributions::~WellContributions()
 {
 #if HAVE_CUDA
    // delete MultisegmentWellContributions
    for (auto ms : multisegments) {
        delete ms;
    }
    multisegments.clear();
 #if HAVE_CUDA
    if(cuda_gpu){
        freeCudaMemory(); // should come before 'delete[] h_x'
    }
@ -147,15 +147,4 @@ void WellContributions::addMultisegmentWellContribution(unsigned int dim_, unsig
    ++num_ms_wells;
 }
 void WellContributions::setReordering(int *toOrder_, bool reorder_)
 {
    this->toOrder = toOrder_;
    this->reorder = reorder_;
    for (auto& ms : multisegments) {
        ms->setReordering(toOrder_, reorder_);
    }
 }
 } //namespace Opm
--- a/opm/simulators/linalg/bda/WellContributions.hpp
+++ b/opm/simulators/linalg/bda/WellContributions.hpp
@ -72,25 +72,21 @@ public:
    unsigned int dim;                        // number of columns in blocks in B and C, equal to StandardWell::numEq
    unsigned int dim_wells;                  // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq
    std::vector<MultisegmentWellContribution*> multisegments;
 #if HAVE_OPENCL
    std::vector<double> h_Cnnzs_ocl, h_Dnnzs_ocl, h_Bnnzs_ocl;
    std::vector<int> h_Ccols_ocl, h_Bcols_ocl;
    std::vector<unsigned int> h_val_pointers_ocl;
    std::vector<double> h_x_ocl, h_y_ocl;
    int *toOrder = nullptr;
    bool reorder = false;
 #endif
 private:
    unsigned int num_ms_wells = 0;           // number of MultisegmentWells in this object, must equal multisegments.size()
    unsigned int N;                          // number of rows (not blockrows) in vectors x and y
    std::vector<MultisegmentWellContribution*> multisegments;
    bool opencl_gpu = false;
    bool cuda_gpu = false;
    unsigned int N;                          // number of rows (not blockrows) in vectors x and y
    unsigned int num_ms_wells = 0;           // number of MultisegmentWells in this object, must equal multisegments.size()
 #if HAVE_CUDA
    bool allocated = false;
    unsigned int num_blocks = 0;             // total number of blocks in all wells
@ -127,10 +123,6 @@ private:
 #endif
 public:
 //#if HAVE_OPENCL
 //    void applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y);
 //#endif
 #if HAVE_CUDA
    /// Set a cudaStream to be used
    /// \param[in] stream           the cudaStream that is used to launch the kernel in
@ -194,12 +186,6 @@ public:
                                         unsigned int DnumBlocks, double *Dvalues,
                                         UMFPackIndex *DcolPointers, UMFPackIndex *DrowIndices,
                                         std::vector<double> &Cvalues);
    /// If the rows of the matrix are reordered, the columnindices of the matrixdata are incorrect
    /// Those indices need to be mapped via toOrder
    /// \param[in] toOrder    array with mappings
    /// \param[in] reorder    whether the columnindices need to be reordered or not
    void setReordering(int *toOrder, bool reorder);
 };
 } //namespace Opm
--- a/opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp
+++ b/opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp
@ -24,14 +24,14 @@
 #include <dune/common/timer.hh>
 #include <opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp>
 #include<iostream>
 namespace bda
 {
    using Opm::OpmLog;
    using Dune::Timer;
-    void WellContributionsOCLContainer::init(Opm::WellContributions &wellContribs, int Nb_){
+    void WellContributionsOCLContainer::init(Opm::WellContributions &wellContribs, int N_, int Nb_){
        N = N_;
        Nb = Nb_;
        dim = wellContribs.dim;
        dim_wells = wellContribs.dim_wells;
@ -48,9 +48,6 @@ namespace bda
            s.val_pointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size());
            s.toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
        }
        else{
            num_std_wells = 0;
        }
    }
    void WellContributionsOCLContainer::reinit(Opm::WellContributions &wellContribs){
@ -65,10 +62,10 @@ namespace bda
        s.val_pointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size());
    }
-    void WellContributionsOCLContainer::copy_to_gpu(Opm::WellContributions &wellContribs){
+    void WellContributionsOCLContainer::copy_to_gpu(Opm::WellContributions &wellContribs, int *toOrder_){
-        if(num_std_wells > 0){
+        toOrder.insert(toOrder.end(), toOrder_, toOrder_ + Nb);
            toOrder.insert(toOrder.end(), wellContribs.toOrder, wellContribs.toOrder + Nb);
        if(num_std_wells > 0){
            cl::Event event;
            std::vector<cl::Event> events(7);
            queue->enqueueWriteBuffer(s.Cnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Cnnzs_ocl.size(), wellContribs.h_Cnnzs_ocl.data(), nullptr, &events[0]);
@ -80,6 +77,13 @@ namespace bda
            queue->enqueueWriteBuffer(s.toOrder, CL_FALSE, 0, sizeof(int) * toOrder.size(), toOrder.data(), nullptr, &events[6]);
            event.waitForEvents(events);
        }
        if(!wellContribs.multisegments.empty()){
            multisegments = std::move(wellContribs.multisegments);
            num_ms_wells = multisegments.size();
            x_msw.reserve(N);
            y_msw.reserve(N);
        }
    }
    void WellContributionsOCLContainer::update_on_gpu(Opm::WellContributions &wellContribs){
@ -98,6 +102,10 @@ namespace bda
            queue->enqueueWriteBuffer(s.val_pointers, CL_FALSE, 0, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size(), wellContribs.h_val_pointers_ocl.data(), nullptr, &events[5]);
            event.waitForEvents(events);
        }
        if(!wellContribs.multisegments.empty()){
            multisegments = std::move(wellContribs.multisegments);
        }
    }
    void WellContributionsOCLContainer::setOpenCLContext(cl::Context *context_){
@ -127,13 +135,42 @@ namespace bda
                                 cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
    }
    /// Apply the multisegment wells on the host.
    /// Vectors x and y are copied from the device into x_msw and y_msw, every
    /// MultisegmentWellContribution in 'multisegments' is applied to those host
    /// copies, and y_msw is then copied back into the device buffer y.
    /// \param[in]    x   device buffer with vector x, N doubles are read from it
    /// \param[inout] y   device buffer with vector y, N doubles are read from and written to it
    void WellContributionsOCLContainer::applyMSWells(cl::Buffer& x, cl::Buffer& y) {
        // copy_to_gpu() only reserve()s the host vectors, which leaves size() == 0.
        // Make sure they really hold N elements before N doubles are written
        // through data(); this is a no-op when the size is already N.
        x_msw.resize(N);
        y_msw.resize(N);

        cl::Event event;
        std::vector<cl::Event> events(2);

        // copy vectors x and y from GPU to CPU
        // both reads are enqueued non-blocking, completion is awaited below
        queue->enqueueReadBuffer(x, CL_FALSE, 0, sizeof(double) * N, x_msw.data(), nullptr, &events[0]);
        queue->enqueueReadBuffer(y, CL_FALSE, 0, sizeof(double) * N, y_msw.data(), nullptr, &events[1]);
        event.waitForEvents(events);

        // actually apply MultisegmentWells
        for(Opm::MultisegmentWellContribution *well: multisegments){
            // every well gets the container's toOrder mapping with reordering enabled
            well->setReordering(toOrder.data(), true);
            well->apply(x_msw.data(), y_msw.data());
        }

        // copy vector y from CPU to GPU
        queue->enqueueWriteBuffer(y, CL_FALSE, 0, sizeof(double) * N, y_msw.data(), nullptr, &event);
        event.wait();
    }
    /// Apply all well contributions held by this container to x and y.
    /// Standard wells are handled first, multisegment wells afterwards;
    /// each kind is skipped when its counter is zero.
    /// \param[in]    x   device buffer passed on to the well kernels
    /// \param[inout] y   device buffer passed on to the well kernels
    void WellContributionsOCLContainer::apply(cl::Buffer& x, cl::Buffer& y){
        // the counters are unsigned, so "non-zero" is the same as "> 0"
        if(num_std_wells != 0){
            applyStdWells(x, y);
        }

        if(num_ms_wells != 0){
            applyMSWells(x, y);
        }
    }
    WellContributionsOCLContainer::~WellContributionsOCLContainer(){
-        toOrder.clear();
+        if(num_ms_wells > 0){
            for (auto ms : multisegments) {
                delete ms;
            }
        }
    }
 } // end namespace bda
--- a/opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp
+++ b/opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp
@ -22,18 +22,22 @@
 #include <opm/simulators/linalg/bda/opencl.hpp>
 #include <opm/simulators/linalg/bda/WellContributions.hpp>
 #include <opm/simulators/linalg/bda/MultisegmentWellContribution.hpp>
 namespace bda
 {
    class WellContributionsOCLContainer
    {
    private:
        int N, Nb;
        unsigned int dim, dim_wells;
        unsigned int num_blocks = 0;
        unsigned int num_std_wells = 0;
        unsigned int num_ms_wells = 0;           // number of MultisegmentWells in this object, must equal multisegments.size()
-        int Nb;
+
        std::vector<int> toOrder;
        std::vector<double> x_msw, y_msw;
        std::vector<Opm::MultisegmentWellContribution*> multisegments;
        typedef struct {
            cl::Buffer Cnnzs, Dnnzs, Bnnzs;
@ -51,14 +55,16 @@ namespace bda
        void reinit(Opm::WellContributions &wellContribs);
        void applyStdWells(cl::Buffer& x, cl::Buffer& y);
        void applyMSWells(cl::Buffer& x, cl::Buffer& y);
    public:
        WellContributionsOCLContainer() {};
        ~WellContributionsOCLContainer();
        WellContributionsOCLContainer(const WellContributionsOCLContainer&) = delete;
        void apply(cl::Buffer& x, cl::Buffer& y);
-        void init(Opm::WellContributions &wellContribs, int Nb);
+        void init(Opm::WellContributions &wellContribs, int N, int Nb);
-        void copy_to_gpu(Opm::WellContributions &wellContribs);
+        void copy_to_gpu(Opm::WellContributions &wellContribs, int *toOrder_);
        void update_on_gpu(Opm::WellContributions &wellContribs);
        void setOpenCLContext(cl::Context *context);
        void setOpenCLQueue(cl::CommandQueue *queue);
--- a/opm/simulators/linalg/bda/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp
@ -496,7 +496,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
        d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
        d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
-        wcontainer->init(wellContribs, Nb);
+        wcontainer->init(wellContribs, N, Nb);
        // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
        // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
@ -566,8 +566,7 @@ void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions &well
    queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
    event.wait();
-    wellContribs.setReordering(toOrder, true);
+    wcontainer->copy_to_gpu(wellContribs, toOrder);
    wcontainer->copy_to_gpu(wellContribs);
    if (verbosity > 2) {
        std::ostringstream out;