Store OpenCL context, queue and kernel in WellContributions; merge copyDataToGPU() into init()

2025-02-25 18:55:30 -06:00 · 2020-09-03 09:46:44 -03:00
parent 4d10d9ac76
commit c7adc3495f
4 changed files with 40 additions and 31 deletions
--- a/opm/simulators/linalg/bda/WellContributions.cpp
+++ b/opm/simulators/linalg/bda/WellContributions.cpp
@@ -102,8 +102,19 @@ WellContributions::~WellContributions()
 }

 #if HAVE_OPENCL
+void WellContributions::setOpenCLContext(cl::Context *context_){
+    this->context = context_;
+}

-void WellContributions::init(cl::Context *context){
+void WellContributions::setOpenCLQueue(cl::CommandQueue *queue_){
+    this->queue = queue_;
+}
+
+void WellContributions::setKernel(kernel_type *stdwell_apply_){
+    this->stdwell_apply = stdwell_apply_;
+}
+
+void WellContributions::init(){
    if(num_std_wells > 0){
        d_Cnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
        d_Dnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
@@ -111,24 +122,17 @@ void WellContributions::init(cl::Context *context){
        d_Ccols_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
        d_Bcols_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
        d_val_pointers_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
-    }
-}
-
-void WellContributions::copyDataToGPU(cl::CommandQueue *queue){
-    if(num_std_wells > 0){
-        cl::Event event;

        queue->enqueueWriteBuffer(d_Cnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_blocks * dim * dim_wells, h_Cnnzs_ocl);
        queue->enqueueWriteBuffer(d_Dnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_std_wells * dim_wells * dim_wells, h_Dnnzs_ocl);
        queue->enqueueWriteBuffer(d_Bnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_blocks * dim * dim_wells, h_Bnnzs_ocl);
        queue->enqueueWriteBuffer(d_Ccols_ocl, CL_TRUE, 0, sizeof(int) * num_blocks, h_Ccols_ocl);
        queue->enqueueWriteBuffer(d_Bcols_ocl, CL_TRUE, 0, sizeof(int) * num_blocks, h_Bcols_ocl);
-        queue->enqueueWriteBuffer(d_val_pointers_ocl, CL_TRUE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers, nullptr, &event);
-        event.wait();
+        queue->enqueueWriteBuffer(d_val_pointers_ocl, CL_TRUE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers);
    }
 }

-void WellContributions::applyMSWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y) {
+void WellContributions::applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y) {
    // apply MultisegmentWells
    if (num_ms_wells > 0) {
        // allocate pinned memory on host if not yet done
@@ -151,26 +155,25 @@ void WellContributions::applyMSWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl
    }
 }

-void WellContributions::applyStdWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y, kernel_type *kernel){
+void WellContributions::applyStdWell(cl::Buffer& d_x, cl::Buffer& d_y){
    const unsigned int work_group_size = 32;
    const unsigned int total_work_items = num_std_wells * work_group_size;
    const unsigned int lmem1 = sizeof(double) * work_group_size;
    const unsigned int lmem2 = sizeof(double) * dim_wells;

    cl::Event event;
-    event = (*kernel)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
-                      d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells,
-                      d_val_pointers_ocl, cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
-    event.wait();
+    event = (*stdwell_apply)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
+                             d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells,
+                             d_val_pointers_ocl, cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
 }

-void WellContributions::apply(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y, kernel_type *kernel){
+void WellContributions::apply(cl::Buffer& d_x, cl::Buffer& d_y){
    if(num_std_wells > 0){
-        applyStdWell(queue, d_x, d_y, kernel);
+        applyStdWell(d_x, d_y);
    }

    if(num_ms_wells > 0){
-        applyMSWell(queue, d_x, d_y);
+        applyMSWell(d_x, d_y);
    }
 }

--- a/opm/simulators/linalg/bda/WellContributions.hpp
+++ b/opm/simulators/linalg/bda/WellContributions.hpp
@@ -115,6 +115,9 @@ private:
                            cl::Buffer&, cl::Buffer&, cl::Buffer&,
                            cl::Buffer&, const unsigned int, const unsigned int,
                            cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type;
+    kernel_type *stdwell_apply;
+    cl::Context *context;
+    cl::CommandQueue *queue;
 #endif

 #if HAVE_CUDA
@@ -133,8 +136,8 @@ private:
 #endif

 #if HAVE_OPENCL
-    void applyStdWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y, kernel_type *kernel);
-    void applyMSWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y);
+    void applyStdWell(cl::Buffer& d_x, cl::Buffer& d_y);
+    void applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y);
 #endif

 public:
@@ -155,9 +158,11 @@ public:
 #endif

 #if HAVE_OPENCL
-    void init(cl::Context *context);
-    void copyDataToGPU(cl::CommandQueue *queue);
-    void apply(cl::CommandQueue *queue, cl::Buffer& x, cl::Buffer& y, kernel_type *kernel);
+    void init();
+    void apply(cl::Buffer& x, cl::Buffer& y);
+    void setOpenCLContext(cl::Context *context);
+    void setOpenCLQueue(cl::CommandQueue *queue);
+    void setKernel(kernel_type *stdwell_apply);
 #endif

    /// Create a new WellContributions
--- a/opm/simulators/linalg/bda/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp
@@ -242,7 +242,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr

        // apply wellContributions
        t_well.start();
-        wellContribs.apply(queue.get(), d_pw, d_v, add_well_contributions_k.get());
+        wellContribs.apply(d_pw, d_v);
        t_well.stop();

        t_rest.start();
@@ -271,7 +271,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr

        // apply wellContributions
        t_well.start();
-        wellContribs.apply(queue.get(), d_s, d_t, add_well_contributions_k.get());
+        wellContribs.apply(d_s, d_t);
        t_well.stop();

        t_rest.start();
@@ -495,7 +495,9 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
        d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
        d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));

-        wellContribs.init(context.get());
+        wellContribs.setOpenCLContext(context.get());
+        wellContribs.setOpenCLQueue(queue.get());
+        wellContribs.init();

        // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
        // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
@@ -511,6 +513,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
        add_well_contributions_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program_, "add_well_contributions")));

        prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());
+        wellContribs.setKernel(add_well_contributions_k.get());

    } catch (const cl::Error& error) {
        std::ostringstream oss;
@@ -539,7 +542,7 @@ void openclSolverBackend<block_size>::finalize() {


 template <unsigned int block_size>
-void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions& wellContribs) {
+void openclSolverBackend<block_size>::copy_system_to_gpu() {
    Timer t;
    cl::Event event;

@@ -561,8 +564,6 @@ void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions& well
    queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
    event.wait();

-    wellContribs.copyDataToGPU(queue.get());
-
    if (verbosity > 2) {
        std::ostringstream out;
        out << "openclSolver::copy_system_to_gpu(): " << t.stop() << " s";
@@ -716,7 +717,7 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
        if (!create_preconditioner()) {
            return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
        }
-        copy_system_to_gpu(wellContribs);
+        copy_system_to_gpu();
    } else {
        update_system(vals, b);
        if (!create_preconditioner()) {
--- a/opm/simulators/linalg/bda/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp
@@ -155,7 +155,7 @@ private:
    void finalize();

    /// Copy linear system to GPU
-    void copy_system_to_gpu(WellContributions& wellContribs);
+    void copy_system_to_gpu();

    /// Reorder the linear system so it corresponds with the coloring
    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values