Merge pull request #2762 from ducbueno/opencl-stdwell-clean

Fixed out of resources problem
2025-02-25 18:55:30 -06:00 · 2020-09-03 19:50:39 +02:00 · 2020-09-03 19:50:39 +02:00 · 53005c477d
commit 53005c477d
parent 3ee0db8cf4 525faf7b5d
4 changed files with 63 additions and 47 deletions
--- a/opm/simulators/linalg/bda/WellContributions.cpp
+++ b/opm/simulators/linalg/bda/WellContributions.cpp
@@ -102,8 +102,19 @@ WellContributions::~WellContributions()
 }

 #if HAVE_OPENCL
+void WellContributions::setOpenCLContext(cl::Context *context_){
+    this->context = context_;
+}

-void WellContributions::init(cl::Context *context){
+void WellContributions::setOpenCLQueue(cl::CommandQueue *queue_){
+    this->queue = queue_;
+}
+
+void WellContributions::setKernel(kernel_type *stdwell_apply_){
+    this->stdwell_apply = stdwell_apply_;
+}
+
+void WellContributions::init(){
    if(num_std_wells > 0){
        d_Cnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
        d_Dnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
@@ -111,24 +122,17 @@ void WellContributions::init(cl::Context *context){
        d_Ccols_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
        d_Bcols_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
        d_val_pointers_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
-    }
-}
-
-void WellContributions::copyDataToGPU(cl::CommandQueue *queue){
-    if(num_std_wells > 0){
-        cl::Event event;

        queue->enqueueWriteBuffer(d_Cnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_blocks * dim * dim_wells, h_Cnnzs_ocl);
        queue->enqueueWriteBuffer(d_Dnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_std_wells * dim_wells * dim_wells, h_Dnnzs_ocl);
        queue->enqueueWriteBuffer(d_Bnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_blocks * dim * dim_wells, h_Bnnzs_ocl);
        queue->enqueueWriteBuffer(d_Ccols_ocl, CL_TRUE, 0, sizeof(int) * num_blocks, h_Ccols_ocl);
        queue->enqueueWriteBuffer(d_Bcols_ocl, CL_TRUE, 0, sizeof(int) * num_blocks, h_Bcols_ocl);
-        queue->enqueueWriteBuffer(d_val_pointers_ocl, CL_TRUE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers, nullptr, &event);
-        event.wait();
+        queue->enqueueWriteBuffer(d_val_pointers_ocl, CL_TRUE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers);
    }
 }

-void WellContributions::applyMSWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y) {
+void WellContributions::applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y) {
    // apply MultisegmentWells
    if (num_ms_wells > 0) {
        // allocate pinned memory on host if not yet done
@@ -151,26 +155,25 @@ void WellContributions::applyMSWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl
    }
 }

-void WellContributions::applyStdWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y, kernel_type *kernel){
+void WellContributions::applyStdWell(cl::Buffer& d_x, cl::Buffer& d_y){
    const unsigned int work_group_size = 32;
    const unsigned int total_work_items = num_std_wells * work_group_size;
    const unsigned int lmem1 = sizeof(double) * work_group_size;
    const unsigned int lmem2 = sizeof(double) * dim_wells;

    cl::Event event;
-    event = (*kernel)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
-                      d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells,
-                      d_val_pointers_ocl, cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
-    event.wait();
+    event = (*stdwell_apply)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
+                             d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells,
+                             d_val_pointers_ocl, cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
 }

-void WellContributions::apply(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y, kernel_type *kernel){
+void WellContributions::apply(cl::Buffer& d_x, cl::Buffer& d_y){
    if(num_std_wells > 0){
-        applyStdWell(queue, d_x, d_y, kernel);
+        applyStdWell(d_x, d_y);
    }

    if(num_ms_wells > 0){
-        applyMSWell(queue, d_x, d_y);
+        applyMSWell(d_x, d_y);
    }
 }

--- a/opm/simulators/linalg/bda/WellContributions.hpp
+++ b/opm/simulators/linalg/bda/WellContributions.hpp
@@ -115,6 +115,9 @@ private:
                            cl::Buffer&, cl::Buffer&, cl::Buffer&,
                            cl::Buffer&, const unsigned int, const unsigned int,
                            cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type;
+    kernel_type *stdwell_apply;
+    cl::Context *context;
+    cl::CommandQueue *queue;
 #endif

 #if HAVE_CUDA
@@ -133,8 +136,8 @@ private:
 #endif

 #if HAVE_OPENCL
-    void applyStdWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y, kernel_type *kernel);
-    void applyMSWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y);
+    void applyStdWell(cl::Buffer& d_x, cl::Buffer& d_y);
+    void applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y);
 #endif

 public:
@@ -155,9 +158,11 @@ public:
 #endif

 #if HAVE_OPENCL
-    void init(cl::Context *context);
-    void copyDataToGPU(cl::CommandQueue *queue);
-    void apply(cl::CommandQueue *queue, cl::Buffer& x, cl::Buffer& y, kernel_type *kernel);
+    void init();
+    void apply(cl::Buffer& x, cl::Buffer& y);
+    void setOpenCLContext(cl::Context *context);
+    void setOpenCLQueue(cl::CommandQueue *queue);
+    void setKernel(kernel_type *stdwell_apply);
 #endif

    /// Create a new WellContributions
--- a/opm/simulators/linalg/bda/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp
@@ -242,7 +242,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr

        // apply wellContributions
        t_well.start();
-        wellContribs.apply(queue.get(), d_pw, d_v, add_well_contributions_k.get());
+        wellContribs.apply(d_pw, d_v);
        t_well.stop();

        t_rest.start();
@@ -271,7 +271,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr

        // apply wellContributions
        t_well.start();
-        wellContribs.apply(queue.get(), d_s, d_t, add_well_contributions_k.get());
+        wellContribs.apply(d_s, d_t);
        t_well.stop();

        t_rest.start();
@@ -319,7 +319,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr


 template <unsigned int block_size>
-void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, WellContributions& wellContribs) {
+void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) {
    this->N = N_;
    this->nnz = nnz_;
    this->nnzb = nnz_ / block_size / block_size;
@@ -462,9 +462,9 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
        source.emplace_back(std::make_pair(ILU_apply1_s, strlen(ILU_apply1_s)));
        source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
        source.emplace_back(std::make_pair(add_well_contributions_s, strlen(add_well_contributions_s)));
-        cl::Program program_ = cl::Program(*context, source);
+        program = cl::Program(*context, source);

-        program_.build(devices);
+        program.build(devices);

        cl::Event event;
        queue.reset(new cl::CommandQueue(*context, devices[deviceID], 0, &err));
@@ -495,20 +495,17 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
        d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
        d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));

-        wellContribs.init(context.get());
-
        // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
        // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking

        // actually creating the kernels
-        dot_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "dot_1")));
-        norm_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "norm")));
-        axpy_k.reset(new cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program_, "axpy")));
-        custom_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program_, "custom")));
-        spmv_blocked_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "spmv_blocked")));
-        ILU_apply1_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "ILU_apply1")));
-        ILU_apply2_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "ILU_apply2")));
-        add_well_contributions_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program_, "add_well_contributions")));
+        dot_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1")));
+        norm_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
+        axpy_k.reset(new cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
+        custom_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
+        spmv_blocked_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "spmv_blocked")));
+        ILU_apply1_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply1")));
+        ILU_apply2_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply2")));

        prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());

@@ -523,10 +520,19 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
        throw error;
    }

-
    initialized = true;
 } // end initialize()

+template <unsigned int block_size>
+void openclSolverBackend<block_size>::initialize_wellContribs(WellContributions& wellContribs){
+    add_well_contributions_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "add_well_contributions")));
+
+    wellContribs.setOpenCLContext(context.get());
+    wellContribs.setOpenCLQueue(queue.get());
+    wellContribs.init();
+    wellContribs.setKernel(add_well_contributions_k.get());
+}
+
 template <unsigned int block_size>
 void openclSolverBackend<block_size>::finalize() {
    delete[] rb;
@@ -539,7 +545,7 @@ void openclSolverBackend<block_size>::finalize() {


 template <unsigned int block_size>
-void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions& wellContribs) {
+void openclSolverBackend<block_size>::copy_system_to_gpu() {
    Timer t;
    cl::Event event;

@@ -561,8 +567,6 @@ void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions& well
    queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
    event.wait();

-    wellContribs.copyDataToGPU(queue.get());
-
    if (verbosity > 2) {
        std::ostringstream out;
        out << "openclSolver::copy_system_to_gpu(): " << t.stop() << " s";
@@ -702,11 +706,11 @@ void openclSolverBackend<block_size>::get_result(double *x) {
 } // end get_result()


-
 template <unsigned int block_size>
 SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
    if (initialized == false) {
-        initialize(N_, nnz_,  dim, vals, rows, cols, wellContribs);
+        initialize(N_, nnz_,  dim, vals, rows, cols);
+        initialize_wellContribs(wellContribs);
        if (analysis_done == false) {
            if (!analyse_matrix()) {
                return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
@@ -716,9 +720,10 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
        if (!create_preconditioner()) {
            return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
        }
-        copy_system_to_gpu(wellContribs);
+        copy_system_to_gpu();
    } else {
        update_system(vals, b);
+        initialize_wellContribs(wellContribs);
        if (!create_preconditioner()) {
            return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
        }
--- a/opm/simulators/linalg/bda/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp
@@ -72,6 +72,7 @@ private:
    //cl::Buffer d_Ccols, d_Bcols, d_val_pointers;

    // shared pointers are also passed to other objects
+    cl::Program program;
    std::shared_ptr<cl::Context> context;
    std::shared_ptr<cl::CommandQueue> queue;
    std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
@@ -149,13 +150,15 @@ private:
    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
    /// \param[in] rows           array of rowPointers, contains N/dim+1 values
    /// \param[in] cols           array of columnIndices, contains nnz values
-    void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols, WellContributions& wellContribs);
+    void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
+
+    void initialize_wellContribs(WellContributions& wellContribs);

    /// Clean memory
    void finalize();

    /// Copy linear system to GPU
-    void copy_system_to_gpu(WellContributions& wellContribs);
+    void copy_system_to_gpu();

    /// Reorder the linear system so it corresponds with the coloring
    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values