Wait on events for more OpenCL enqueue calls, and use the shortened OpenCL kernel type aliases in WellContributions

2025-02-25 18:55:30 -06:00 · 2021-03-03 14:04:06 +01:00 · 2021-03-03 14:04:06 +01:00 · 0caae966b8
commit 0caae966b8
parent 688d8ff627
4 changed files with 49 additions and 37 deletions
--- a/opm/simulators/linalg/bda/WellContributions.cpp
+++ b/opm/simulators/linalg/bda/WellContributions.cpp
@ -74,7 +74,7 @@ void WellContributions::setOpenCLEnv(cl::Context *context_, cl::CommandQueue *qu
    this->queue = queue_;
 }

-void WellContributions::setKernel(kernel_type *kernel_, kernel_type_no_reorder *kernel_no_reorder_){
+void WellContributions::setKernel(stdwell_apply_kernel_type *kernel_, stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_){
    this->kernel = kernel_;
    this->kernel_no_reorder = kernel_no_reorder_;
 }
--- a/opm/simulators/linalg/bda/WellContributions.hpp
+++ b/opm/simulators/linalg/bda/WellContributions.hpp
@ -26,6 +26,7 @@

 #if HAVE_OPENCL
 #include <opm/simulators/linalg/bda/opencl.hpp>
+#include <opm/simulators/linalg/bda/openclKernels.hpp>
 #endif

 #include <vector>
@ -39,6 +40,9 @@
 namespace Opm
 {

+using bda::stdwell_apply_kernel_type;
+using bda::stdwell_apply_no_reorder_kernel_type;
+
 /// This class serves to eliminate the need to include the WellContributions into the matrix (with --matrix-add-well-contributions=true) for the cusparseSolver
 /// If the --matrix-add-well-contributions commandline parameter is true, this class should not be used
 /// So far, StandardWell and MultisegmentWell are supported
@ -92,17 +96,10 @@ private:
    std::vector<MultisegmentWellContribution*> multisegments;

 #if HAVE_OPENCL
-    typedef cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
-                            cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&,
-                            cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type;
-    typedef cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
-                            const unsigned int, const unsigned int, cl::Buffer&,
-                            cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type_no_reorder;
-
    cl::Context *context;
    cl::CommandQueue *queue;
-    kernel_type *kernel;
-    kernel_type_no_reorder *kernel_no_reorder;
+    stdwell_apply_kernel_type *kernel;
+    stdwell_apply_no_reorder_kernel_type *kernel_no_reorder;
    std::vector<cl::Event> events;

    std::unique_ptr<cl::Buffer> d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
@ -154,7 +151,7 @@ public:
 #endif

 #if HAVE_OPENCL
-    void setKernel(kernel_type *kernel_, kernel_type_no_reorder *kernel_no_reorder_);
+    void setKernel(stdwell_apply_kernel_type *kernel_, stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_);
    void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);

    /// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
--- a/opm/simulators/linalg/bda/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp
@ -45,7 +45,6 @@ template <unsigned int block_size>
 openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_reorder(opencl_ilu_reorder_) {
    prec = new Preconditioner(opencl_ilu_reorder, verbosity_);

-    cl_int err = CL_SUCCESS;
    std::ostringstream out;
    try {
        std::vector<cl::Platform> platforms;
@ -331,20 +330,23 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
    //applyblockedscaleadd(-1.0, mat, x, r);

    // set initial values
-    cl::Event event;
-    queue->enqueueFillBuffer(d_p, 0, 0, sizeof(double) * N);
-    queue->enqueueFillBuffer(d_v, 0, 0, sizeof(double) * N, nullptr, &event);
-    event.wait();
+    events.resize(5);
+    queue->enqueueFillBuffer(d_p, 0, 0, sizeof(double) * N, nullptr, &events[0]);
+    queue->enqueueFillBuffer(d_v, 0, 0, sizeof(double) * N, nullptr, &events[1]);
    rho = 1.0;
    alpha = 1.0;
    omega = 1.0;

-    queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(double) * N, nullptr, &event);
-    event.wait();
-    queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(double) * N, nullptr, &event);
-    event.wait();
-    queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &event);
-    event.wait();
+    queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(double) * N, nullptr, &events[2]);
+    queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(double) * N, nullptr, &events[3]);
+    queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &events[4]);
+
+    cl::WaitForEvents(events);
+    events.clear();
+    if (err != CL_SUCCESS) {
+        // enqueue[Fill|Copy]Buffer is C and does not throw exceptions like C++ OpenCL
+        OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueue[Fill|Copy]Buffer error");
+    }

    norm = norm_w(d_r, d_tmp);
    norm_0 = norm;
@ -591,7 +593,7 @@ void openclSolverBackend<block_size>::finalize() {
 template <unsigned int block_size>
 void openclSolverBackend<block_size>::copy_system_to_gpu() {
    Timer t;
-    cl::Event event;
+    events.resize(5);

 #if COPY_ROW_BY_ROW
    int sum = 0;
@ -600,19 +602,25 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
        memcpy(vals_contiguous + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
        sum += size_row * block_size * block_size;
    }
-    queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous);
+    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous, nullptr, &events[0]);
 #else
-    queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues);
+    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]);
 #endif

-    queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices);
-    queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers);
-    queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb);
+    err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices, nullptr, &events[1]);
+    err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers, nullptr, &events[2]);
+    err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb, nullptr, &events[3]);
+    err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[4]);
    if (opencl_ilu_reorder != ILUReorder::NONE) {
-        queue->enqueueWriteBuffer(d_toOrder, CL_TRUE, 0, sizeof(int) * Nb, toOrder);
+        events.resize(6);
+        queue->enqueueWriteBuffer(d_toOrder, CL_TRUE, 0, sizeof(int) * Nb, toOrder, nullptr, &events[5]);
+    }
+    cl::WaitForEvents(events);
+    events.clear();
+    if (err != CL_SUCCESS) {
+        // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
+        OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueueWriteBuffer error");
    }
-    queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
-    event.wait();

    if (verbosity > 2) {
        std::ostringstream out;
@ -625,7 +633,7 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
 template <unsigned int block_size>
 void openclSolverBackend<block_size>::update_system_on_gpu() {
    Timer t;
-    cl::Event event;
+    events.resize(3);

 #if COPY_ROW_BY_ROW
    int sum = 0;
@ -634,14 +642,19 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
        memcpy(vals_contiguous + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
        sum += size_row * block_size * block_size;
    }
-    queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous);
+    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous, nullptr, &events[0]);
 #else
-    queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues);
+    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]);
 #endif

-    queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb);
-    queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
-    event.wait();
+    err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb, nullptr, &events[1]);
+    err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[2]);
+    cl::WaitForEvents(events);
+    events.clear();
+    if (err != CL_SUCCESS) {
+        // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
+        OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueueWriteBuffer error");
+    }

    if (verbosity > 2) {
        std::ostringstream out;
--- a/opm/simulators/linalg/bda/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp
@ -82,6 +82,8 @@ private:
    std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr;    // original matrix 
    BlockedMatrix<block_size> *rmat = nullptr;                    // reordered matrix (or original if no reordering), used for spmv
    ILUReorder opencl_ilu_reorder;                                // reordering strategy
+    std::vector<cl::Event> events;
+    cl_int err;

    /// Divide A by B, and round up: return (int)ceil(A/B)
    /// \param[in] A    dividend