Fixed kernel; fixed time step chopping;

2025-02-25 18:55:30 -06:00 · 2020-09-07 13:58:48 -03:00 · 2020-09-07 13:58:48 -03:00 · c43648076f
commit c43648076f
parent c41aafcc58
5 changed files with 91 additions and 119 deletions
--- a/opm/simulators/linalg/bda/WellContributions.cpp
+++ b/opm/simulators/linalg/bda/WellContributions.cpp
@ -101,37 +101,8 @@ WellContributions::~WellContributions()
 #endif
 }

+/*
 #if HAVE_OPENCL
-void WellContributions::setOpenCLContext(cl::Context *context_){
-    this->context = context_;
-}
-
-void WellContributions::setOpenCLQueue(cl::CommandQueue *queue_){
-    this->queue = queue_;
-}
-
-void WellContributions::setKernel(kernel_type *stdwell_apply_){
-    this->stdwell_apply = stdwell_apply_;
-}
-
-void WellContributions::init(){
-    if(num_std_wells > 0){
-        d_Cnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
-        d_Dnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
-        d_Bnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
-        d_Ccols_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
-        d_Bcols_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
-        d_val_pointers_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
-
-        queue->enqueueWriteBuffer(d_Cnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_blocks * dim * dim_wells, h_Cnnzs_ocl);
-        queue->enqueueWriteBuffer(d_Dnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_std_wells * dim_wells * dim_wells, h_Dnnzs_ocl);
-        queue->enqueueWriteBuffer(d_Bnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_blocks * dim * dim_wells, h_Bnnzs_ocl);
-        queue->enqueueWriteBuffer(d_Ccols_ocl, CL_TRUE, 0, sizeof(int) * num_blocks, h_Ccols_ocl);
-        queue->enqueueWriteBuffer(d_Bcols_ocl, CL_TRUE, 0, sizeof(int) * num_blocks, h_Bcols_ocl);
-        queue->enqueueWriteBuffer(d_val_pointers_ocl, CL_TRUE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers);
-    }
-}
-
 void WellContributions::applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y) {
    // apply MultisegmentWells
    if (num_ms_wells > 0) {
@ -154,30 +125,8 @@ void WellContributions::applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y) {
        queue->enqueueWriteBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y_ocl);
    }
 }
-
-void WellContributions::applyStdWell(cl::Buffer& d_x, cl::Buffer& d_y){
-    const unsigned int work_group_size = 32;
-    const unsigned int total_work_items = num_std_wells * work_group_size;
-    const unsigned int lmem1 = sizeof(double) * work_group_size;
-    const unsigned int lmem2 = sizeof(double) * dim_wells;
-
-    cl::Event event;
-    event = (*stdwell_apply)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
-                             d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells,
-                             d_val_pointers_ocl, cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
-}
-
-void WellContributions::apply(cl::Buffer& d_x, cl::Buffer& d_y){
-    if(num_std_wells > 0){
-        applyStdWell(d_x, d_y);
-    }
-
-    if(num_ms_wells > 0){
-        applyMSWell(d_x, d_y);
-    }
-}
-
 #endif
+*/

 void WellContributions::addMatrix([[maybe_unused]] MatrixType type, [[maybe_unused]] int *colIndices, [[maybe_unused]] double *values, [[maybe_unused]] unsigned int val_size)
 {
--- a/opm/simulators/linalg/bda/WellContributions.hpp
+++ b/opm/simulators/linalg/bda/WellContributions.hpp
@ -64,7 +64,6 @@ public:
        B
    };

-private:
    unsigned int num_blocks = 0;             // total number of blocks in all wells
    unsigned int dim;                        // number of columns in blocks in B and C, equal to StandardWell::numEq
    unsigned int dim_wells;                  // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq
@ -73,6 +72,18 @@ private:
    unsigned int num_blocks_so_far = 0;      // keep track of where next data is written
    unsigned int num_std_wells_so_far = 0;   // keep track of where next data is written
    unsigned int *val_pointers = nullptr;    // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
+
+#if HAVE_OPENCL
+    double *h_Cnnzs_ocl = nullptr;
+    double *h_Dnnzs_ocl = nullptr;
+    double *h_Bnnzs_ocl = nullptr;
+    int *h_Ccols_ocl = nullptr;
+    int *h_Bcols_ocl = nullptr;
+    double *h_x_ocl = nullptr;
+    double *h_y_ocl = nullptr;
+#endif
+
+private:
    unsigned int N;                          // number of rows (not blockrows) in vectors x and y
    bool allocated = false;
    std::vector<MultisegmentWellContribution*> multisegments;
@ -97,30 +108,7 @@ private:
    unsigned int *d_val_pointers = nullptr;
    double *h_x = nullptr;
    double *h_y = nullptr;
-#endif

-#if HAVE_OPENCL
-    double *h_Cnnzs_ocl = nullptr;
-    double *h_Dnnzs_ocl = nullptr;
-    double *h_Bnnzs_ocl = nullptr;
-    int *h_Ccols_ocl = nullptr;
-    int *h_Bcols_ocl = nullptr;
-    double *h_x_ocl = nullptr;
-    double *h_y_ocl = nullptr;
-
-    cl::Buffer d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
-    cl::Buffer d_Ccols_ocl, d_Bcols_ocl, d_val_pointers_ocl;
-
-    typedef cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&,
-                            cl::Buffer&, cl::Buffer&, cl::Buffer&,
-                            cl::Buffer&, const unsigned int, const unsigned int,
-                            cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type;
-    kernel_type *stdwell_apply;
-    cl::Context *context;
-    cl::CommandQueue *queue;
-#endif
-
-#if HAVE_CUDA
    /// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
    /// \param[in] type        indicate if C, D or B is sent
    /// \param[in] colIndices  columnindices of blocks in C or B, ignored for D
@ -135,12 +123,11 @@ private:
    void freeCudaMemory();
 #endif

-#if HAVE_OPENCL
-    void applyStdWell(cl::Buffer& d_x, cl::Buffer& d_y);
-    void applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y);
-#endif
-
 public:
+//#if HAVE_OPENCL
+//    void applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y);
+//#endif
+
 #if HAVE_CUDA
    /// Set a cudaStream to be used
    /// \param[in] stream           the cudaStream that is used to launch the kernel in
@ -157,14 +144,6 @@ public:
    }
 #endif

-#if HAVE_OPENCL
-    void init();
-    void apply(cl::Buffer& x, cl::Buffer& y);
-    void setOpenCLContext(cl::Context *context);
-    void setOpenCLQueue(cl::CommandQueue *queue);
-    void setKernel(kernel_type *stdwell_apply);
-#endif
-
    /// Create a new WellContributions
    WellContributions(std::string gpu_mode);

--- a/opm/simulators/linalg/bda/openclKernels.hpp
+++ b/opm/simulators/linalg/bda/openclKernels.hpp
@ -455,7 +455,7 @@ namespace bda
            for (unsigned int j = 0; j < blnr; ++j){
                temp += valsC[bb*blnc*blnr + j*blnc + c]*z2[j];
            }
-            atomicAdd(&y[colIdx*blnc + c], temp);
+            atomicAdd(&y[colIdx*blnc + c], -temp);
        }
    }
    )";
--- a/opm/simulators/linalg/bda/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp
@ -181,7 +181,21 @@ void openclSolverBackend<block_size>::spmv_blocked_w(cl::Buffer vals, cl::Buffer
 }

 template <unsigned int block_size>
-void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
+void openclSolverBackend<block_size>::stdwell_w(cl::Buffer Cnnzs, cl::Buffer Dnnzs, cl::Buffer Bnnzs, cl::Buffer Ccols, cl::Buffer Bcols, cl::Buffer x, cl::Buffer y, cl::Buffer val_pointers)
+{
+    const unsigned int work_group_size = 32;
+    const unsigned int total_work_items = num_std_wells * work_group_size;
+    const unsigned int lmem1 = sizeof(double) * work_group_size;
+    const unsigned int lmem2 = sizeof(double) * dim_wells;
+
+    cl::Event event;
+    event = (*add_well_contributions_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
+                                        Cnnzs, Dnnzs, Bnnzs, Ccols, Bcols, x, y, dim_weqs, dim_wells, val_pointers,
+                                        cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
+}
+
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::gpu_pbicgstab(BdaResult& res) {
    float it;
    double rho, rhop, beta, alpha, omega, tmp1, tmp2;
    double norm, norm_0;
@ -208,8 +222,6 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
    queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &event);
    event.wait();

-    wellContribs.setReordering(toOrder, true);
-
    norm = norm_w(d_r, d_tmp);
    norm_0 = norm;

@ -242,7 +254,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr

        // apply wellContributions
        t_well.start();
-        wellContribs.apply(d_pw, d_v);
+        stdwell_w(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_pw, d_v, d_val_pointers);
        t_well.stop();

        t_rest.start();
@ -271,7 +283,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr

        // apply wellContributions
        t_well.start();
-        wellContribs.apply(d_s, d_t);
+        stdwell_w(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_s, d_t, d_val_pointers);
        t_well.stop();

        t_rest.start();
@ -506,6 +518,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
        spmv_blocked_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "spmv_blocked")));
        ILU_apply1_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply1")));
        ILU_apply2_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply2")));
+        add_well_contributions_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "add_well_contributions")));

        prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());

@ -525,12 +538,19 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub

 template <unsigned int block_size>
 void openclSolverBackend<block_size>::initialize_wellContribs(WellContributions& wellContribs){
-    add_well_contributions_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "add_well_contributions")));
+    dim_weqs = wellContribs.dim;
+    dim_wells = wellContribs.dim_wells;
+    num_blocks = wellContribs.num_blocks;
+    num_std_wells = wellContribs.num_std_wells;

-    wellContribs.setOpenCLContext(context.get());
-    wellContribs.setOpenCLQueue(queue.get());
-    wellContribs.init();
-    wellContribs.setKernel(add_well_contributions_k.get());
+    if(num_std_wells > 0){
+        d_Cnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim_weqs * dim_wells);
+        d_Dnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
+        d_Bnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim_weqs * dim_wells);
+        d_Ccols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
+        d_Bcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
+        d_val_pointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
+    }
 }

 template <unsigned int block_size>
@ -574,6 +594,29 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
    }
 } // end copy_system_to_gpu()

+template <unsigned int block_size>
+void openclSolverBackend<block_size>::copy_wells_to_gpu(WellContributions& wellContribs) {
+    wellContribs.setReordering(toOrder, true);
+
+    if(num_std_wells > 0){
+        Timer t;
+        cl::Event event;
+
+        queue->enqueueWriteBuffer(d_Cnnzs, CL_TRUE, 0, sizeof(double) * num_blocks * dim_weqs * dim_wells, wellContribs.h_Cnnzs_ocl);
+        queue->enqueueWriteBuffer(d_Dnnzs, CL_TRUE, 0, sizeof(double) * num_std_wells * dim_wells * dim_wells, wellContribs.h_Dnnzs_ocl);
+        queue->enqueueWriteBuffer(d_Bnnzs, CL_TRUE, 0, sizeof(double) * num_blocks * dim_weqs * dim_wells, wellContribs.h_Bnnzs_ocl);
+        queue->enqueueWriteBuffer(d_Ccols, CL_TRUE, 0, sizeof(int) * num_blocks, wellContribs.h_Ccols_ocl);
+        queue->enqueueWriteBuffer(d_Bcols, CL_TRUE, 0, sizeof(int) * num_blocks, wellContribs.h_Bcols_ocl);
+        queue->enqueueWriteBuffer(d_val_pointers, CL_TRUE, 0, sizeof(unsigned int) * (num_std_wells + 1), wellContribs.val_pointers, nullptr, &event);
+        event.wait();
+
+        if (verbosity > 2) {
+            std::ostringstream out;
+            out << "openclSolver::copy_wells_to_gpu(): " << t.stop() << " s";
+            OpmLog::info(out.str());
+        }
+    }
+}

 // don't copy rowpointers and colindices, they stay the same
 template <unsigned int block_size>
@ -663,12 +706,12 @@ bool openclSolverBackend<block_size>::create_preconditioner() {


 template <unsigned int block_size>
-void openclSolverBackend<block_size>::solve_system(WellContributions& wellContribs, BdaResult &res) {
+void openclSolverBackend<block_size>::solve_system(BdaResult &res) {
    Timer t;

    // actually solve
    try {
-        gpu_pbicgstab(wellContribs, res);
+        gpu_pbicgstab(res);
    } catch (const cl::Error& error) {
        std::ostringstream oss;
        oss << "openclSolverBackend::solve_system error: " << error.what() << "(" << error.err() << ")\n";
@ -710,6 +753,7 @@ template <unsigned int block_size>
 SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
    if (initialized == false) {
        initialize(N_, nnz_,  dim, vals, rows, cols);
+        initialize_wellContribs(wellContribs);
        if (analysis_done == false) {
            if (!analyse_matrix()) {
                return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
@ -720,6 +764,7 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
            return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
        }
        copy_system_to_gpu();
+        copy_wells_to_gpu(wellContribs);
    } else {
        update_system(vals, b);
        if (!create_preconditioner()) {
@ -727,8 +772,7 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
        }
        update_system_on_gpu();
    }
-    initialize_wellContribs(wellContribs);
-    solve_system(wellContribs, res);
+    solve_system(res);
    return SolverStatus::BDA_SOLVER_SUCCESS;
 }

--- a/opm/simulators/linalg/bda/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp
@ -64,12 +64,12 @@ private:
    cl::Buffer d_tmp;                            // used as tmp GPU buffer for dot() and norm()
    double *tmp = nullptr;                       // used as tmp CPU buffer for dot() and norm()

-    //unsigned int num_blocks, dim_, dim_wells, num_std_wells;
-    //unsigned int *h_val_pointers;
-    //int *h_Ccols, *h_Bcols;
-    //double *h_Cnnzs, *h_Dnnzs, *h_Bnnzs;
-    //cl::Buffer d_Cnnzs, d_Dnnzs, d_Bnnzs;
-    //cl::Buffer d_Ccols, d_Bcols, d_val_pointers;
+    unsigned int num_blocks, dim_weqs, dim_wells, num_std_wells;
+    unsigned int *h_val_pointers;
+    int *h_Ccols, *h_Bcols;
+    double *h_Cnnzs, *h_Dnnzs, *h_Bnnzs;
+    cl::Buffer d_Cnnzs, d_Dnnzs, d_Bnnzs;
+    cl::Buffer d_Ccols, d_Bcols, d_val_pointers;

    // shared pointers are also passed to other objects
    cl::Program program;
@ -84,12 +84,10 @@ private:
    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply2_k;
    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > add_well_contributions_k;

-    Preconditioner *prec = nullptr;                  // only supported preconditioner is BILU0
-    int *toOrder = nullptr, *fromOrder = nullptr;    // BILU0 reorders rows of the matrix via these mappings
+    Preconditioner *prec = nullptr;                               // only supported preconditioner is BILU0
+    int *toOrder = nullptr, *fromOrder = nullptr;                 // BILU0 reorders rows of the matrix via these mappings
    std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr;    // original matrix 
-    BlockedMatrix<block_size> *rmat = nullptr;       // reordered matrix, used for spmv
-
-
+    BlockedMatrix<block_size> *rmat = nullptr;                    // reordered matrix, used for spmv

    /// Divide A by B, and round up: return (int)ceil(A/B)
    /// \param[in] A    dividend
@ -136,12 +134,12 @@ private:
    /// \param[out] b       output vector
    void spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b);

-    //void add_well_contributions_w(cl::Buffer valsC, cl::Buffer valsD, cl::Buffer valsB, cl::Buffer colsC, cl::Buffer colsB, cl::Buffer x, cl::Buffer y, cl::Buffer val_pointers);
+    void stdwell_w(cl::Buffer Cnnzs, cl::Buffer Dnnzs, cl::Buffer Bnnzs, cl::Buffer Ccols, cl::Buffer Bcols, cl::Buffer x, cl::Buffer y, cl::Buffer val_pointers);

    /// Solve linear system using ilu0-bicgstab
    /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
    /// \param[inout] res         summary of solver result
-    void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
+    void gpu_pbicgstab(BdaResult& res);

    /// Initialize GPU and allocate memory
    /// \param[in] N              number of nonzeroes, divide by dim*dim to get number of blocks
@ -160,6 +158,8 @@ private:
    /// Copy linear system to GPU
    void copy_system_to_gpu();

+    void copy_wells_to_gpu(WellContributions& wellContribs);
+
    /// Reorder the linear system so it corresponds with the coloring
    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
    /// \param[in] b              input vectors, contains N values
@ -179,7 +179,7 @@ private:
    /// Solve linear system
    /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
    /// \param[inout] res         summary of solver result
-    void solve_system(WellContributions& wellContribs, BdaResult &res);
+    void solve_system(BdaResult &res);

 public: