diff --git a/opm/simulators/linalg/bda/WellContributions.cpp b/opm/simulators/linalg/bda/WellContributions.cpp index ab89f77f6..b8ec7941e 100644 --- a/opm/simulators/linalg/bda/WellContributions.cpp +++ b/opm/simulators/linalg/bda/WellContributions.cpp @@ -101,37 +101,8 @@ WellContributions::~WellContributions() #endif } +/* #if HAVE_OPENCL -void WellContributions::setOpenCLContext(cl::Context *context_){ - this->context = context_; -} - -void WellContributions::setOpenCLQueue(cl::CommandQueue *queue_){ - this->queue = queue_; -} - -void WellContributions::setKernel(kernel_type *stdwell_apply_){ - this->stdwell_apply = stdwell_apply_; -} - -void WellContributions::init(){ - if(num_std_wells > 0){ - d_Cnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells); - d_Dnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells); - d_Bnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells); - d_Ccols_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks); - d_Bcols_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks); - d_val_pointers_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1)); - - queue->enqueueWriteBuffer(d_Cnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_blocks * dim * dim_wells, h_Cnnzs_ocl); - queue->enqueueWriteBuffer(d_Dnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_std_wells * dim_wells * dim_wells, h_Dnnzs_ocl); - queue->enqueueWriteBuffer(d_Bnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_blocks * dim * dim_wells, h_Bnnzs_ocl); - queue->enqueueWriteBuffer(d_Ccols_ocl, CL_TRUE, 0, sizeof(int) * num_blocks, h_Ccols_ocl); - queue->enqueueWriteBuffer(d_Bcols_ocl, CL_TRUE, 0, sizeof(int) * num_blocks, h_Bcols_ocl); - queue->enqueueWriteBuffer(d_val_pointers_ocl, CL_TRUE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers); - } -} - void WellContributions::applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y) { // apply MultisegmentWells if (num_ms_wells > 0) { @@ -154,30 +125,8 @@ void WellContributions::applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y) { queue->enqueueWriteBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y_ocl); } } - -void WellContributions::applyStdWell(cl::Buffer& d_x, cl::Buffer& d_y){ - const unsigned int work_group_size = 32; - const unsigned int total_work_items = num_std_wells * work_group_size; - const unsigned int lmem1 = sizeof(double) * work_group_size; - const unsigned int lmem2 = sizeof(double) * dim_wells; - - cl::Event event; - event = (*stdwell_apply)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), - d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells, - d_val_pointers_ocl, cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2)); -} - -void WellContributions::apply(cl::Buffer& d_x, cl::Buffer& d_y){ - if(num_std_wells > 0){ - applyStdWell(d_x, d_y); - } - - if(num_ms_wells > 0){ - applyMSWell(d_x, d_y); - } -} - #endif +*/ void WellContributions::addMatrix([[maybe_unused]] MatrixType type, [[maybe_unused]] int *colIndices, [[maybe_unused]] double *values, [[maybe_unused]] unsigned int val_size) { diff --git a/opm/simulators/linalg/bda/WellContributions.hpp b/opm/simulators/linalg/bda/WellContributions.hpp index c51823103..1467cbee7 100644 --- a/opm/simulators/linalg/bda/WellContributions.hpp +++ b/opm/simulators/linalg/bda/WellContributions.hpp @@ -64,7 +64,6 @@ public: B }; -private: unsigned int num_blocks = 0; // total number of blocks in all wells unsigned int dim; // number of columns in blocks in B and C, equal to StandardWell::numEq unsigned int dim_wells; // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq @@ -73,6 +72,18 @@ private: unsigned int num_blocks_so_far = 0; // keep track of where next data is written unsigned int num_std_wells_so_far = 0; // keep track of where next data is written unsigned int *val_pointers = nullptr; // val_pointers[wellID] == index of first block for this well in Ccols and Bcols + +#if HAVE_OPENCL + double *h_Cnnzs_ocl = nullptr; + double *h_Dnnzs_ocl = nullptr; + double *h_Bnnzs_ocl = nullptr; + int *h_Ccols_ocl = nullptr; + int *h_Bcols_ocl = nullptr; + double *h_x_ocl = nullptr; + double *h_y_ocl = nullptr; +#endif + +private: unsigned int N; // number of rows (not blockrows) in vectors x and y bool allocated = false; std::vector multisegments; @@ -97,30 +108,7 @@ private: unsigned int *d_val_pointers = nullptr; double *h_x = nullptr; double *h_y = nullptr; -#endif -#if HAVE_OPENCL - double *h_Cnnzs_ocl = nullptr; - double *h_Dnnzs_ocl = nullptr; - double *h_Bnnzs_ocl = nullptr; - int *h_Ccols_ocl = nullptr; - int *h_Bcols_ocl = nullptr; - double *h_x_ocl = nullptr; - double *h_y_ocl = nullptr; - - cl::Buffer d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl; - cl::Buffer d_Ccols_ocl, d_Bcols_ocl, d_val_pointers_ocl; - - typedef cl::make_kernel kernel_type; - kernel_type *stdwell_apply; - cl::Context *context; - cl::CommandQueue *queue; -#endif - -#if HAVE_CUDA /// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called /// \param[in] type indicate if C, D or B is sent /// \param[in] colIndices columnindices of blocks in C or B, ignored for D @@ -135,12 +123,11 @@ private: void freeCudaMemory(); #endif -#if HAVE_OPENCL - void applyStdWell(cl::Buffer& d_x, cl::Buffer& d_y); - void applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y); -#endif - public: +//#if HAVE_OPENCL +// void applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y); +//#endif + #if HAVE_CUDA /// Set a cudaStream to be used /// \param[in] stream the cudaStream that is used to launch the kernel in @@ -157,14 +144,6 @@ public: } #endif -#if HAVE_OPENCL - void init(); - void apply(cl::Buffer& x, cl::Buffer& y); - void setOpenCLContext(cl::Context *context); - void setOpenCLQueue(cl::CommandQueue *queue); - void setKernel(kernel_type *stdwell_apply); -#endif - /// Create a new WellContributions WellContributions(std::string gpu_mode); diff --git a/opm/simulators/linalg/bda/openclKernels.hpp b/opm/simulators/linalg/bda/openclKernels.hpp index 30dbfcd63..933634a9f 100644 --- a/opm/simulators/linalg/bda/openclKernels.hpp +++ b/opm/simulators/linalg/bda/openclKernels.hpp @@ -455,7 +455,7 @@ namespace bda for (unsigned int j = 0; j < blnr; ++j){ temp += valsC[bb*blnc*blnr + j*blnc + c]*z2[j]; } - atomicAdd(&y[colIdx*blnc + c], temp); + atomicAdd(&y[colIdx*blnc + c], -temp); } } )"; diff --git a/opm/simulators/linalg/bda/openclSolverBackend.cpp b/opm/simulators/linalg/bda/openclSolverBackend.cpp index 71a48770c..749ddb905 100644 --- a/opm/simulators/linalg/bda/openclSolverBackend.cpp +++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp @@ -181,7 +181,21 @@ void openclSolverBackend::spmv_blocked_w(cl::Buffer vals, cl::Buffer } template -void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) { +void openclSolverBackend::stdwell_w(cl::Buffer Cnnzs, cl::Buffer Dnnzs, cl::Buffer Bnnzs, cl::Buffer Ccols, cl::Buffer Bcols, cl::Buffer x, cl::Buffer y, cl::Buffer val_pointers) +{ + const unsigned int work_group_size = 32; + const unsigned int total_work_items = num_std_wells * work_group_size; + const unsigned int lmem1 = sizeof(double) * work_group_size; + const unsigned int lmem2 = sizeof(double) * dim_wells; + + cl::Event event; + event = (*add_well_contributions_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), + Cnnzs, Dnnzs, Bnnzs, Ccols, Bcols, x, y, dim_weqs, dim_wells, val_pointers, + cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2)); +} + +template +void openclSolverBackend::gpu_pbicgstab(BdaResult& res) { float it; double rho, rhop, beta, alpha, omega, tmp1, tmp2; double norm, norm_0; @@ -208,8 +222,6 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &event); event.wait(); - wellContribs.setReordering(toOrder, true); - norm = norm_w(d_r, d_tmp); norm_0 = norm; @@ -242,7 +254,7 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr // apply wellContributions t_well.start(); - wellContribs.apply(d_pw, d_v); + stdwell_w(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_pw, d_v, d_val_pointers); t_well.stop(); t_rest.start(); @@ -271,7 +283,7 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr // apply wellContributions t_well.start(); - wellContribs.apply(d_s, d_t); + stdwell_w(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_s, d_t, d_val_pointers); t_well.stop(); t_rest.start(); @@ -506,6 +518,7 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, doub spmv_blocked_k.reset(new cl::make_kernel(cl::Kernel(program, "spmv_blocked"))); ILU_apply1_k.reset(new cl::make_kernel(cl::Kernel(program, "ILU_apply1"))); ILU_apply2_k.reset(new cl::make_kernel(cl::Kernel(program, "ILU_apply2"))); + add_well_contributions_k.reset(new cl::make_kernel(cl::Kernel(program, "add_well_contributions"))); prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get()); @@ -525,12 +538,19 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, doub template void openclSolverBackend::initialize_wellContribs(WellContributions& wellContribs){ - add_well_contributions_k.reset(new cl::make_kernel(cl::Kernel(program, "add_well_contributions"))); + dim_weqs = wellContribs.dim; + dim_wells = wellContribs.dim_wells; + num_blocks = wellContribs.num_blocks; + num_std_wells = wellContribs.num_std_wells; - wellContribs.setOpenCLContext(context.get()); - wellContribs.setOpenCLQueue(queue.get()); - wellContribs.init(); - wellContribs.setKernel(add_well_contributions_k.get()); + if(num_std_wells > 0){ + d_Cnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim_weqs * dim_wells); + d_Dnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells); + d_Bnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim_weqs * dim_wells); + d_Ccols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks); + d_Bcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks); + d_val_pointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1)); + } } template @@ -574,6 +594,29 @@ void openclSolverBackend::copy_system_to_gpu() { } } // end copy_system_to_gpu() +template +void openclSolverBackend::copy_wells_to_gpu(WellContributions& wellContribs) { + wellContribs.setReordering(toOrder, true); + + if(num_std_wells > 0){ + Timer t; + cl::Event event; + + queue->enqueueWriteBuffer(d_Cnnzs, CL_TRUE, 0, sizeof(double) * num_blocks * dim_weqs * dim_wells, wellContribs.h_Cnnzs_ocl); + queue->enqueueWriteBuffer(d_Dnnzs, CL_TRUE, 0, sizeof(double) * num_std_wells * dim_wells * dim_wells, wellContribs.h_Dnnzs_ocl); + queue->enqueueWriteBuffer(d_Bnnzs, CL_TRUE, 0, sizeof(double) * num_blocks * dim_weqs * dim_wells, wellContribs.h_Bnnzs_ocl); + queue->enqueueWriteBuffer(d_Ccols, CL_TRUE, 0, sizeof(int) * num_blocks, wellContribs.h_Ccols_ocl); + queue->enqueueWriteBuffer(d_Bcols, CL_TRUE, 0, sizeof(int) * num_blocks, wellContribs.h_Bcols_ocl); + queue->enqueueWriteBuffer(d_val_pointers, CL_TRUE, 0, sizeof(unsigned int) * (num_std_wells + 1), wellContribs.val_pointers, nullptr, &event); + event.wait(); + + if (verbosity > 2) { + std::ostringstream out; + out << "openclSolver::copy_wells_to_gpu(): " << t.stop() << " s"; + OpmLog::info(out.str()); + } + } +} // don't copy rowpointers and colindices, they stay the same template @@ -663,12 +706,12 @@ bool openclSolverBackend::create_preconditioner() { template -void openclSolverBackend::solve_system(WellContributions& wellContribs, BdaResult &res) { +void openclSolverBackend::solve_system(BdaResult &res) { Timer t; // actually solve try { - gpu_pbicgstab(wellContribs, res); + gpu_pbicgstab(res); } catch (const cl::Error& error) { std::ostringstream oss; oss << "openclSolverBackend::solve_system error: " << error.what() << "(" << error.err() << ")\n"; @@ -710,6 +753,7 @@ template SolverStatus openclSolverBackend::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) { if (initialized == false) { initialize(N_, nnz_, dim, vals, rows, cols); + initialize_wellContribs(wellContribs); if (analysis_done == false) { if (!analyse_matrix()) { return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED; @@ -720,6 +764,7 @@ SolverStatus openclSolverBackend::solve_system(int N_, int nnz_, int return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; } copy_system_to_gpu(); + copy_wells_to_gpu(wellContribs); } else { update_system(vals, b); if (!create_preconditioner()) { @@ -727,8 +772,7 @@ SolverStatus openclSolverBackend::solve_system(int N_, int nnz_, int } update_system_on_gpu(); } - initialize_wellContribs(wellContribs); - solve_system(wellContribs, res); + solve_system(res); return SolverStatus::BDA_SOLVER_SUCCESS; } diff --git a/opm/simulators/linalg/bda/openclSolverBackend.hpp b/opm/simulators/linalg/bda/openclSolverBackend.hpp index afde8d7f7..5ed76d9a7 100644 --- a/opm/simulators/linalg/bda/openclSolverBackend.hpp +++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp @@ -64,12 +64,12 @@ private: cl::Buffer d_tmp; // used as tmp GPU buffer for dot() and norm() double *tmp = nullptr; // used as tmp CPU buffer for dot() and norm() - //unsigned int num_blocks, dim_, dim_wells, num_std_wells; - //unsigned int *h_val_pointers; - //int *h_Ccols, *h_Bcols; - //double *h_Cnnzs, *h_Dnnzs, *h_Bnnzs; - //cl::Buffer d_Cnnzs, d_Dnnzs, d_Bnnzs; - //cl::Buffer d_Ccols, d_Bcols, d_val_pointers; + unsigned int num_blocks, dim_weqs, dim_wells, num_std_wells; + unsigned int *h_val_pointers; + int *h_Ccols, *h_Bcols; + double *h_Cnnzs, *h_Dnnzs, *h_Bnnzs; + cl::Buffer d_Cnnzs, d_Dnnzs, d_Bnnzs; + cl::Buffer d_Ccols, d_Bcols, d_val_pointers; // shared pointers are also passed to other objects cl::Program program; @@ -84,12 +84,10 @@ private: std::shared_ptr > ILU_apply2_k; std::shared_ptr > add_well_contributions_k; - Preconditioner *prec = nullptr; // only supported preconditioner is BILU0 - int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings + Preconditioner *prec = nullptr; // only supported preconditioner is BILU0 + int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings std::unique_ptr > mat = nullptr; // original matrix - BlockedMatrix *rmat = nullptr; // reordered matrix, used for spmv - - + BlockedMatrix *rmat = nullptr; // reordered matrix, used for spmv /// Divide A by B, and round up: return (int)ceil(A/B) /// \param[in] A dividend @@ -136,12 +134,12 @@ private: /// \param[out] b output vector void spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b); - //void add_well_contributions_w(cl::Buffer valsC, cl::Buffer valsD, cl::Buffer valsB, cl::Buffer colsC, cl::Buffer colsB, cl::Buffer x, cl::Buffer y, cl::Buffer val_pointers); + void stdwell_w(cl::Buffer Cnnzs, cl::Buffer Dnnzs, cl::Buffer Bnnzs, cl::Buffer Ccols, cl::Buffer Bcols, cl::Buffer x, cl::Buffer y, cl::Buffer val_pointers); /// Solve linear system using ilu0-bicgstab /// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A /// \param[inout] res summary of solver result - void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res); + void gpu_pbicgstab(BdaResult& res); /// Initialize GPU and allocate memory /// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks @@ -160,6 +158,8 @@ private: /// Copy linear system to GPU void copy_system_to_gpu(); + void copy_wells_to_gpu(WellContributions& wellContribs); + /// Reorder the linear system so it corresponds with the coloring /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values /// \param[in] b input vectors, contains N values @@ -179,7 +179,7 @@ private: /// Solve linear system /// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A /// \param[inout] res summary of solver result - void solve_system(WellContributions& wellContribs, BdaResult &res); + void solve_system(BdaResult &res); public: