From 56c1c0862cd77db6c89cd98c9d6220eb7468850d Mon Sep 17 00:00:00 2001
From: Jose Eduardo Bueno
Date: Fri, 17 Jul 2020 17:00:37 -0300
Subject: [PATCH] Added StandardWell functionality to OpenCL backend

---
 opm/simulators/linalg/ISTLSolverEbos.hpp      |  13 +-
 .../linalg/bda/WellContributions.cpp          | 198 ++++++++++++++----
 .../linalg/bda/WellContributions.hpp          |  86 ++++----
 opm/simulators/linalg/bda/openclKernels.hpp   | 106 +++++++++-
 .../linalg/bda/openclSolverBackend.cpp        |  46 ++--
 .../linalg/bda/openclSolverBackend.hpp        |  14 +-
 6 files changed, 350 insertions(+), 113 deletions(-)

diff --git a/opm/simulators/linalg/ISTLSolverEbos.hpp b/opm/simulators/linalg/ISTLSolverEbos.hpp
index dc324e583..e05dbccec 100644
--- a/opm/simulators/linalg/ISTLSolverEbos.hpp
+++ b/opm/simulators/linalg/ISTLSolverEbos.hpp
@@ -181,7 +181,7 @@ DenseMatrix transposeDenseMatrix(const DenseMatrix& M)
 #else
             const std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
             if (gpu_mode.compare("none") != 0) {
-                OPM_THROW(std::logic_error,"Error cannot use GPU solver since neither CUDA nor OpenCL was not found by cmake");
+                OPM_THROW(std::logic_error,"Error cannot use GPU solver since neither CUDA nor OpenCL were found by cmake");
             }
 #endif
             extractParallelGridInformationToISTL(simulator_.vanguard().grid(), parallelInformation_);
@@ -465,7 +465,8 @@ DenseMatrix transposeDenseMatrix(const DenseMatrix& M)
 #if HAVE_CUDA || HAVE_OPENCL
             bool use_gpu = bdaBridge->getUseGpu();
             if (use_gpu) {
-                WellContributions wellContribs;
+                const std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
+                WellContributions wellContribs(gpu_mode);
                 if (!useWellConn_) {
                     simulator_.problem().wellModel().getWellContributions(wellContribs);
                 }
@@ -478,7 +479,13 @@ DenseMatrix transposeDenseMatrix(const DenseMatrix& M)
                 // CPU fallback
                 use_gpu = bdaBridge->getUseGpu(); // update value, BdaBridge might have disabled cusparseSolver
                 if (use_gpu) {
-                    OpmLog::warning("cusparseSolver did not converge, now trying Dune to solve current linear system...");
+                    if(gpu_mode.compare("cusparse") == 0){
+                        OpmLog::warning("cusparseSolver did not converge, now trying Dune to solve current linear system...");
+                    }
+
+                    if(gpu_mode.compare("opencl") == 0){
+                        OpmLog::warning("openclSolver did not converge, now trying Dune to solve current linear system...");
+                    }
                 }

                 // call Dune
diff --git a/opm/simulators/linalg/bda/WellContributions.cpp b/opm/simulators/linalg/bda/WellContributions.cpp
index 1533003de..a46f54c3c 100644
--- a/opm/simulators/linalg/bda/WellContributions.cpp
+++ b/opm/simulators/linalg/bda/WellContributions.cpp
@@ -24,79 +24,202 @@
 #include
 #include
+#include

 #include "opm/simulators/linalg/bda/WellContributions.hpp"

 namespace Opm
 {

+WellContributions::WellContributions(std::string gpu_mode){
+    if(gpu_mode.compare("cusparse") == 0){
+        cuda_gpu = true;
+    }
+
+    if(gpu_mode.compare("opencl") == 0){
+        opencl_gpu = true;
+    }
+}

 void WellContributions::alloc()
 {
     if (num_std_wells > 0) {
 #if HAVE_CUDA
-        allocStandardWells();
-#else
-        OPM_THROW(std::logic_error, "Error cannot allocate on GPU for StandardWells because CUDA was not found by cmake");
+        if(cuda_gpu){
+            allocStandardWells();
+        }
+#endif
+
+#if HAVE_OPENCL
+        if(opencl_gpu){
+            h_Cnnzs_ocl = new double[num_blocks * dim * dim_wells];
+            h_Dnnzs_ocl = new double[num_std_wells * dim_wells * dim_wells];
+            h_Bnnzs_ocl = new double[num_blocks * dim * dim_wells];
+            h_Ccols_ocl = new int[num_blocks];
+            h_Bcols_ocl = new int[num_blocks];
+            val_pointers = new unsigned int[num_std_wells + 1];
+
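+            // alloc() only creates the host-side arrays; the matching cl::Buffers
+            // are created in init() once the OpenCL context is known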
+            allocated = true;
+        }
+#endif
+
+#if !HAVE_CUDA && !HAVE_OPENCL
+        OPM_THROW(std::logic_error, "Error cannot allocate on GPU because neither CUDA nor OpenCL were found by cmake");
+#endif
     }
 }

 WellContributions::~WellContributions()
 {
-#if HAVE_CUDA
-    freeCudaMemory();
-#endif
-
-    if (h_x) {
-        delete[] h_x;
-        delete[] h_y;
-    }
-
     // delete MultisegmentWellContributions
     for (auto ms : multisegments) {
         delete ms;
     }
     multisegments.clear();
-}

+#if HAVE_CUDA
+    if(cuda_gpu){
+        freeCudaMemory(); // should come before 'delete[] h_x'
+    }
+#endif

 #if HAVE_OPENCL
-void WellContributions::apply(cl::Buffer& d_x, cl::Buffer& d_y)
-{
+    if (h_x_ocl) {
+        delete[] h_x_ocl;
+        delete[] h_y_ocl;
+    }
+    if(opencl_gpu){
+        if(num_std_wells > 0){
+            delete[] h_Cnnzs_ocl;
+            delete[] h_Dnnzs_ocl;
+            delete[] h_Bnnzs_ocl;
+            delete[] h_Ccols_ocl;
+            delete[] h_Bcols_ocl;
+            delete[] val_pointers;
+        }
+    }
+#endif
+}
+
+#if HAVE_OPENCL
+
+void WellContributions::init(cl::Context *context){
+    d_Cnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
+    d_Dnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
+    d_Bnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
+    d_Ccols_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
+    d_Bcols_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
+    d_val_pointers_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
+}
+
+void WellContributions::copyDataToGPU(cl::CommandQueue *queue){
+    cl::Event event;
+
+    queue->enqueueWriteBuffer(d_Cnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_blocks * dim * dim_wells, h_Cnnzs_ocl);
+    queue->enqueueWriteBuffer(d_Dnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_std_wells * dim_wells * dim_wells, h_Dnnzs_ocl);
+    queue->enqueueWriteBuffer(d_Bnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_blocks * dim * dim_wells, h_Bnnzs_ocl);
+    queue->enqueueWriteBuffer(d_Ccols_ocl, CL_TRUE, 0, sizeof(int) * num_blocks, h_Ccols_ocl);
+    queue->enqueueWriteBuffer(d_Bcols_ocl, CL_TRUE, 0, sizeof(int) * num_blocks, h_Bcols_ocl);
+    queue->enqueueWriteBuffer(d_val_pointers_ocl, CL_TRUE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers, nullptr, &event);
+    event.wait();
+}
+
+void WellContributions::applyMSWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y) {
     // apply MultisegmentWells
     if (num_ms_wells > 0) {
-        // allocate pinned memory on host if not yet done
-        if (h_x == nullptr) {
-            h_x = new double[N];
-            h_y = new double[N];
+        // allocate host memory for the vector copies if not yet done
+        if (h_x_ocl == nullptr) {
+            h_x_ocl = new double[N];
+            h_y_ocl = new double[N];
         }

         // copy vectors x and y from GPU to CPU
-        queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, h_x);
-        queue->enqueueReadBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y);
+        queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, h_x_ocl);
+        queue->enqueueReadBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y_ocl);

         // actually apply MultisegmentWells
         for (MultisegmentWellContribution *well : multisegments) {
-            well->apply(h_x, h_y);
+            well->apply(h_x_ocl, h_y_ocl);
         }

         // copy vector y from CPU to GPU
-        queue->enqueueWriteBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y);
-    }
-
-
-    // apply StandardWells
-    if (num_std_wells > 0) {
-        OPM_THROW(std::logic_error, "Error StandardWells are not supported by openclSolver");
+        queue->enqueueWriteBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y_ocl);
     }
 }
+
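+// apply all StandardWells on GPU
+// performs y -= (C^T * (D^-1 * (B*x))) for all StandardWells,
+// using one work-group of 32 work-items per well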
+void WellContributions::applyStdWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y, kernel_type *kernel){
+    const unsigned int work_group_size = 32;
+    const unsigned int total_work_items = num_std_wells * work_group_size;
+    const unsigned int lmem1 = sizeof(double) * work_group_size;
+    const unsigned int lmem2 = sizeof(double) * dim_wells;
+
+    cl::Event event;
+    event = (*kernel)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
+                      d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells,
+                      d_val_pointers_ocl, cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
+    event.wait();
+}
+
+void WellContributions::apply(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y, kernel_type *kernel){
+    if(num_std_wells > 0){
+        applyStdWell(queue, d_x, d_y, kernel);
+    }
+
+    if(num_ms_wells > 0){
+        applyMSWell(queue, d_x, d_y);
+    }
+}
+
 #endif

-void WellContributions::addMatrix([[maybe_unused]] MatrixType type, [[maybe_unused]]int *colIndices, [[maybe_unused]] double *values, [[maybe_unused]] unsigned int val_size)
+void WellContributions::addMatrix([[maybe_unused]] MatrixType type, [[maybe_unused]] int *colIndices, [[maybe_unused]] double *values, [[maybe_unused]] unsigned int val_size)
 {
+    if (!allocated) {
+        OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
+    }
+
 #if HAVE_CUDA
-    addMatrixGpu(type, colIndices, values, val_size);
-#else
-    OPM_THROW(std::logic_error, "Error cannot add StandardWell matrix on GPU because CUDA was not found by cmake");
+    if(cuda_gpu){
+        addMatrixGpu(type, colIndices, values, val_size);
+    }
+#endif
+
+#if HAVE_OPENCL
+    if(opencl_gpu){
+        switch (type) {
+        case MatrixType::C:
+            std::copy(colIndices, colIndices + val_size, h_Ccols_ocl + num_blocks_so_far);
+            std::copy(values, values + val_size*dim*dim_wells, h_Cnnzs_ocl + num_blocks_so_far*dim*dim_wells);
+            break;
+
+        case MatrixType::D:
+            std::copy(values, values + dim_wells*dim_wells, h_Dnnzs_ocl + num_std_wells_so_far*dim_wells*dim_wells);
+            break;
+
+        case MatrixType::B:
+            std::copy(colIndices, colIndices + val_size, h_Bcols_ocl + num_blocks_so_far);
+            std::copy(values, values + val_size*dim*dim_wells, h_Bnnzs_ocl + num_blocks_so_far*dim*dim_wells);
+            val_pointers[num_std_wells_so_far] = num_blocks_so_far;
+
+            if(num_std_wells_so_far == num_std_wells - 1){
+                val_pointers[num_std_wells] = num_blocks;
+            }
+            break;
+
+        default:
+            OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributions::addMatrix()");
+        }
+
+        if (MatrixType::B == type) {
+            num_blocks_so_far += val_size;
+            num_std_wells_so_far++;
+        }
+    }
+
+#endif
+
+#if !HAVE_CUDA && !HAVE_OPENCL
+    OPM_THROW(std::logic_error, "Error cannot add StandardWell matrix on GPU because neither CUDA nor OpenCL were found by cmake");
 #endif
 }

@@ -107,23 +230,23 @@ void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
     dim_wells = dim_wells_;
 }

-void WellContributions::addNumBlocks(unsigned int nnz)
+void WellContributions::addNumBlocks(unsigned int numBlocks)
 {
     if (allocated) {
         OPM_THROW(std::logic_error, "Error cannot add more sizes after allocated in WellContributions");
     }
-    num_blocks += nnz;
+    num_blocks += numBlocks;
     num_std_wells++;
 }

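+// Illustrative call order for StandardWells on the OpenCL backend
+// ('context', 'queue' and 'kernel' are owned by openclSolverBackend):
+//   WellContributions wellContribs("opencl");
+//   wellContribs.setBlockSize(dim, dim_wells);
+//   wellContribs.addNumBlocks(numBlocks);    // once per StandardWell
+//   wellContribs.alloc();                    // host-side arrays
+//   wellContribs.addMatrix(...);             // C, D and B blocks for each well
+//   wellContribs.init(context);              // device-side buffers
+//   wellContribs.copyDataToGPU(queue);
+//   wellContribs.apply(queue, d_x, d_y, kernel);
+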
-void WellContributions::addMultisegmentWellContribution(unsigned int dim_, unsigned int dim_wells_,
+void WellContributions::addMultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
         unsigned int Nb, unsigned int Mb,
         unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
         unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
         std::vector<double> &Cvalues)
 {
-    this->N = Nb * dim_;
-    MultisegmentWellContribution *well = new MultisegmentWellContribution(dim, dim_wells_, Nb, Mb, BnumBlocks, Bvalues, BcolIndices, BrowPointers, DnumBlocks, Dvalues, DcolPointers, DrowIndices, Cvalues);
+    this->N = Nb * dim;
+    MultisegmentWellContribution *well = new MultisegmentWellContribution(dim, dim_wells, Nb, Mb, BnumBlocks, Bvalues, BcolIndices, BrowPointers, DnumBlocks, Dvalues, DcolPointers, DrowIndices, Cvalues);
     multisegments.emplace_back(well);
     ++num_ms_wells;
 }
@@ -138,12 +261,5 @@ void WellContributions::setReordering(int *toOrder_, bool reorder_)
     }
 }

-#if HAVE_OPENCL
-void WellContributions::setOpenCLQueue(cl::CommandQueue *queue_)
-{
-    this->queue = queue_;
-}
-#endif
-
 } //namespace Opm

diff --git a/opm/simulators/linalg/bda/WellContributions.hpp b/opm/simulators/linalg/bda/WellContributions.hpp
index cd8b9064e..a8c6b1ed7 100644
--- a/opm/simulators/linalg/bda/WellContributions.hpp
+++ b/opm/simulators/linalg/bda/WellContributions.hpp
@@ -57,7 +57,6 @@ class WellContributions
 {

 public:
-
     /// StandardWell has C, D and B matrices that need to be copied
     enum class MatrixType {
         C,
@@ -78,17 +77,16 @@ private:
     bool allocated = false;

     std::vector<MultisegmentWellContribution*> multisegments;
+    int *toOrder = nullptr;
+    bool reorder = false;
+
+    bool opencl_gpu = false;
+    bool cuda_gpu = false;
+
 #if HAVE_CUDA
     cudaStream_t stream;
-#endif

-#if HAVE_OPENCL
-    cl::CommandQueue *queue = nullptr;
-#endif
-
-#if HAVE_CUDA
     // data for StandardWells, could remain nullptrs if not used
-    // StandardWells are only supported for cusparseSolver now
     double *d_Cnnzs = nullptr;
     double *d_Dnnzs = nullptr;
     double *d_Bnnzs = nullptr;
@@ -97,13 +95,29 @@ private:
     double *d_z1 = nullptr;
     double *d_z2 = nullptr;
     unsigned int *d_val_pointers = nullptr;
+    double *h_x = nullptr; // CUDA pinned memory for GPU memcpy
+    double *h_y = nullptr;
 #endif

-    double *h_x = nullptr, *h_y = nullptr; // CUDA pinned memory for GPU memcpy
+#if HAVE_OPENCL
+    double *h_Cnnzs_ocl = nullptr;
+    double *h_Dnnzs_ocl = nullptr;
+    double *h_Bnnzs_ocl = nullptr;
+    int *h_Ccols_ocl = nullptr;
+    int *h_Bcols_ocl = nullptr;
+    double *h_x_ocl = nullptr;
+    double *h_y_ocl = nullptr;

-    int *toOrder = nullptr;
-    bool reorder = false;
+    cl::Buffer d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
+    cl::Buffer d_Ccols_ocl, d_Bcols_ocl, d_val_pointers_ocl;

+    typedef cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type;
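+    // kernel_type must match the parameter list of the add_well_contributions
+    // kernel defined in openclKernels.hpp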
void freeCudaMemory(); - -public: - - /// Set a cudaStream to be used - /// \param[in] stream the cudaStream that is used to launch the kernel in -#if HAVE_CUDA - void setCudaStream(cudaStream_t stream); #endif - /// Create a new WellContributions, implementation is empty - WellContributions() {}; +#if HAVE_OPENCL + void applyStdWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y, kernel_type *kernel); + void applyMSWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y); +#endif - /// Destroy a WellContributions, and free memory - ~WellContributions(); +public: +#if HAVE_CUDA + /// Set a cudaStream to be used + /// \param[in] stream the cudaStream that is used to launch the kernel in + void setCudaStream(cudaStream_t stream); /// Apply all Wells in this object /// performs y -= (C^T * (D^-1 * (B*x))) for all Wells /// \param[in] d_x vector x, must be on GPU /// \param[inout] d_y vector y, must be on GPU -#if HAVE_CUDA void apply(double *d_x, double *d_y); + + unsigned int getNumWells(){ + return num_std_wells + num_ms_wells; + } #endif + #if HAVE_OPENCL - void apply(cl::Buffer& x, cl::Buffer& y); + void init(cl::Context *context); + void copyDataToGPU(cl::CommandQueue *queue); + void apply(cl::CommandQueue *queue, cl::Buffer& x, cl::Buffer& y, kernel_type *kernel); #endif + /// Create a new WellContributions + WellContributions(std::string gpu_mode); + + /// Destroy a WellContributions, and free memory + ~WellContributions(); + + /// Allocate memory for the StandardWells void alloc(); @@ -182,24 +207,11 @@ public: unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices, std::vector &Cvalues); - /// Return the number of wells added to this object - /// \return the number of wells added to this object - unsigned int getNumWells() { - return num_std_wells + num_ms_wells; - } - - /// If the rows of the matrix are reordered, the columnindices of the matrixdata are incorrect /// Those indices need to be mapped via toOrder /// \param[in] toOrder array with mappings /// \param[in] reorder whether the columnindices need to be reordered or not void setReordering(int *toOrder, bool reorder); - -#if HAVE_OPENCL - /// This object copies some cl::Buffers, it requires a CommandQueue to do that - /// \param[in] queue the opencl commandqueue to be used - void setOpenCLQueue(cl::CommandQueue *queue); -#endif }; } //namespace Opm diff --git a/opm/simulators/linalg/bda/openclKernels.hpp b/opm/simulators/linalg/bda/openclKernels.hpp index 17d8598f7..30dbfcd63 100644 --- a/opm/simulators/linalg/bda/openclKernels.hpp +++ b/opm/simulators/linalg/bda/openclKernels.hpp @@ -23,7 +23,7 @@ namespace bda { - const char* axpy_s = R"( + inline const char* axpy_s = R"( __kernel void axpy( __global double *in, const double a, @@ -42,7 +42,7 @@ namespace bda // returns partial sums, instead of the final dot product - const char* dot_1_s = R"( + inline const char* dot_1_s = R"( __kernel void dot_1( __global double *in1, __global double *in2, @@ -83,7 +83,7 @@ namespace bda // returns partial sums, instead of the final norm // the square root must be computed on CPU - const char* norm_s = R"( + inline const char* norm_s = R"( __kernel void norm( __global double *in, __global double *out, @@ -122,7 +122,7 @@ namespace bda // p = (p - omega * v) * beta + r - const char* custom_s = R"( + inline const char* custom_s = R"( __kernel void custom( __global double *p, __global double *v, @@ -150,7 +150,7 @@ namespace bda // algorithm based on: // Optimization of Block Sparse Matrix-Vector 
     // Optimization of Block Sparse Matrix-Vector Multiplication on Shared-Memory Parallel Architectures,
     // Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
-    const char* spmv_blocked_s = R"(
+    inline const char* spmv_blocked_s = R"(
     __kernel void spmv_blocked(
         __global const double *vals,
         __global const int *cols,
@@ -220,7 +220,7 @@ namespace bda

     // ILU apply part 1: forward substitution
     // solves L*x=y where L is a lower triangular sparse blocked matrix
-    const char* ILU_apply1_s = R"(
+    inline const char* ILU_apply1_s = R"(
     __kernel void ILU_apply1(
         __global const double *Lvals,
         __global const unsigned int *Lcols,
@@ -290,7 +290,7 @@ namespace bda

     // ILU apply part 2: backward substitution
-    // solves U*x=y where L is a lower triangular sparse blocked matrix
-    const char* ILU_apply2_s = R"(
+    // solves U*x=y where U is an upper triangular sparse blocked matrix
+    inline const char* ILU_apply2_s = R"(
     __kernel void ILU_apply2(
         __global const double *Uvals,
         __global const int *Ucols,
@@ -366,7 +366,99 @@ namespace bda
         }
     )";

+    inline const char* add_well_contributions_s = R"(
+    #pragma OPENCL EXTENSION cl_khr_fp64: enable
+    #pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable
+
+    // atomic add for doubles, built on a 64-bit compare-and-swap loop
+    void atomicAdd(volatile __global double *val, const double delta){
+        union{
+            double f;
+            ulong i;
+        } old;
+
+        union{
+            double f;
+            ulong i;
+        } new;
+
+        do{
+            old.f = *val;
+            new.f = old.f + delta;
+        } while(atom_cmpxchg((volatile __global ulong *)val, old.i, new.i) != old.i);
+    }
+
+    // one work-group of 32 work-items handles one StandardWell:
+    // z1 = B * x, z2 = D^-1 * z1, then y is updated with C^T * z2
+    __kernel void add_well_contributions(__global const double *valsC,
+                                         __global const double *valsD,
+                                         __global const double *valsB,
+                                         __global const int *colsC,
+                                         __global const int *colsB,
+                                         __global const double *x,
+                                         __global double *y,
+                                         const unsigned int blnc,
+                                         const unsigned int blnr,
+                                         __global const unsigned int *rowptr,
+                                         __local double *localSum,
+                                         __local double *z1,
+                                         __local double *z2){
+        int wgId = get_group_id(0);
+        int wiId = get_local_id(0);
+        int valSize = rowptr[wgId + 1] - rowptr[wgId];   // number of blocks of this well
+        int valsPerBlock = blnc*blnr;
+        int numActiveWorkItems = (32/valsPerBlock)*valsPerBlock;
+        int numBlocksPerWarp = 32/valsPerBlock;
+        int c = wiId % blnc;           // column index inside a block
+        int r = (wiId/blnc) % blnr;    // row index inside a block
+        double temp;
+
+        // partial sums of z1 = B * x
+        localSum[wiId] = 0;
+        if(wiId < numActiveWorkItems){
+            int b = wiId/valsPerBlock + rowptr[wgId];
+            while(b < valSize + rowptr[wgId]){
+                int colIdx = colsB[b];
+                localSum[wiId] += valsB[b*blnc*blnr + r*blnc + c]*x[colIdx*blnc + c];
+                b += numBlocksPerWarp;
+            }
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // reduce the partial sums into z1
+        int stride = valsPerBlock;
+        if(wiId < stride){
+            localSum[wiId] += localSum[wiId + stride];
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        if(c == 0 && wiId < valsPerBlock){
+            for(stride = 2; stride > 0; stride /= 2){
+                localSum[wiId] += localSum[wiId + stride];
+            }
+            z1[r] = localSum[wiId];
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // z2 = D^-1 * z1
+        if(wiId < blnr){
+            temp = 0.0;
+            for(unsigned int i = 0; i < blnr; ++i){
+                temp += valsD[wgId*blnr*blnr + wiId*blnr + i]*z1[i];
+            }
+            z2[wiId] = temp;
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // update y with C^T * z2
+        if(wiId < blnc*valSize){
+            temp = 0.0;
+            int bb = wiId/blnc + rowptr[wgId];
+            int colIdx = colsC[bb];
+            for (unsigned int j = 0; j < blnr; ++j){
+                temp += valsC[bb*blnc*blnr + j*blnc + c]*z2[j];
+            }
+            atomicAdd(&y[colIdx*blnc + c], temp);
+        }
+    }
+    )";

 } // end namespace bda

diff --git a/opm/simulators/linalg/bda/openclSolverBackend.cpp b/opm/simulators/linalg/bda/openclSolverBackend.cpp
index 5bbf4bbe8..8124d9885 100644
--- a/opm/simulators/linalg/bda/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include

 #include
 #include
@@ -70,7 +71,7 @@ unsigned int openclSolverBackend<block_size>::ceilDivision(const unsigned int A,
 template <unsigned int block_size>
 double openclSolverBackend<block_size>::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out)
 {
-    const unsigned int work_group_size = 1024;
+    const unsigned int work_group_size = 256;
     const unsigned int num_work_groups = ceilDivision(N, work_group_size);
     const unsigned int total_work_items = num_work_groups * work_group_size;
     const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
@@ -98,7 +99,7 @@ double openclSolverBackend<block_size>::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out)
 template <unsigned int block_size>
 double openclSolverBackend<block_size>::norm_w(cl::Buffer in, cl::Buffer out)
 {
-    const unsigned int work_group_size = 1024;
+    const unsigned int work_group_size = 256;
     const unsigned int num_work_groups = ceilDivision(N, work_group_size);
     const unsigned int total_work_items = num_work_groups * work_group_size;
     const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
@@ -179,19 +180,14 @@ void openclSolverBackend<block_size>::spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b)
     }
 }

-
 template <unsigned int block_size>
 void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
-    float it;
     double rho, rhop, beta, alpha, omega, tmp1, tmp2;
     double norm, norm_0;

     Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);

-    wellContribs.setOpenCLQueue(queue.get());
-    wellContribs.setReordering(toOrder, true);
-
     // set r to the initial residual
     // if initial x guess is not 0, must call applyblockedscaleadd(), not implemented
     //applyblockedscaleadd(-1.0, mat, x, r);
@@ -212,6 +208,8 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
     queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &event);
     event.wait();

+    wellContribs.setReordering(toOrder, true);
+
     norm = norm_w(d_r, d_tmp);
     norm_0 = norm;

@@ -243,11 +241,9 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
         t_spmv.stop();

         // apply wellContributions
-        if (wellContribs.getNumWells() > 0) {
-            t_well.start();
-            wellContribs.apply(d_pw, d_v);
-            t_well.stop();
-        }
+        t_well.start();
+        wellContribs.apply(queue.get(), d_pw, d_v, add_well_contributions_k.get());
+        t_well.stop();

         t_rest.start();
         tmp1 = dot_w(d_rw, d_v, d_tmp);
@@ -274,11 +270,9 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
         t_spmv.stop();

         // apply wellContributions
-        if (wellContribs.getNumWells() > 0) {
-            t_well.start();
-            wellContribs.apply(d_s, d_t);
-            t_well.stop();
-        }
+        t_well.start();
+        wellContribs.apply(queue.get(), d_s, d_t, add_well_contributions_k.get());
+        t_well.stop();

         t_rest.start();
         tmp1 = dot_w(d_t, d_r, d_tmp);
@@ -314,7 +308,8 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
     }
     if (verbosity >= 4) {
         std::ostringstream out;
-        out << "openclSolver::ily_apply: " << t_prec.elapsed() << " s\n";
+        out << "openclSolver::ilu_apply: " << t_prec.elapsed() << " s\n";
+        out << "wellContributions::apply: " << t_well.elapsed() << " s\n";
         out << "openclSolver::spmv: " << t_spmv.elapsed() << " s\n";
         out << "openclSolver::rest: " << t_rest.elapsed() << " s\n";
         out << "openclSolver::total_solve: " << res.elapsed << " s\n";
@@ -324,7 +319,7 @@

 template <unsigned int block_size>
-void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) {
+void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, WellContributions& wellContribs) {
     this->N = N_;
     this->nnz = nnz_;
     this->nnzb = nnz_ / block_size / block_size;
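+    // nnzb counts nonzero *blocks*; each block stores block_size x block_size doubles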
@@ -466,6 +461,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
         source.emplace_back(std::make_pair(spmv_blocked_s, strlen(spmv_blocked_s)));
         source.emplace_back(std::make_pair(ILU_apply1_s, strlen(ILU_apply1_s)));
         source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
+        source.emplace_back(std::make_pair(add_well_contributions_s, strlen(add_well_contributions_s)));
         cl::Program program_ = cl::Program(*context, source);

         program_.build(devices);
@@ -481,7 +477,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
 #if COPY_ROW_BY_ROW
         vals_contiguous = new double[N];
 #endif
-
         mat.reset(new BlockedMatrix(Nb, nnzb, vals, cols, rows));

         d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
@@ -500,6 +495,8 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
         d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
         d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));

+        wellContribs.init(context.get());
+
         // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
         // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
@@ -511,6 +508,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
         spmv_blocked_k.reset(new cl::make_kernel(cl::Kernel(program_, "spmv_blocked")));
         ILU_apply1_k.reset(new cl::make_kernel(cl::Kernel(program_, "ILU_apply1")));
         ILU_apply2_k.reset(new cl::make_kernel(cl::Kernel(program_, "ILU_apply2")));
+        add_well_contributions_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program_, "add_well_contributions")));

         prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());

@@ -541,7 +539,7 @@ void openclSolverBackend<block_size>::finalize() {


 template <unsigned int block_size>
-void openclSolverBackend<block_size>::copy_system_to_gpu() {
+void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions& wellContribs) {
     Timer t;
     cl::Event event;

@@ -563,6 +561,8 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
     queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
     event.wait();

+    wellContribs.copyDataToGPU(queue.get());
+
     if (verbosity > 2) {
         std::ostringstream out;
         out << "openclSolver::copy_system_to_gpu(): " << t.stop() << " s";
@@ -706,7 +706,7 @@ void openclSolverBackend<block_size>::get_result(double *x) {

 template <unsigned int block_size>
 SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
     if (initialized == false) {
-        initialize(N_, nnz_, dim, vals, rows, cols);
+        initialize(N_, nnz_, dim, vals, rows, cols, wellContribs);
         if (analysis_done == false) {
             if (!analyse_matrix()) {
                 return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
@@ -716,7 +716,7 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
         if (!create_preconditioner()) {
             return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
         }
-        copy_system_to_gpu();
+        copy_system_to_gpu(wellContribs);
     } else {
         update_system(vals, b);
         if (!create_preconditioner()) {
diff --git a/opm/simulators/linalg/bda/openclSolverBackend.hpp b/opm/simulators/linalg/bda/openclSolverBackend.hpp
index 011f25d13..0cbbf937e 100644
--- a/opm/simulators/linalg/bda/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp
@@ -64,6 +64,13 @@ private:
     cl::Buffer d_tmp;      // used as tmp GPU buffer for dot() and norm()
     double *tmp = nullptr; // used as tmp CPU buffer for dot() and norm()

+    //unsigned int num_blocks, dim_, dim_wells, num_std_wells;
+    //unsigned int *h_val_pointers;
+    //int *h_Ccols, *h_Bcols;
+    //double *h_Cnnzs, *h_Dnnzs, *h_Bnnzs;
+    //cl::Buffer d_Cnnzs, d_Dnnzs, d_Bnnzs;
+    //cl::Buffer d_Ccols, d_Bcols, d_val_pointers;
+
     // shared pointers are also passed to other objects
     std::shared_ptr<cl::Context> context;
     std::shared_ptr<cl::CommandQueue> queue;
@@ -74,6 +81,7 @@ private:
     std::unique_ptr > spmv_blocked_k;
     std::shared_ptr > ILU_apply1_k;
    std::shared_ptr > ILU_apply2_k;
+    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > add_well_contributions_k;

     Preconditioner *prec = nullptr; // only supported preconditioner is BILU0
     int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
@@ -127,6 +135,8 @@ private:
     /// \param[out] b  output vector
     void spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b);

+    //void add_well_contributions_w(cl::Buffer valsC, cl::Buffer valsD, cl::Buffer valsB, cl::Buffer colsC, cl::Buffer colsB, cl::Buffer x, cl::Buffer y, cl::Buffer val_pointers);
+
     /// Solve linear system using ilu0-bicgstab
     /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res         summary of solver result
@@ -139,13 +149,13 @@ private:
     /// \param[in] vals   array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
     /// \param[in] rows   array of rowPointers, contains N/dim+1 values
     /// \param[in] cols   array of columnIndices, contains nnz values
-    void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
+    /// \param[in] wellContribs   WellContributions object; its OpenCL buffers are created in the solver's context
+    void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols, WellContributions& wellContribs);

     /// Clean memory
     void finalize();

-    /// Copy linear system to GPU
-    void copy_system_to_gpu();
+    /// Copy linear system to GPU, including the data of the WellContributions
+    void copy_system_to_gpu(WellContributions& wellContribs);

     /// Reorder the linear system so it corresponds with the coloring
     /// \param[in] vals   array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values