diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake
index 565d82ae0..100b4fc21 100644
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@@ -60,6 +60,7 @@ if(OPENCL_FOUND)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
 endif()
 if(MPI_FOUND)
diff --git a/opm/simulators/linalg/bda/WellContributions.cpp b/opm/simulators/linalg/bda/WellContributions.cpp
index b8ec7941e..342d06352 100644
--- a/opm/simulators/linalg/bda/WellContributions.cpp
+++ b/opm/simulators/linalg/bda/WellContributions.cpp
@@ -40,33 +40,6 @@ WellContributions::WellContributions(std::string gpu_mode){
     }
 }
 
-void WellContributions::alloc()
-{
-    if (num_std_wells > 0) {
-#if HAVE_CUDA
-        if(cuda_gpu){
-            allocStandardWells();
-        }
-#endif
-
-#if HAVE_OPENCL
-        if(opencl_gpu){
-            h_Cnnzs_ocl = new double[num_blocks * dim * dim_wells];
-            h_Dnnzs_ocl = new double[num_std_wells * dim_wells * dim_wells];
-            h_Bnnzs_ocl = new double[num_blocks * dim * dim_wells];
-            h_Ccols_ocl = new int[num_blocks];
-            h_Bcols_ocl = new int[num_blocks];
-            val_pointers = new unsigned int[num_std_wells + 1];
-
-            allocated = true;
-        }
-#endif
-
-#if !HAVE_CUDA && !HAVE_OPENCL
-        OPM_THROW(std::logic_error, "Error cannot allocate on GPU because neither CUDA nor OpenCL were found by cmake");
-#endif
-    }
-}
 
 WellContributions::~WellContributions()
 {
@@ -81,24 +54,6 @@ WellContributions::~WellContributions()
         freeCudaMemory(); // should come before 'delete[] h_x'
     }
 #endif
-
-#if HAVE_OPENCL
-    if (h_x_ocl) {
-        delete[] h_x_ocl;
-        delete[] h_y_ocl;
-    }
-
-    if(opencl_gpu){
-        if(num_std_wells > 0){
-            delete[] h_Cnnzs_ocl;
-            delete[] h_Dnnzs_ocl;
-            delete[] h_Bnnzs_ocl;
-            delete[] h_Ccols_ocl;
-            delete[] h_Bcols_ocl;
-            delete[] val_pointers;
-        }
-    }
-#endif
 }
 
 /*
@@ -106,23 +61,20 @@ WellContributions::~WellContributions()
 void WellContributions::applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y) {
     // apply MultisegmentWells
     if (num_ms_wells > 0) {
-        // allocate pinned memory on host if not yet done
-        if (h_x_ocl == nullptr) {
-            h_x_ocl = new double[N];
-            h_y_ocl = new double[N];
-        }
+        h_x_ocl.resize(N);
+        h_y_ocl.resize(N);
 
         // copy vectors x and y from GPU to CPU
-        queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, h_x_ocl);
-        queue->enqueueReadBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y_ocl);
+        queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, h_x_ocl.data());
+        queue->enqueueReadBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y_ocl.data());
 
         // actually apply MultisegmentWells
         for (MultisegmentWellContribution *well : multisegments) {
-            well->apply(h_x_ocl, h_y_ocl);
+            well->apply(h_x_ocl.data(), h_y_ocl.data());
         }
 
         // copy vector y from CPU to GPU
-        queue->enqueueWriteBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y_ocl);
+        queue->enqueueWriteBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y_ocl.data());
     }
 }
 #endif
 
@@ -130,12 +82,12 @@ void WellContributions::applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y) {
 void WellContributions::addMatrix([[maybe_unused]] MatrixType type, [[maybe_unused]] int *colIndices, [[maybe_unused]] double *values, [[maybe_unused]] unsigned int val_size)
 {
-    if (!allocated) {
-        OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
-    }
-
 #if HAVE_CUDA
     if(cuda_gpu){
+        if (!allocated) {
+            OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
+        }
         addMatrixGpu(type, colIndices, values, val_size);
     }
 #endif
 
@@ -143,32 +95,29 @@
 #if HAVE_OPENCL
     if(opencl_gpu){
         switch (type) {
-            case MatrixType::C:
-                std::copy(colIndices, colIndices + val_size, h_Ccols_ocl + num_blocks_so_far);
-                std::copy(values, values + val_size*dim*dim_wells, h_Cnnzs_ocl + num_blocks_so_far*dim*dim_wells);
-                break;
+        case MatrixType::C:
+            h_Ccols_ocl.insert(h_Ccols_ocl.end(), colIndices, colIndices + val_size);
+            h_Cnnzs_ocl.insert(h_Cnnzs_ocl.end(), values, values + val_size * dim * dim_wells);
+            break;
 
-            case MatrixType::D:
-                std::copy(values, values + dim_wells*dim_wells, h_Dnnzs_ocl + num_std_wells_so_far*dim_wells*dim_wells);
-                break;
+        case MatrixType::D:
+            h_Dnnzs_ocl.insert(h_Dnnzs_ocl.end(), values, values + dim_wells * dim_wells);
+            break;
 
-            case MatrixType::B:
-                std::copy(colIndices, colIndices + val_size, h_Bcols_ocl + num_blocks_so_far);
-                std::copy(values, values + val_size*dim*dim_wells, h_Bnnzs_ocl + num_blocks_so_far*dim*dim_wells);
-                val_pointers[num_std_wells_so_far] = num_blocks_so_far;
+        case MatrixType::B:
+            h_Bcols_ocl.insert(h_Bcols_ocl.end(), colIndices, colIndices + val_size);
+            h_Bnnzs_ocl.insert(h_Bnnzs_ocl.end(), values, values + val_size * dim * dim_wells);
 
-                if(num_std_wells_so_far == num_std_wells - 1){
-                    val_pointers[num_std_wells] = num_blocks;
-                }
-                break;
+            if(h_val_pointers_ocl.empty()){
+                h_val_pointers_ocl.push_back(0);
+            }
+            else{
+                h_val_pointers_ocl.push_back(h_val_pointers_ocl.back() + val_size);
+            }
+            break;
 
-            default:
-                OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributions::addMatrix()");
-        }
-
-        if (MatrixType::B == type) {
-            num_blocks_so_far += val_size;
-            num_std_wells_so_far++;
+        default:
+            OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributions::addMatrix()");
         }
     }
 
@@ -186,6 +135,7 @@ void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
     dim_wells = dim_wells_;
 }
 
+#if HAVE_CUDA
 void WellContributions::addNumBlocks(unsigned int numBlocks)
 {
     if (allocated) {
@@ -195,6 +145,15 @@ void WellContributions::addNumBlocks(unsigned int numBlocks)
     num_std_wells++;
 }
 
+void WellContributions::alloc()
+{
+    if (num_std_wells > 0) {
+        allocStandardWells();
+        allocated = true;
+    }
+}
+#endif
+
 void WellContributions::addMultisegmentWellContribution(unsigned int dim_, unsigned int dim_wells_,
     unsigned int Nb, unsigned int Mb,
     unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
diff --git a/opm/simulators/linalg/bda/WellContributions.cu b/opm/simulators/linalg/bda/WellContributions.cu
index bf9417a98..66e7abbc6 100644
--- a/opm/simulators/linalg/bda/WellContributions.cu
+++ b/opm/simulators/linalg/bda/WellContributions.cu
@@ -136,7 +136,6 @@ void WellContributions::allocStandardWells()
         val_pointers = new unsigned int[num_std_wells + 1];
         cudaMalloc((void**)&d_val_pointers, sizeof(int) * (num_std_wells + 1));
         cudaCheckLastError("apply_gpu malloc failed");
-        allocated = true;
     }
 }
 
diff --git a/opm/simulators/linalg/bda/WellContributions.hpp b/opm/simulators/linalg/bda/WellContributions.hpp
index 1467cbee7..6f8b856b9 100644
--- a/opm/simulators/linalg/bda/WellContributions.hpp
+++ b/opm/simulators/linalg/bda/WellContributions.hpp
@@ -55,7 +55,6 @@ namespace Opm
 /// - copy data of wellcontributions
 class WellContributions
 {
-
 public:
 
     /// StandardWell has C, D and B matrices that need to be copied
     enum class MatrixType {
@@ -64,28 +63,18 @@
         B
     };
 
-    unsigned int num_blocks = 0;             // total number of blocks in all wells
-    unsigned int dim;                        // number of columns in blocks in B and C, equal to StandardWell::numEq
-    unsigned int dim_wells;                  // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq
-    unsigned int num_std_wells = 0;          // number of StandardWells in this object
-    unsigned int num_ms_wells = 0;           // number of MultisegmentWells in this object, must equal multisegments.size()
-    unsigned int num_blocks_so_far = 0;      // keep track of where next data is written
-    unsigned int num_std_wells_so_far = 0;   // keep track of where next data is written
-    unsigned int *val_pointers = nullptr;    // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
-
 #if HAVE_OPENCL
-    double *h_Cnnzs_ocl = nullptr;
-    double *h_Dnnzs_ocl = nullptr;
-    double *h_Bnnzs_ocl = nullptr;
-    int *h_Ccols_ocl = nullptr;
-    int *h_Bcols_ocl = nullptr;
-    double *h_x_ocl = nullptr;
-    double *h_y_ocl = nullptr;
+    std::vector<double> h_Cnnzs_ocl, h_Dnnzs_ocl, h_Bnnzs_ocl;
+    std::vector<int> h_Ccols_ocl, h_Bcols_ocl;
+    std::vector<unsigned int> h_val_pointers_ocl;
+    std::vector<double> h_x_ocl, h_y_ocl;
 #endif
 
 private:
+    unsigned int dim;                        // number of columns in blocks in B and C, equal to StandardWell::numEq
+    unsigned int dim_wells;                  // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq
+    unsigned int num_ms_wells = 0;           // number of MultisegmentWells in this object, must equal multisegments.size()
     unsigned int N;                          // number of rows (not blockrows) in vectors x and y
-    bool allocated = false;
     std::vector<MultisegmentWellContribution*> multisegments;
     int *toOrder = nullptr;
@@ -95,6 +84,12 @@ private:
     bool cuda_gpu = false;
 
 #if HAVE_CUDA
+    bool allocated = false;
+    unsigned int num_blocks = 0;             // total number of blocks in all wells
+    unsigned int num_std_wells = 0;          // number of StandardWells in this object
+    unsigned int num_blocks_so_far = 0;      // keep track of where next data is written
+    unsigned int num_std_wells_so_far = 0;   // keep track of where next data is written
+    unsigned int *val_pointers = nullptr;    // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
     cudaStream_t stream;
 
     // data for StandardWells, could remain nullptrs if not used
@@ -142,6 +137,13 @@ public:
     unsigned int getNumWells(){
         return num_std_wells + num_ms_wells;
     }
+
+    /// Indicate how large the next StandardWell is, this function cannot be called after alloc() is called
+    /// \param[in] numBlocks   number of blocks in C and B of next StandardWell
+    void addNumBlocks(unsigned int numBlocks);
+
+    /// Allocate memory for the StandardWells
+    void alloc();
 #endif
 
     /// Create a new WellContributions
@@ -150,18 +152,11 @@ public:
 
     /// Destroy a WellContributions, and free memory
     ~WellContributions();
-
-    /// Allocate memory for the StandardWells
-    void alloc();
-
     /// Indicate how large the blocks of the StandardWell (C and B) are
     /// \param[in] dim         number of columns
     /// \param[in] dim_wells   number of rows
     void setBlockSize(unsigned int dim, unsigned int dim_wells);
 
-    /// Indicate how large the next StandardWell is, this function cannot be called after alloc() is called
-    /// \param[in] numBlocks   number of blocks in C and B of next StandardWell
-    void addNumBlocks(unsigned int numBlocks);
 
     /// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
     /// \param[in] type        indicate if C, D or B is sent
diff --git a/opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp b/opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp
new file mode 100644
index 000000000..716d05e57
--- /dev/null
+++ b/opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp
@@ -0,0 +1,102 @@
+/*
+  Copyright 2019 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include
+
+#include
+#include
+#include
+
+#include
+
+
+namespace bda
+{
+    using Opm::OpmLog;
+    using Dune::Timer;
+
+    void WellContributionsOCLContainer::initBuffers(WellContributions &wellContribs)
+    {
+        dim = wellContribs.dim;
+        dim_wells = wellContribs.dim_wells;
+        num_std_wells = wellContribs.h_val_pointers_ocl.size() - 1;
+        toOrder.insert(toOrder.end(), wellContribs.toOrder, wellContribs.toOrder + wellContribs.h_Ccols_ocl.size());
+
+        s.Cnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Cnnzs_ocl.size());
+        s.Dnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Dnnzs_ocl.size());
+        s.Bnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Bnnzs_ocl.size());
+        s.Ccols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * wellContribs.h_Ccols_ocl.size());
+        s.Bcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * wellContribs.h_Bcols_ocl.size());
+        s.val_pointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size());
+        s.toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * toOrder.size());
+    }
+
+    void WellContributionsOCLContainer::copy_to_gpu(WellContributions &wellContribs){
+        cl::Event event;
+        queue->enqueueWriteBuffer(s.Cnnzs, CL_TRUE, 0, sizeof(double) * wellContribs.h_Cnnzs_ocl.size(), wellContribs.h_Cnnzs_ocl.data());
+        queue->enqueueWriteBuffer(s.Dnnzs, CL_TRUE, 0, sizeof(double) * wellContribs.h_Dnnzs_ocl.size(), wellContribs.h_Dnnzs_ocl.data());
+        queue->enqueueWriteBuffer(s.Bnnzs, CL_TRUE, 0, sizeof(double) * wellContribs.h_Bnnzs_ocl.size(), wellContribs.h_Bnnzs_ocl.data());
+        queue->enqueueWriteBuffer(s.Ccols, CL_TRUE, 0, sizeof(int) * wellContribs.h_Ccols_ocl.size(), wellContribs.h_Ccols_ocl.data());
+        queue->enqueueWriteBuffer(s.Bcols, CL_TRUE, 0, sizeof(int) * wellContribs.h_Bcols_ocl.size(), wellContribs.h_Bcols_ocl.data());
+        queue->enqueueWriteBuffer(s.val_pointers, CL_TRUE, 0, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size(), wellContribs.h_val_pointers_ocl.data());
+        queue->enqueueWriteBuffer(s.toOrder, CL_TRUE, 0, sizeof(int) * toOrder.size(), toOrder.data(), nullptr, &event);
+        event.wait();
+    }
+
+    void WellContributionsOCLContainer::update_on_gpu(WellContributions &wellContribs){
+        cl::Event event;
+        queue->enqueueWriteBuffer(s.Cnnzs, CL_TRUE, 0, sizeof(double) * wellContribs.h_Cnnzs_ocl.size(), wellContribs.h_Cnnzs_ocl.data());
+        queue->enqueueWriteBuffer(s.Dnnzs, CL_TRUE, 0, sizeof(double) * wellContribs.h_Dnnzs_ocl.size(), wellContribs.h_Dnnzs_ocl.data());
+        queue->enqueueWriteBuffer(s.Bnnzs, CL_TRUE, 0, sizeof(double) * wellContribs.h_Bnnzs_ocl.size(), wellContribs.h_Bnnzs_ocl.data(), nullptr, &event);
+        event.wait();
+    }
+
+    void WellContributionsOCLContainer::setOpenCLContext(cl::Context *context_){
+        this->context = context_;
+    }
+
+    void WellContributionsOCLContainer::setOpenCLQueue(cl::CommandQueue *queue_){
+        this->queue = queue_;
+    }
+
+    void WellContributionsOCLContainer::setKernel(cl::make_kernel<...> *stdwell_apply_){
+        this->stdwell_apply = stdwell_apply_;
+    }
+
+    void WellContributionsOCLContainer::applyStdWells(cl::Buffer& x, cl::Buffer& y){
+        const unsigned int work_group_size = 32;
+        const unsigned int total_work_items = num_std_wells * work_group_size;
+        const unsigned int lmem1 = sizeof(double) * work_group_size;
+        const unsigned int lmem2 = sizeof(double) * dim_wells;
+
+        cl::Event event;
+        event = (*stdwell_apply)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
+                                 s.Cnnzs, s.Dnnzs, s.Bnnzs, s.Ccols, s.Bcols, x, y, s.toOrder, dim, dim_wells, s.val_pointers,
+                                 cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
+    }
+
+    void WellContributionsOCLContainer::apply(cl::Buffer& x, cl::Buffer& y){
+        if(num_std_wells > 0){
+            applyStdWells(x, y);
+        }
+    }
+} // end namespace bda
diff --git a/opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp b/opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp
new file mode 100644
index 000000000..70e85fb90
--- /dev/null
+++ b/opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp
@@ -0,0 +1,70 @@
+/*
+  Copyright 2019 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef WELLCONTRIBUTIONSOCLCONTAINER_HPP
+#define WELLCONTRIBUTIONSOCLCONTAINER_HPP
+
+#include
+#include
+
+namespace bda
+{
+    class WellContributionsOCLContainer
+    {
+    private:
+        unsigned int dim, dim_wells;
+        unsigned int num_std_wells = 0;
+        unsigned int num_ms_wells = 0; // number of MultisegmentWells in this object, must equal multisegments.size()
+        std::vector<int> toOrder;
+
+        typedef struct {
+            cl::Buffer Cnnzs, Dnnzs, Bnnzs;
+            cl::Buffer Ccols, Bcols;
+            cl::Buffer val_pointers, toOrder;
+        } GPU_storage;
+
+        GPU_storage s;
+        cl::Context *context;
+        cl::CommandQueue *queue;
+        cl::make_kernel<...> *stdwell_apply;
+
+        void applyStdWells(cl::Buffer& x, cl::Buffer& y);
+
+    public:
+        WellContributionsOCLContainer();
+        ~WellContributionsOCLContainer();
+
+        void apply(cl::Buffer& x, cl::Buffer& y);
+        void initBuffers(WellContributions &wellContribs);
+        void copy_to_gpu(WellContributions &wellContribs);
+        void update_on_gpu(WellContributions &wellContribs);
+        void setOpenCLContext(cl::Context *context);
+        void setOpenCLQueue(cl::CommandQueue *queue);
+        void setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group);
+        void setKernel(cl::make_kernel<...> *stdwell_apply_);
+    };
+} // end namespace bda
+
+#endif
diff --git a/opm/simulators/linalg/bda/openclKernels.hpp b/opm/simulators/linalg/bda/openclKernels.hpp
index 933634a9f..d7364c9ec 100644
--- a/opm/simulators/linalg/bda/openclKernels.hpp
+++ b/opm/simulators/linalg/bda/openclKernels.hpp
@@ -366,96 +366,78 @@ namespace bda
     }
     )";
 
-    inline const char* add_well_contributions_s = R"(
-    #pragma OPENCL EXTENSION cl_khr_fp64: enable
-    #pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable
-
-    void atomicAdd(volatile __global double *val, const double delta){
-        union{
-            double f;
-            ulong i;
-        } old;
-
-        union{
-            double f;
-            ulong i;
-        } new;
-
-        do{
-            old.f = *val;
-            new.f = old.f + delta;
-        } while(atom_cmpxchg((volatile __global ulong *)val, old.i, new.i) != old.i);
-    }
-
-    __kernel void add_well_contributions(__global const double *valsC,
-                                         __global const double *valsD,
-                                         __global const double *valsB,
-                                         __global const int *colsC,
-                                         __global const int *colsB,
-                                         __global const double *x,
-                                         __global double *y,
-                                         const unsigned int blnc,
-                                         const unsigned int blnr,
-                                         __global const unsigned int *rowptr,
-                                         __local double *localSum,
-                                         __local double *z1,
-                                         __local double *z2){
+    inline const char* stdwell_apply_s = R"(
+    __kernel void stdwell_apply(__global const double *Cnnzs,
+                                __global const double *Dnnzs,
+                                __global const double *Bnnzs,
+                                __global const int *Ccols,
+                                __global const int *Bcols,
+                                __global const double *x,
+                                __global double *y,
+                                __global const int *toOrder,
+                                const unsigned int dim,
+                                const unsigned int dim_wells,
+                                __global const unsigned int *val_pointers,
+                                __local double *localSum,
+                                __local double *z1,
+                                __local double *z2){
         int wgId = get_group_id(0);
         int wiId = get_local_id(0);
-        int valSize = rowptr[wgId + 1] - rowptr[wgId];
-        int valsPerBlock = blnc*blnr;
+        int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
+        int valsPerBlock = dim*dim_wells;
         int numActiveWorkItems = (32/valsPerBlock)*valsPerBlock;
         int numBlocksPerWarp = 32/valsPerBlock;
-        int c = wiId % blnc;
-        int r = (wiId/blnc) % blnr;
+        int c = wiId % dim;
+        int r = (wiId/dim) % dim_wells;
         double temp;
 
+        barrier(CLK_LOCAL_MEM_FENCE);
+
         localSum[wiId] = 0;
         if(wiId < numActiveWorkItems){
-            int b = wiId/valsPerBlock + rowptr[wgId];
-            while(b < valSize + rowptr[wgId]){
-                int colIdx = colsB[b];
-                localSum[wiId] += valsB[b*blnc*blnr + r*blnc + c]*x[colIdx*blnc + c];
+            int b = wiId/valsPerBlock + val_pointers[wgId];
+            while(b < valSize + val_pointers[wgId]){
+                int colIdx = toOrder[Bcols[b]];
+                localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
                 b += numBlocksPerWarp;
             }
-        }
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        int stride = valsPerBlock;
-        if(wiId < stride){
-            localSum[wiId] += localSum[wiId + stride];
-        }
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        if(c == 0 && wiId < valsPerBlock){
-            for(stride = 2; stride > 0; stride /= 2){
-                localSum[wiId] += localSum[wiId + stride];
+            if(wiId < valsPerBlock){
+                localSum[wiId] += localSum[wiId + valsPerBlock];
+            }
+
+            b = wiId/valsPerBlock + val_pointers[wgId];
+
+            if(wiId < valsPerBlock){
+                if(c == 0 || c == 2) {localSum[wiId] += localSum[wiId + 2];}
+                if(c == 0 || c == 1) {localSum[wiId] += localSum[wiId + 1];}
+            }
+
+            if(c == 0 && wiId < valsPerBlock){
+                z1[r] = localSum[wiId];
             }
-            z1[r] = localSum[wiId];
         }
 
         barrier(CLK_LOCAL_MEM_FENCE);
 
-        if(wiId < blnr){
+        if(wiId < dim_wells){
             temp = 0.0;
-            for(unsigned int i = 0; i < blnr; ++i){
-                temp += valsD[wgId*blnr*blnr + wiId*blnr + i]*z1[i];
+            for(unsigned int i = 0; i < dim_wells; ++i){
+                temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
             }
             z2[wiId] = temp;
         }
 
-        barrier(CLK_GLOBAL_MEM_FENCE);
+        barrier(CLK_LOCAL_MEM_FENCE);
 
-        if(wiId < blnc*valSize){
+        if(wiId < dim*valSize){
             temp = 0.0;
-            int bb = wiId/blnc + rowptr[wgId];
-            int colIdx = colsC[bb];
-            for (unsigned int j = 0; j < blnr; ++j){
-                temp += valsC[bb*blnc*blnr + j*blnc + c]*z2[j];
+            int bb = wiId/dim + val_pointers[wgId];
+            for (unsigned int j = 0; j < dim_wells; ++j){
+                temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
             }
-            atomicAdd(&y[colIdx*blnc + c], -temp);
+            int colIdx = toOrder[Ccols[bb]];
+            y[colIdx*dim + c] -= temp;
         }
     }
     )";
 
diff --git a/opm/simulators/linalg/bda/openclSolverBackend.cpp b/opm/simulators/linalg/bda/openclSolverBackend.cpp
index 749ddb905..50a549faf 100644
--- a/opm/simulators/linalg/bda/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp
@@ -51,6 +51,7 @@ using Dune::Timer;
 template <unsigned int block_size>
 openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_) {
     prec = new Preconditioner(LEVEL_SCHEDULING, GRAPH_COLORING, verbosity_);
+    wcontainer = new WContainer();
 }
 
 
@@ -180,20 +181,6 @@ void openclSolverBackend<block_size>::spmv_blocked_w(cl::Buffer vals, cl::Buffer
     }
 }
 
-template <unsigned int block_size>
-void openclSolverBackend<block_size>::stdwell_w(cl::Buffer Cnnzs, cl::Buffer Dnnzs, cl::Buffer Bnnzs, cl::Buffer Ccols, cl::Buffer Bcols, cl::Buffer x, cl::Buffer y, cl::Buffer val_pointers)
-{
-    const unsigned int work_group_size = 32;
-    const unsigned int total_work_items = num_std_wells * work_group_size;
-    const unsigned int lmem1 = sizeof(double) * work_group_size;
-    const unsigned int lmem2 = sizeof(double) * dim_wells;
-
-    cl::Event event;
-    event = (*add_well_contributions_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
-                                        Cnnzs, Dnnzs, Bnnzs, Ccols, Bcols, x, y, dim_weqs, dim_wells, val_pointers,
-                                        cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
-}
-
 template <unsigned int block_size>
 void openclSolverBackend<block_size>::gpu_pbicgstab(BdaResult& res) {
     float it;
@@ -249,7 +236,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(BdaResult& res) {
         // v = A * pw
         t_spmv.start();
         spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_pw, d_v);
         t_spmv.stop();
 
         // apply wellContributions
         t_well.start();
-        stdwell_w(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_pw, d_v, d_val_pointers);
+        wcontainer->apply(d_pw, d_v);
         t_well.stop();
 
@@ -283,7 +270,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(BdaResult& res) {
 
         // apply wellContributions
         t_well.start();
-        stdwell_w(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_s, d_t, d_val_pointers);
+        wcontainer->apply(d_s, d_t);
         t_well.stop();
 
         t_rest.start();
@@ -331,7 +318,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(BdaResult& res) {
 
 template <unsigned int block_size>
-void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) {
+void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, WellContributions &wellContribs) {
     this->N = N_;
     this->nnz = nnz_;
     this->nnzb = nnz_ / block_size / block_size;
@@ -473,7 +460,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
         source.emplace_back(std::make_pair(spmv_blocked_s, strlen(spmv_blocked_s)));
         source.emplace_back(std::make_pair(ILU_apply1_s, strlen(ILU_apply1_s)));
         source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
-        source.emplace_back(std::make_pair(add_well_contributions_s, strlen(add_well_contributions_s)));
+        source.emplace_back(std::make_pair(stdwell_apply_s, strlen(stdwell_apply_s)));
         program = cl::Program(*context, source);
         program.build(devices);
@@ -483,6 +470,8 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
         prec->setOpenCLContext(context.get());
         prec->setOpenCLQueue(queue.get());
+        wcontainer->setOpenCLContext(context.get());
+        wcontainer->setOpenCLQueue(queue.get());
 
         rb = new double[N];
         tmp = new double[N];
@@ -507,6 +496,8 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
         d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
         d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
 
+        wcontainer->initBuffers(wellContribs);
+
         // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
         // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
@@ -518,9 +509,13 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
         spmv_blocked_k.reset(new cl::make_kernel<...>(cl::Kernel(program, "spmv_blocked")));
         ILU_apply1_k.reset(new cl::make_kernel<...>(cl::Kernel(program, "ILU_apply1")));
         ILU_apply2_k.reset(new cl::make_kernel<...>(cl::Kernel(program, "ILU_apply2")));
-        add_well_contributions_k.reset(new cl::make_kernel<...>(cl::Kernel(program, "add_well_contributions")));
+        stdwell_apply_k.reset(new cl::make_kernel<...>(cl::Kernel(program, "stdwell_apply")));
 
         prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());
+        wcontainer->setKernel(stdwell_apply_k.get());
 
     } catch (const cl::Error& error) {
         std::ostringstream oss;
@@ -536,23 +531,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
     initialized = true;
 } // end initialize()
 
-template <unsigned int block_size>
-void openclSolverBackend<block_size>::initialize_wellContribs(WellContributions& wellContribs){
-    dim_weqs = wellContribs.dim;
-    dim_wells = wellContribs.dim_wells;
-    num_blocks = wellContribs.num_blocks;
-    num_std_wells = wellContribs.num_std_wells;
-
-    if(num_std_wells > 0){
-        d_Cnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim_weqs * dim_wells);
-        d_Dnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
-        d_Bnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim_weqs * dim_wells);
-        d_Ccols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
-        d_Bcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
-        d_val_pointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
-    }
-}
-
 template <unsigned int block_size>
 void openclSolverBackend<block_size>::finalize() {
     delete[] rb;
@@ -561,11 +539,12 @@ void openclSolverBackend<block_size>::finalize() {
     delete[] vals_contiguous;
 #endif
     delete prec;
+    delete wcontainer;
 } // end finalize()
 
 
 template <unsigned int block_size>
-void openclSolverBackend<block_size>::copy_system_to_gpu() {
+void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions &wellContribs) {
     Timer t;
     cl::Event event;
 
@@ -587,6 +566,8 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
     queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
     event.wait();
 
+    wcontainer->copy_to_gpu(wellContribs);
+
     if (verbosity > 2) {
         std::ostringstream out;
         out << "openclSolver::copy_system_to_gpu(): " << t.stop() << " s";
@@ -594,33 +575,9 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
     }
 } // end copy_system_to_gpu()
 
-template <unsigned int block_size>
-void openclSolverBackend<block_size>::copy_wells_to_gpu(WellContributions& wellContribs) {
-    wellContribs.setReordering(toOrder, true);
-
-    if(num_std_wells > 0){
-        Timer t;
-        cl::Event event;
-
-        queue->enqueueWriteBuffer(d_Cnnzs, CL_TRUE, 0, sizeof(double) * num_blocks * dim_weqs * dim_wells, wellContribs.h_Cnnzs_ocl);
-        queue->enqueueWriteBuffer(d_Dnnzs, CL_TRUE, 0, sizeof(double) * num_std_wells * dim_wells * dim_wells, wellContribs.h_Dnnzs_ocl);
-        queue->enqueueWriteBuffer(d_Bnnzs, CL_TRUE, 0, sizeof(double) * num_blocks * dim_weqs * dim_wells, wellContribs.h_Bnnzs_ocl);
-        queue->enqueueWriteBuffer(d_Ccols, CL_TRUE, 0, sizeof(int) * num_blocks, wellContribs.h_Ccols_ocl);
-        queue->enqueueWriteBuffer(d_Bcols, CL_TRUE, 0, sizeof(int) * num_blocks, wellContribs.h_Bcols_ocl);
-        queue->enqueueWriteBuffer(d_val_pointers, CL_TRUE, 0, sizeof(unsigned int) * (num_std_wells + 1), wellContribs.val_pointers, nullptr, &event);
-        event.wait();
-
-        if (verbosity > 2) {
-            std::ostringstream out;
-            out << "openclSolver::copy_wells_to_gpu(): " << t.stop() << " s";
-            OpmLog::info(out.str());
-        }
-    }
-}
-
 // don't copy rowpointers and colindices, they stay the same
 template <unsigned int block_size>
-void openclSolverBackend<block_size>::update_system_on_gpu() {
+void openclSolverBackend<block_size>::update_system_on_gpu(WellContributions &wellContribs) {
     Timer t;
     cl::Event event;
 
@@ -640,6 +597,8 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
     queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
     event.wait();
 
+    wcontainer->update_on_gpu(wellContribs);
+
    if (verbosity > 2) {
        std::ostringstream out;
        out << "openclSolver::update_system_on_gpu(): " << t.stop() << " s";
@@ -752,8 +711,7 @@ void openclSolverBackend<block_size>::get_result(double *x) {
 
 template <unsigned int block_size>
 SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
     if (initialized == false) {
-        initialize(N_, nnz_, dim, vals, rows, cols);
-        initialize_wellContribs(wellContribs);
+        initialize(N_, nnz_, dim, vals, rows, cols, wellContribs);
         if (analysis_done == false) {
             if (!analyse_matrix()) {
                 return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
@@ -763,14 +721,13 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
         if (!create_preconditioner()) {
             return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
         }
-        copy_system_to_gpu();
-        copy_wells_to_gpu(wellContribs);
+        copy_system_to_gpu(wellContribs);
     } else {
         update_system(vals, b);
         if (!create_preconditioner()) {
             return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
         }
-        update_system_on_gpu();
+        update_system_on_gpu(wellContribs);
     }
     solve_system(res);
     return SolverStatus::BDA_SOLVER_SUCCESS;
diff --git a/opm/simulators/linalg/bda/openclSolverBackend.hpp b/opm/simulators/linalg/bda/openclSolverBackend.hpp
index 5ed76d9a7..ec103f725 100644
--- a/opm/simulators/linalg/bda/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp
@@ -21,11 +21,10 @@
 #define OPM_OPENCLSOLVER_BACKEND_HEADER_INCLUDED
 
 #include
-
 #include
 #include
 #include
-
+#include <opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp>
 #include
 
 namespace bda
@@ -35,9 +34,9 @@ namespace bda
 template <unsigned int block_size>
 class openclSolverBackend : public BdaSolver<block_size>
 {
-
     typedef BdaSolver<block_size> Base;
     typedef BILU0<block_size> Preconditioner;
+    typedef WellContributionsOCLContainer WContainer;
 
     using Base::N;
     using Base::Nb;
@@ -51,7 +50,6 @@ class openclSolverBackend : public BdaSolver<block_size>
     using Base::initialized;
 
 private:
-
     double *rb = nullptr;               // reordered b vector, the matrix is reordered, so b must also be
     double *vals_contiguous = nullptr;  // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
 
@@ -64,13 +62,6 @@ private:
     cl::Buffer d_tmp;        // used as tmp GPU buffer for dot() and norm()
     double *tmp = nullptr;   // used as tmp CPU buffer for dot() and norm()
 
-    unsigned int num_blocks, dim_weqs, dim_wells, num_std_wells;
-    unsigned int *h_val_pointers;
-    int *h_Ccols, *h_Bcols;
-    double *h_Cnnzs, *h_Dnnzs, *h_Bnnzs;
-    cl::Buffer d_Cnnzs, d_Dnnzs, d_Bnnzs;
-    cl::Buffer d_Ccols, d_Bcols, d_val_pointers;
-
     // shared pointers are also passed to other objects
     cl::Program program;
     std::shared_ptr<cl::Context> context;
@@ -82,9 +73,13 @@ private:
     std::unique_ptr<cl::make_kernel<...> > spmv_blocked_k;
     std::shared_ptr<cl::make_kernel<...> > ILU_apply1_k;
     std::shared_ptr<cl::make_kernel<...> > ILU_apply2_k;
-    std::shared_ptr<cl::make_kernel<...> > add_well_contributions_k;
+    std::shared_ptr<cl::make_kernel<...> > stdwell_apply_k;
 
     Preconditioner *prec = nullptr;                    // only supported preconditioner is BILU0
+    WContainer *wcontainer = nullptr;
     int *toOrder = nullptr, *fromOrder = nullptr;      // BILU0 reorders rows of the matrix via these mappings
     std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr;   // original matrix
     BlockedMatrix<block_size> *rmat = nullptr;                   // reordered matrix, used for spmv
@@ -148,17 +143,13 @@ private:
     /// \param[in] vals    array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
     /// \param[in] rows    array of rowPointers, contains N/dim+1 values
     /// \param[in] cols    array of columnIndices, contains nnz values
-    void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
-
-    void initialize_wellContribs(WellContributions& wellContribs);
+    void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols, WellContributions &wellContribs);
 
     /// Clean memory
     void finalize();
 
     /// Copy linear system to GPU
-    void copy_system_to_gpu();
-
-    void copy_wells_to_gpu(WellContributions& wellContribs);
+    void copy_system_to_gpu(WellContributions &wellContribs);
 
     /// Reorder the linear system so it corresponds with the coloring
     /// \param[in] vals    array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
@@ -166,7 +157,7 @@ private:
     void update_system(double *vals, double *b);
 
     /// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
-    void update_system_on_gpu();
+    void update_system_on_gpu(WellContributions &wellContribs);
 
     /// Analyse sparsity pattern to extract parallelism
     /// \return true iff analysis was successful
@@ -179,7 +170,7 @@ private:
     /// Solve linear system
     /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res         summary of solver result
-    void solve_system(BdaResult &res);
+    void solve_system(WellContributions &wellContribs, BdaResult &res);
 
 public:
 
diff --git a/opm/simulators/wells/BlackoilWellModel_impl.hpp b/opm/simulators/wells/BlackoilWellModel_impl.hpp
index d8df47787..20471f014 100644
--- a/opm/simulators/wells/BlackoilWellModel_impl.hpp
+++ b/opm/simulators/wells/BlackoilWellModel_impl.hpp
@@ -916,6 +916,8 @@ namespace Opm {
     {
         // prepare for StandardWells
         wellContribs.setBlockSize(StandardWell<TypeTag>::numEq, StandardWell<TypeTag>::numStaticWellEq);
+
+#if HAVE_CUDA
         for(unsigned int i = 0; i < well_container_.size(); i++){
             auto& well = well_container_[i];
             std::shared_ptr<StandardWell<TypeTag> > derived = std::dynamic_pointer_cast<StandardWell<TypeTag> >(well);
@@ -928,6 +930,7 @@ namespace Opm {
 
         // allocate memory for data from StandardWells
        wellContribs.alloc();
+#endif
 
        for(unsigned int i = 0; i < well_container_.size(); i++){
            auto& well = well_container_[i];
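
Reviewer note: the sketch below is not part of the patch; it only illustrates, under the assumption that the OpenCL context, queue and compiled stdwell_apply kernel are set up by openclSolverBackend::initialize() as above, how the new bda::WellContributionsOCLContainer is driven. The function name applyWellsSketch and its parameters are hypothetical.

    // Hypothetical wiring sketch, mirroring the calls added in initialize(),
    // copy_system_to_gpu() and gpu_pbicgstab() in this patch.
    #include <opm/simulators/linalg/bda/WellContributions.hpp>
    #include <opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp>

    void applyWellsSketch(Opm::WellContributions& wellContribs,        // already filled via addMatrix()
                          bda::WellContributionsOCLContainer& wcontainer,
                          cl::Context* context, cl::CommandQueue* queue,
                          cl::Buffer& d_x, cl::Buffer& d_y)
    {
        wcontainer.setOpenCLContext(context);     // share the solver's OpenCL context
        wcontainer.setOpenCLQueue(queue);         // and its command queue
        // setKernel() would be called here with the compiled stdwell_apply functor.

        wcontainer.initBuffers(wellContribs);     // size device buffers from the host-side vectors
        wcontainer.copy_to_gpu(wellContribs);     // upload C, D, B blocks, column indices and val_pointers

        wcontainer.apply(d_x, d_y);               // subtract the StandardWell contributions from y
    }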