From a9c2a684c6c1af339a4d41dbb30af0037e670da1 Mon Sep 17 00:00:00 2001 From: Jose Eduardo Bueno Date: Wed, 11 Nov 2020 14:34:10 -0300 Subject: [PATCH 1/2] Removed WellContributionsOCLContainer class (rebase) --- CMakeLists_files.cmake | 1 - opm/simulators/linalg/ISTLSolverEbos.hpp | 7 +- opm/simulators/linalg/bda/BdaBridge.cpp | 2 +- opm/simulators/linalg/bda/BdaBridge.hpp | 7 +- opm/simulators/linalg/bda/BdaSolver.hpp | 1 - .../linalg/bda/WellContributions.cpp | 173 ++++++++-- .../linalg/bda/WellContributions.cu | 32 +- .../linalg/bda/WellContributions.hpp | 69 ++-- .../bda/WellContributionsOCLContainer.cpp | 176 ---------- .../bda/WellContributionsOCLContainer.hpp | 78 ----- .../linalg/bda/openclSolverBackend.cpp | 305 +++++++++--------- .../linalg/bda/openclSolverBackend.hpp | 24 +- .../wells/BlackoilWellModel_impl.hpp | 2 - 13 files changed, 382 insertions(+), 495 deletions(-) delete mode 100644 opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp delete mode 100644 opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake index a5c7ed878..852bbe082 100644 --- a/CMakeLists_files.cmake +++ b/CMakeLists_files.cmake @@ -61,7 +61,6 @@ if(OPENCL_FOUND) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp) - list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp) endif() if(MPI_FOUND) diff --git a/opm/simulators/linalg/ISTLSolverEbos.hpp b/opm/simulators/linalg/ISTLSolverEbos.hpp index 0cefb69eb..d86c613da 100644 --- a/opm/simulators/linalg/ISTLSolverEbos.hpp +++ b/opm/simulators/linalg/ISTLSolverEbos.hpp @@ -251,6 +251,12 @@ namespace Opm if (use_gpu) { const std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode); WellContributions wellContribs(gpu_mode); +#if HAVE_OPENCL + if(gpu_mode.compare("opencl") == 0){ + const auto openclBackend = static_cast*>(&bdaBridge->getBackend()); + wellContribs.setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get()); + } +#endif if (!useWellConn_) { simulator_.problem().wellModel().getWellContributions(wellContribs); } @@ -301,7 +307,6 @@ namespace Opm const std::any& parallelInformation() const { return parallelInformation_; } protected: - // 3x3 matrix block inversion was unstable at least 2.3 until and including // 2.5.0. There may still be some issue with the 4x4 matrix block inversion // we therefore still use the block inversion in OPM diff --git a/opm/simulators/linalg/bda/BdaBridge.cpp b/opm/simulators/linalg/bda/BdaBridge.cpp index 39051dcb0..71e797450 100644 --- a/opm/simulators/linalg/bda/BdaBridge.cpp +++ b/opm/simulators/linalg/bda/BdaBridge.cpp @@ -222,7 +222,7 @@ void BdaBridge::get_result(BridgeVector } } -#define INSTANTIATE_BDA_FUNCTIONS(n) \ +#define INSTANTIATE_BDA_FUNCTIONS(n) \ template BdaBridge, std::allocator > >, \ Dune::BlockVector, std::allocator > >, \ n>::BdaBridge \ diff --git a/opm/simulators/linalg/bda/BdaBridge.hpp b/opm/simulators/linalg/bda/BdaBridge.hpp index 18e623770..962acc9b4 100644 --- a/opm/simulators/linalg/bda/BdaBridge.hpp +++ b/opm/simulators/linalg/bda/BdaBridge.hpp @@ -47,8 +47,8 @@ template class BdaBridge { private: - std::unique_ptr > backend; bool use_gpu = false; + std::unique_ptr > backend; public: /// Construct a BdaBridge @@ -61,7 +61,6 @@ public: /// \param[in] opencl_ilu_reorder select either level_scheduling or graph_coloring, see BILU0.hpp for explanation BdaBridge(std::string gpu_mode, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder); - /// Solve linear system, A*x = b /// \warning Values of A might get overwritten! /// \param[in] mat matrix A, should be of type Dune::BCRSMatrix @@ -80,6 +79,10 @@ public: return use_gpu; } + const bda::BdaSolver& getBackend() const { + return *backend; + } + }; // end class BdaBridge } diff --git a/opm/simulators/linalg/bda/BdaSolver.hpp b/opm/simulators/linalg/bda/BdaSolver.hpp index f9b0dcb7b..cef79b5e1 100644 --- a/opm/simulators/linalg/bda/BdaSolver.hpp +++ b/opm/simulators/linalg/bda/BdaSolver.hpp @@ -66,7 +66,6 @@ namespace bda bool initialized = false; public: - /// Construct a BdaSolver, can be cusparseSolver or openclSolver /// \param[in] linear_solver_verbosity verbosity of solver /// \param[in] maxit maximum number of iterations for solver diff --git a/opm/simulators/linalg/bda/WellContributions.cpp b/opm/simulators/linalg/bda/WellContributions.cpp index e9134354b..3eca058af 100644 --- a/opm/simulators/linalg/bda/WellContributions.cpp +++ b/opm/simulators/linalg/bda/WellContributions.cpp @@ -17,15 +17,13 @@ along with OPM. If not, see . */ - #include // CMake #include #include - #include #include #include -#include "opm/simulators/linalg/bda/WellContributions.hpp" +#include namespace Opm { @@ -34,75 +32,180 @@ WellContributions::WellContributions(std::string gpu_mode){ if(gpu_mode.compare("cusparse") == 0){ cuda_gpu = true; } - - if(gpu_mode.compare("opencl") == 0){ + else if(gpu_mode.compare("opencl") == 0){ opencl_gpu = true; } + else{ + OPM_THROW(std::logic_error, "Error: invalid GPU mode"); + } } - WellContributions::~WellContributions() { -#if HAVE_CUDA // delete MultisegmentWellContributions - for (auto ms : multisegments) { + for (auto ms: multisegments) { delete ms; } multisegments.clear(); +#if HAVE_CUDA if(cuda_gpu){ freeCudaMemory(); // should come before 'delete[] h_x' } #endif + + if(num_std_wells > 0){ + delete[] val_pointers; +#if HAVE_OPENCL + if(opencl_gpu){ + delete[] h_toOrder; + } +#endif + } + +#if HAVE_OPENCL + if(opencl_gpu){ + if(num_ms_wells > 0){ + delete[] h_x; + delete[] h_y; + } + } +#endif } +#if HAVE_OPENCL +void WellContributions::setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_){ + this->context = context_; + this->queue = queue_; +} + +void WellContributions::setKernel(kernel_type *kernel_){ + this->kernel = kernel_; +} + +void WellContributions::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){ + const unsigned int work_group_size = 32; + const unsigned int total_work_items = num_std_wells * work_group_size; + const unsigned int lmem1 = sizeof(double) * work_group_size; + const unsigned int lmem2 = sizeof(double) * dim_wells; + + cl::Event event; + event = (*kernel)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), + *d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl, + cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2)); +} + +void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){ + if(h_x == nullptr){ + h_x = new double[N]; + h_y = new double[N]; + } + + if(h_toOrder == nullptr){ + h_toOrder = new int[Nb]; + } + + if(!read_toOrder){ + events.resize(1); + queue->enqueueReadBuffer(d_toOrder, CL_FALSE, 0, sizeof(int) * Nb, h_toOrder, nullptr, &events[0]); + events[0].wait(); + events.clear(); + read_toOrder = true; + } + + events.resize(2); + queue->enqueueReadBuffer(d_x, CL_FALSE, 0, sizeof(double) * N, h_x, nullptr, &events[0]); + queue->enqueueReadBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y, nullptr, &events[1]); + cl::WaitForEvents(events); + events.clear(); + + // actually apply MultisegmentWells + for(Opm::MultisegmentWellContribution *well: multisegments){ + well->setReordering(h_toOrder, true); + well->apply(h_x, h_y); + } + + // copy vector y from CPU to GPU + events.resize(1); + queue->enqueueWriteBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y, nullptr, &events[0]); + events[0].wait(); + events.clear(); +} + +void WellContributions::apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){ + if(num_std_wells > 0){ + apply_stdwells(d_x, d_y, d_toOrder); + } + + if(num_ms_wells > 0){ + apply_mswells(d_x, d_y, d_toOrder); + } +} +#endif + void WellContributions::addMatrix([[maybe_unused]] MatrixType type, [[maybe_unused]] int *colIndices, [[maybe_unused]] double *values, [[maybe_unused]] unsigned int val_size) { + if (!allocated) { + OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions"); + } #if HAVE_CUDA if(cuda_gpu){ - if (!allocated) { - OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions"); - } addMatrixGpu(type, colIndices, values, val_size); } #endif #if HAVE_OPENCL if(opencl_gpu){ - if(h_val_pointers_ocl.empty()){ - h_val_pointers_ocl.push_back(0); - } - switch (type) { case MatrixType::C: - h_Ccols_ocl.insert(h_Ccols_ocl.end(), colIndices, colIndices + val_size); - h_Cnnzs_ocl.insert(h_Cnnzs_ocl.end(), values, values + val_size * dim * dim_wells); + events.resize(2); + queue->enqueueWriteBuffer(*d_Cnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]); + queue->enqueueWriteBuffer(*d_Ccols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]); + cl::WaitForEvents(events); + events.clear(); break; case MatrixType::D: - h_Dnnzs_ocl.insert(h_Dnnzs_ocl.end(), values, values + dim_wells * dim_wells); + events.resize(1); + queue->enqueueWriteBuffer(*d_Dnnzs_ocl, CL_FALSE, sizeof(double) * num_std_wells_so_far * dim_wells * dim_wells, sizeof(double) * dim_wells * dim_wells, values, nullptr, &events[0]); + events[0].wait(); + events.clear(); break; case MatrixType::B: - h_Bcols_ocl.insert(h_Bcols_ocl.end(), colIndices, colIndices + val_size); - h_Bnnzs_ocl.insert(h_Bnnzs_ocl.end(), values, values + val_size * dim * dim_wells); - h_val_pointers_ocl.push_back(h_val_pointers_ocl.back() + val_size); + events.resize(2); + queue->enqueueWriteBuffer(*d_Bnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]); + queue->enqueueWriteBuffer(*d_Bcols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]); + cl::WaitForEvents(events); + events.clear(); + + val_pointers[num_std_wells_so_far] = num_blocks_so_far; + if (num_std_wells_so_far == num_std_wells - 1) { + val_pointers[num_std_wells] = num_blocks; + events.resize(1); + queue->enqueueWriteBuffer(*d_val_pointers_ocl, CL_FALSE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers, nullptr, &events[0]); + events[0].wait(); + events.clear(); + } break; default: OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributions::addMatrix()"); } } - #endif + if(MatrixType::B == type) { + num_blocks_so_far += val_size; + num_std_wells_so_far++; + } + #if !HAVE_CUDA && !HAVE_OPENCL OPM_THROW(std::logic_error, "Error cannot add StandardWell matrix on GPU because neither CUDA nor OpenCL were found by cmake"); #endif } - void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_) { dim = dim_; @@ -115,7 +218,6 @@ void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_) } } -#if HAVE_CUDA void WellContributions::addNumBlocks(unsigned int numBlocks) { if (allocated) { @@ -128,19 +230,36 @@ void WellContributions::addNumBlocks(unsigned int numBlocks) void WellContributions::alloc() { if (num_std_wells > 0) { - allocStandardWells(); + val_pointers = new unsigned int[num_std_wells + 1]; + +#if HAVE_CUDA + if(cuda_gpu){ + allocStandardWells(); + } +#endif + +#if HAVE_OPENCL + if(opencl_gpu){ + d_Cnnzs_ocl = std::make_unique(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells); + d_Dnnzs_ocl = std::make_unique(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells); + d_Bnnzs_ocl = std::make_unique(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells); + d_Ccols_ocl = std::make_unique(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks); + d_Bcols_ocl = std::make_unique(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks); + d_val_pointers_ocl = std::make_unique(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1)); + } +#endif allocated = true; } } -#endif void WellContributions::addMultisegmentWellContribution(unsigned int dim_, unsigned int dim_wells_, - unsigned int Nb, unsigned int Mb, + unsigned int Nb_, unsigned int Mb, unsigned int BnumBlocks, std::vector &Bvalues, std::vector &BcolIndices, std::vector &BrowPointers, unsigned int DnumBlocks, double *Dvalues, UMFPackIndex *DcolPointers, UMFPackIndex *DrowIndices, std::vector &Cvalues) { assert(dim==dim_); + this->Nb = Nb_; this->N = Nb * dim_; MultisegmentWellContribution *well = new MultisegmentWellContribution(dim_, dim_wells_, Nb, Mb, BnumBlocks, Bvalues, BcolIndices, BrowPointers, DnumBlocks, Dvalues, DcolPointers, DrowIndices, Cvalues); multisegments.emplace_back(well); diff --git a/opm/simulators/linalg/bda/WellContributions.cu b/opm/simulators/linalg/bda/WellContributions.cu index 66e7abbc6..a59ae3ac9 100644 --- a/opm/simulators/linalg/bda/WellContributions.cu +++ b/opm/simulators/linalg/bda/WellContributions.cu @@ -127,16 +127,13 @@ __global__ void apply_well_contributions( void WellContributions::allocStandardWells() { - if (num_std_wells > 0) { - cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells); - cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells); - cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells); - cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks); - cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks); - val_pointers = new unsigned int[num_std_wells + 1]; - cudaMalloc((void**)&d_val_pointers, sizeof(int) * (num_std_wells + 1)); - cudaCheckLastError("apply_gpu malloc failed"); - } + cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells); + cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells); + cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells); + cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks); + cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks); + cudaMalloc((void**)&d_val_pointers, sizeof(unsigned int) * (num_std_wells + 1)); + cudaCheckLastError("apply_gpu malloc failed"); } void WellContributions::freeCudaMemory() { @@ -147,14 +144,13 @@ void WellContributions::freeCudaMemory() { cudaFree(d_Bnnzs); cudaFree(d_Ccols); cudaFree(d_Bcols); - delete[] val_pointers; cudaFree(d_val_pointers); } if (num_ms_wells > 0 && h_x) { cudaFreeHost(h_x); - cudaFreeHost(h_y); - h_x = h_y = nullptr; // Mark as free for constructor + cudaFreeHost(h_y); + h_x = h_y = nullptr; // Mark as free for constructor } } @@ -200,10 +196,6 @@ void WellContributions::apply(double *d_x, double *d_y) void WellContributions::addMatrixGpu(MatrixType type, int *colIndices, double *values, unsigned int val_size) { - if (!allocated) { - OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions"); - } - switch (type) { case MatrixType::C: cudaMemcpy(d_Cnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice); @@ -218,17 +210,13 @@ void WellContributions::addMatrixGpu(MatrixType type, int *colIndices, double *v val_pointers[num_std_wells_so_far] = num_blocks_so_far; if (num_std_wells_so_far == num_std_wells - 1) { val_pointers[num_std_wells] = num_blocks; + cudaMemcpy(d_val_pointers, val_pointers, sizeof(unsigned int) * (num_std_wells + 1), cudaMemcpyHostToDevice); } - cudaMemcpy(d_val_pointers, val_pointers, sizeof(int) * (num_std_wells + 1), cudaMemcpyHostToDevice); break; default: OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributions::addMatrix()"); } cudaCheckLastError("WellContributions::addMatrix() failed"); - if (MatrixType::B == type) { - num_blocks_so_far += val_size; - num_std_wells_so_far++; - } } void WellContributions::setCudaStream(cudaStream_t stream_) diff --git a/opm/simulators/linalg/bda/WellContributions.hpp b/opm/simulators/linalg/bda/WellContributions.hpp index ac001e01b..7267fe1f0 100644 --- a/opm/simulators/linalg/bda/WellContributions.hpp +++ b/opm/simulators/linalg/bda/WellContributions.hpp @@ -71,30 +71,45 @@ public: B }; - unsigned int dim; // number of columns in blocks in B and C, equal to StandardWell::numEq - unsigned int dim_wells; // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq - std::vector multisegments; - -#if HAVE_OPENCL - std::vector h_Cnnzs_ocl, h_Dnnzs_ocl, h_Bnnzs_ocl; - std::vector h_Ccols_ocl, h_Bcols_ocl; - std::vector h_val_pointers_ocl; -#endif - private: bool opencl_gpu = false; bool cuda_gpu = false; + bool allocated = false; unsigned int N; // number of rows (not blockrows) in vectors x and y - unsigned int num_ms_wells = 0; // number of MultisegmentWells in this object, must equal multisegments.size() - -#if HAVE_CUDA - bool allocated = false; + unsigned int Nb; // number of blockrows in vectors x and y + unsigned int dim; // number of columns in blocks in B and C, equal to StandardWell::numEq + unsigned int dim_wells; // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq unsigned int num_blocks = 0; // total number of blocks in all wells unsigned int num_std_wells = 0; // number of StandardWells in this object + unsigned int num_ms_wells = 0; // number of MultisegmentWells in this object, must equal multisegments.size() unsigned int num_blocks_so_far = 0; // keep track of where next data is written unsigned int num_std_wells_so_far = 0; // keep track of where next data is written unsigned int *val_pointers = nullptr; // val_pointers[wellID] == index of first block for this well in Ccols and Bcols + + double *h_x = nullptr; + double *h_y = nullptr; + std::vector multisegments; + +#if HAVE_OPENCL + typedef cl::make_kernel kernel_type; + + cl::Context *context; + cl::CommandQueue *queue; + kernel_type *kernel; + std::vector events; + + std::unique_ptr d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl; + std::unique_ptr d_Ccols_ocl, d_Bcols_ocl; + std::unique_ptr d_val_pointers_ocl; + + bool read_toOrder = false; + int *h_toOrder = nullptr; +#endif + +#if HAVE_CUDA cudaStream_t stream; // data for StandardWells, could remain nullptrs if not used @@ -106,8 +121,12 @@ private: double *d_z1 = nullptr; double *d_z2 = nullptr; unsigned int *d_val_pointers = nullptr; - double *h_x = nullptr; - double *h_y = nullptr; + + /// Allocate GPU memory for StandardWells + void allocStandardWells(); + + /// Free GPU memory allocated with cuda. + void freeCudaMemory(); /// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called /// \param[in] type indicate if C, D or B is sent @@ -115,12 +134,6 @@ private: /// \param[in] values array of nonzeroes /// \param[in] val_size number of blocks in C or B, ignored for D void addMatrixGpu(MatrixType type, int *colIndices, double *values, unsigned int val_size); - - /// Allocate GPU memory for StandardWells - void allocStandardWells(); - - /// Free GPU memory allocated with cuda. - void freeCudaMemory(); #endif public: @@ -134,6 +147,15 @@ public: /// \param[in] d_x vector x, must be on GPU /// \param[inout] d_y vector y, must be on GPU void apply(double *d_x, double *d_y); +#endif + +#if HAVE_OPENCL + void setKernel(kernel_type *kernel_); + void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_); + void apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder); + void apply_mswells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder); + void apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder); +#endif unsigned int getNumWells(){ return num_std_wells + num_ms_wells; @@ -145,7 +167,6 @@ public: /// Allocate memory for the StandardWells void alloc(); -#endif /// Create a new WellContributions WellContributions(std::string gpu_mode); @@ -158,7 +179,6 @@ public: /// \param[in] dim_wells number of rows void setBlockSize(unsigned int dim, unsigned int dim_wells); - /// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called /// \param[in] type indicate if C, D or B is sent /// \param[in] colIndices columnindices of blocks in C or B, ignored for D @@ -188,7 +208,6 @@ public: UMFPackIndex *DcolPointers, UMFPackIndex *DrowIndices, std::vector &Cvalues); }; - } //namespace Opm #endif diff --git a/opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp b/opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp deleted file mode 100644 index da8386528..000000000 --- a/opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp +++ /dev/null @@ -1,176 +0,0 @@ -/* - Copyright 2019 Equinor ASA - - This file is part of the Open Porous Media project (OPM). - - OPM is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - OPM is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with OPM. If not, see . -*/ - -#include - -#include -#include -#include - -#include - -namespace bda -{ - using Opm::OpmLog; - using Dune::Timer; - - void WellContributionsOCLContainer::init(Opm::WellContributions &wellContribs, int N_, int Nb_){ - N = N_; - Nb = Nb_; - dim = wellContribs.dim; - dim_wells = wellContribs.dim_wells; - - if(!wellContribs.h_val_pointers_ocl.empty()){ - num_blocks = wellContribs.h_Ccols_ocl.size(); - num_std_wells = wellContribs.h_val_pointers_ocl.size() - 1; - - s.Cnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Cnnzs_ocl.size()); - s.Dnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Dnnzs_ocl.size()); - s.Bnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Bnnzs_ocl.size()); - s.Ccols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * wellContribs.h_Ccols_ocl.size()); - s.Bcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * wellContribs.h_Bcols_ocl.size()); - s.val_pointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size()); - s.toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb); - } - } - - void WellContributionsOCLContainer::reinit(Opm::WellContributions &wellContribs){ - num_blocks = wellContribs.h_Ccols_ocl.size(); - num_std_wells = wellContribs.h_val_pointers_ocl.size() - 1; - - s.Cnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Cnnzs_ocl.size()); - s.Dnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Dnnzs_ocl.size()); - s.Bnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Bnnzs_ocl.size()); - s.Ccols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * wellContribs.h_Ccols_ocl.size()); - s.Bcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * wellContribs.h_Bcols_ocl.size()); - s.val_pointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size()); - } - - void WellContributionsOCLContainer::copy_to_gpu(Opm::WellContributions &wellContribs, int *toOrder_){ - toOrder.insert(toOrder.end(), toOrder_, toOrder_ + Nb); - - if(num_std_wells > 0){ - cl::Event event; - std::vector events(7); - queue->enqueueWriteBuffer(s.Cnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Cnnzs_ocl.size(), wellContribs.h_Cnnzs_ocl.data(), nullptr, &events[0]); - queue->enqueueWriteBuffer(s.Dnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Dnnzs_ocl.size(), wellContribs.h_Dnnzs_ocl.data(), nullptr, &events[1]); - queue->enqueueWriteBuffer(s.Bnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Bnnzs_ocl.size(), wellContribs.h_Bnnzs_ocl.data(), nullptr, &events[2]); - queue->enqueueWriteBuffer(s.Ccols, CL_FALSE, 0, sizeof(int) * wellContribs.h_Ccols_ocl.size(), wellContribs.h_Ccols_ocl.data(), nullptr, &events[3]); - queue->enqueueWriteBuffer(s.Bcols, CL_FALSE, 0, sizeof(int) * wellContribs.h_Bcols_ocl.size(), wellContribs.h_Bcols_ocl.data(), nullptr, &events[4]); - queue->enqueueWriteBuffer(s.val_pointers, CL_FALSE, 0, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size(), wellContribs.h_val_pointers_ocl.data(), nullptr, &events[5]); - queue->enqueueWriteBuffer(s.toOrder, CL_FALSE, 0, sizeof(int) * toOrder.size(), toOrder.data(), nullptr, &events[6]); - event.waitForEvents(events); - } - - if(!wellContribs.multisegments.empty()){ - multisegments = std::move(wellContribs.multisegments); - num_ms_wells = multisegments.size(); - x_msw.reserve(N); - y_msw.reserve(N); - } - } - - void WellContributionsOCLContainer::update_on_gpu(Opm::WellContributions &wellContribs){ - if(num_std_wells > 0){ - if(num_std_wells != wellContribs.h_val_pointers_ocl.size() || num_blocks != wellContribs.h_Ccols_ocl.size()){ - reinit(wellContribs); - } - - cl::Event event; - std::vector events(6); - queue->enqueueWriteBuffer(s.Cnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Cnnzs_ocl.size(), wellContribs.h_Cnnzs_ocl.data(), nullptr, &events[0]); - queue->enqueueWriteBuffer(s.Dnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Dnnzs_ocl.size(), wellContribs.h_Dnnzs_ocl.data(), nullptr, &events[1]); - queue->enqueueWriteBuffer(s.Bnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Bnnzs_ocl.size(), wellContribs.h_Bnnzs_ocl.data(), nullptr, &events[2]); - queue->enqueueWriteBuffer(s.Ccols, CL_FALSE, 0, sizeof(int) * wellContribs.h_Ccols_ocl.size(), wellContribs.h_Ccols_ocl.data(), nullptr, &events[3]); - queue->enqueueWriteBuffer(s.Bcols, CL_FALSE, 0, sizeof(int) * wellContribs.h_Bcols_ocl.size(), wellContribs.h_Bcols_ocl.data(), nullptr, &events[4]); - queue->enqueueWriteBuffer(s.val_pointers, CL_FALSE, 0, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size(), wellContribs.h_val_pointers_ocl.data(), nullptr, &events[5]); - event.waitForEvents(events); - } - - if(!wellContribs.multisegments.empty()){ - multisegments = std::move(wellContribs.multisegments); - } - } - - void WellContributionsOCLContainer::setOpenCLContext(cl::Context *context_){ - this->context = context_; - } - - void WellContributionsOCLContainer::setOpenCLQueue(cl::CommandQueue *queue_){ - this->queue = queue_; - } - - void WellContributionsOCLContainer::setKernel(cl::make_kernel *stdwell_apply_){ - this->stdwell_apply = stdwell_apply_; - } - - void WellContributionsOCLContainer::applyStdWells(cl::Buffer& x, cl::Buffer& y){ - const unsigned int work_group_size = 32; - const unsigned int total_work_items = num_std_wells * work_group_size; - const unsigned int lmem1 = sizeof(double) * work_group_size; - const unsigned int lmem2 = sizeof(double) * dim_wells; - - cl::Event event; - event = (*stdwell_apply)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), - s.Cnnzs, s.Dnnzs, s.Bnnzs, s.Ccols, s.Bcols, x, y, s.toOrder, dim, dim_wells, s.val_pointers, - cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2)); - } - - - void WellContributionsOCLContainer::applyMSWells(cl::Buffer& x, cl::Buffer& y) { - cl::Event event; - std::vector events(2); - - // copy vectors x and y from GPU to CPU - queue->enqueueReadBuffer(x, CL_FALSE, 0, sizeof(double) * N, x_msw.data(), nullptr, &events[0]); - queue->enqueueReadBuffer(y, CL_FALSE, 0, sizeof(double) * N, y_msw.data(), nullptr, &events[1]); - event.waitForEvents(events); - - // actually apply MultisegmentWells - for(auto well: multisegments){ - well->setReordering(toOrder.data(), true); - well->apply(x_msw.data(), y_msw.data()); - } - - // copy vector y from CPU to GPU - queue->enqueueWriteBuffer(y, CL_FALSE, 0, sizeof(double) * N, y_msw.data(), nullptr, &event); - event.wait(); - } - - void WellContributionsOCLContainer::apply(cl::Buffer& x, cl::Buffer& y){ - if(num_std_wells > 0){ - applyStdWells(x, y); - } - - if(num_ms_wells > 0){ - applyMSWells(x, y); - } - } - - WellContributionsOCLContainer::~WellContributionsOCLContainer(){ - if(num_ms_wells > 0){ - for (auto ms : multisegments) { - delete ms; - } - } - } -} // end namespace bda diff --git a/opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp b/opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp deleted file mode 100644 index 08d496141..000000000 --- a/opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp +++ /dev/null @@ -1,78 +0,0 @@ -/* - Copyright 2019 Equinor ASA - - This file is part of the Open Porous Media project (OPM). - - OPM is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - OPM is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with OPM. If not, see . -*/ - -#ifndef WELLCONTRIBUTIONSOCLCONTAINER_HPP -#define WELLCONTRIBUTIONSOCLCONTAINER_HPP - -#include -#include -#include - -namespace bda -{ - class WellContributionsOCLContainer - { - private: - int N, Nb; - unsigned int dim, dim_wells; - unsigned int num_blocks = 0; - unsigned int num_std_wells = 0; - unsigned int num_ms_wells = 0; // number of MultisegmentWells in this object, must equal multisegments.size() - - std::vector toOrder; - std::vector x_msw, y_msw; - std::vector multisegments; - - typedef struct { - cl::Buffer Cnnzs, Dnnzs, Bnnzs; - cl::Buffer Ccols, Bcols; - cl::Buffer val_pointers, toOrder; - } GPU_storage; - - GPU_storage s; - cl::Context *context; - cl::CommandQueue *queue; - cl::make_kernel *stdwell_apply; - - void reinit(Opm::WellContributions &wellContribs); - void applyStdWells(cl::Buffer& x, cl::Buffer& y); - void applyMSWells(cl::Buffer& x, cl::Buffer& y); - - public: - WellContributionsOCLContainer() {} - ~WellContributionsOCLContainer(); - WellContributionsOCLContainer(const WellContributionsOCLContainer&) = delete; - WellContributionsOCLContainer& operator=(const WellContributionsOCLContainer&) = delete; - void apply(cl::Buffer& x, cl::Buffer& y); - void init(Opm::WellContributions &wellContribs, int N, int Nb); - void copy_to_gpu(Opm::WellContributions &wellContribs, int *toOrder_); - void update_on_gpu(Opm::WellContributions &wellContribs); - void setOpenCLContext(cl::Context *context); - void setOpenCLQueue(cl::CommandQueue *queue); - void setKernel(cl::make_kernel *stdwell_apply_); - }; -} // end namespace bda - -#endif diff --git a/opm/simulators/linalg/bda/openclSolverBackend.cpp b/opm/simulators/linalg/bda/openclSolverBackend.cpp index 185b946af..d90c9ac6f 100644 --- a/opm/simulators/linalg/bda/openclSolverBackend.cpp +++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp @@ -46,7 +46,142 @@ using Dune::Timer; template openclSolverBackend::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder) : BdaSolver(verbosity_, maxit_, tolerance_, platformID_, deviceID_) { prec = new Preconditioner(opencl_ilu_reorder, verbosity_); - wcontainer = new WContainer(); + + cl_int err = CL_SUCCESS; + std::ostringstream out; + try { + std::vector platforms; + cl::Platform::get(&platforms); + if (platforms.size() == 0) { + OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL platforms are found"); + } + out << "Found " << platforms.size() << " OpenCL platforms" << "\n"; + + if (verbosity >= 1) { + std::string platform_info; + for (unsigned int i = 0; i < platforms.size(); ++i) { + platforms[i].getInfo(CL_PLATFORM_NAME, &platform_info); + out << "Platform name : " << platform_info << "\n"; + platforms[i].getInfo(CL_PLATFORM_VENDOR, &platform_info); + out << "Platform vendor : " << platform_info << "\n"; + platforms[i].getInfo(CL_PLATFORM_VERSION, &platform_info); + out << "Platform version : " << platform_info << "\n"; + platforms[i].getInfo(CL_PLATFORM_PROFILE, &platform_info); + out << "Platform profile : " << platform_info << "\n"; + platforms[i].getInfo(CL_PLATFORM_EXTENSIONS, &platform_info); + out << "Platform extensions: " << platform_info << "\n\n"; + } + } + OpmLog::info(out.str()); + out.str(""); + out.clear(); + + if (platforms.size() <= platformID) { + OPM_THROW(std::logic_error, "Error chosen too high OpenCL platform ID"); + } else { + std::string platform_info; + out << "Chosen:\n"; + platforms[platformID].getInfo(CL_PLATFORM_NAME, &platform_info); + out << "Platform name : " << platform_info << "\n"; + platforms[platformID].getInfo(CL_PLATFORM_VERSION, &platform_info); + out << "Platform version : " << platform_info << "\n"; + OpmLog::info(out.str()); + out.str(""); + out.clear(); + } + + cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[platformID])(), 0}; + context.reset(new cl::Context(CL_DEVICE_TYPE_GPU, properties)); + + devices = context->getInfo(); + if (devices.size() == 0){ + OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL devices are found"); + } + out << "Found " << devices.size() << " OpenCL devices" << "\n"; + + if (verbosity >= 1) { + for (unsigned int i = 0; i < devices.size(); ++i) { + std::string device_info; + std::vector work_sizes; + std::vector partitions; + + devices[i].getInfo(CL_DEVICE_NAME, &device_info); + out << "CL_DEVICE_NAME : " << device_info << "\n"; + devices[i].getInfo(CL_DEVICE_VENDOR, &device_info); + out << "CL_DEVICE_VENDOR : " << device_info << "\n"; + devices[i].getInfo(CL_DRIVER_VERSION, &device_info); + out << "CL_DRIVER_VERSION : " << device_info << "\n"; + devices[i].getInfo(CL_DEVICE_BUILT_IN_KERNELS, &device_info); + out << "CL_DEVICE_BUILT_IN_KERNELS: " << device_info << "\n"; + devices[i].getInfo(CL_DEVICE_PROFILE, &device_info); + out << "CL_DEVICE_PROFILE : " << device_info << "\n"; + devices[i].getInfo(CL_DEVICE_OPENCL_C_VERSION, &device_info); + out << "CL_DEVICE_OPENCL_C_VERSION: " << device_info << "\n"; + devices[i].getInfo(CL_DEVICE_EXTENSIONS, &device_info); + out << "CL_DEVICE_EXTENSIONS : " << device_info << "\n"; + + devices[i].getInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES, &work_sizes); + for (unsigned int j = 0; j < work_sizes.size(); ++j) { + out << "CL_DEVICE_MAX_WORK_ITEM_SIZES[" << j << "]: " << work_sizes[j] << "\n"; + } + devices[i].getInfo(CL_DEVICE_PARTITION_PROPERTIES, &partitions); + for (unsigned int j = 0; j < partitions.size(); ++j) { + out << "CL_DEVICE_PARTITION_PROPERTIES[" << j << "]: " << partitions[j] << "\n"; + } + partitions.clear(); + devices[i].getInfo(CL_DEVICE_PARTITION_TYPE, &partitions); + for (unsigned int j = 0; j < partitions.size(); ++j) { + out << "CL_DEVICE_PARTITION_PROPERTIES[" << j << "]: " << partitions[j] << "\n"; + } + + // C-style properties + cl_device_id tmp_id = devices[i](); + cl_ulong size; + clGetDeviceInfo(tmp_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &size, 0); + out << "CL_DEVICE_LOCAL_MEM_SIZE : " << size / 1024 << " KB\n"; + clGetDeviceInfo(tmp_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &size, 0); + out << "CL_DEVICE_GLOBAL_MEM_SIZE : " << size / 1024 / 1024 / 1024 << " GB\n"; + clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_ulong), &size, 0); + out << "CL_DEVICE_MAX_COMPUTE_UNITS : " << size << "\n"; + clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &size, 0); + out << "CL_DEVICE_MAX_MEM_ALLOC_SIZE : " << size / 1024 / 1024 << " MB\n"; + clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(cl_ulong), &size, 0); + out << "CL_DEVICE_MAX_WORK_GROUP_SIZE : " << size << "\n"; + clGetDeviceInfo(tmp_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &size, 0); + out << "CL_DEVICE_GLOBAL_MEM_SIZE : " << size / 1024 / 1024 / 1024 << " GB\n\n"; + } + } + OpmLog::info(out.str()); + out.str(""); + out.clear(); + + if (devices.size() <= deviceID){ + OPM_THROW(std::logic_error, "Error chosen too high OpenCL device ID"); + } else { + std::string device_info; + out << "Chosen:\n"; + devices[deviceID].getInfo(CL_DEVICE_NAME, &device_info); + out << "CL_DEVICE_NAME : " << device_info << "\n"; + devices[deviceID].getInfo(CL_DEVICE_VERSION, &device_info); + out << "CL_DEVICE_VERSION : " << device_info << "\n"; + OpmLog::info(out.str()); + out.str(""); + out.clear(); + } + + cl::Event event; + queue.reset(new cl::CommandQueue(*context, devices[deviceID], 0, &err)); + + } catch (const cl::Error& error) { + std::ostringstream oss; + oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n"; + oss << getErrorString(error.err()); + // rethrow exception + OPM_THROW(std::logic_error, oss.str()); + } catch (const std::logic_error& error) { + // rethrow exception by OPM_THROW in the try{}, without this, a segfault occurs + throw error; + } } @@ -177,11 +312,15 @@ void openclSolverBackend::spmv_blocked_w(cl::Buffer vals, cl::Buffer } template -void openclSolverBackend::gpu_pbicgstab(BdaResult& res) { +void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) { float it; double rho, rhop, beta, alpha, omega, tmp1, tmp2; double norm, norm_0; + if(wellContribs.getNumWells() > 0){ + wellContribs.setKernel(stdwell_apply_k.get()); + } + Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false); // set r to the initial residual @@ -236,7 +375,9 @@ void openclSolverBackend::gpu_pbicgstab(BdaResult& res) { // apply wellContributions t_well.start(); - wcontainer->apply(d_pw, d_v); + if(wellContribs.getNumWells() > 0){ + wellContribs.apply(d_pw, d_v, d_toOrder); + } t_well.stop(); t_rest.start(); @@ -265,7 +406,9 @@ void openclSolverBackend::gpu_pbicgstab(BdaResult& res) { // apply wellContributions t_well.start(); - wcontainer->apply(d_s, d_t); + if(wellContribs.getNumWells() > 0){ + wellContribs.apply(d_s, d_t, d_toOrder); + } t_well.stop(); t_rest.start(); @@ -313,7 +456,7 @@ void openclSolverBackend::gpu_pbicgstab(BdaResult& res) { template -void openclSolverBackend::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, WellContributions &wellContribs) { +void openclSolverBackend::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) { this->N = N_; this->nnz = nnz_; this->nnzb = nnz_ / block_size / block_size; @@ -327,127 +470,7 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, doub out.str(""); out.clear(); - cl_int err = CL_SUCCESS; try { - std::vector platforms; - cl::Platform::get(&platforms); - if (platforms.size() == 0) { - OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL platforms are found"); - } - out << "Found " << platforms.size() << " OpenCL platforms" << "\n"; - - if (verbosity >= 1) { - std::string platform_info; - for (unsigned int i = 0; i < platforms.size(); ++i) { - platforms[i].getInfo(CL_PLATFORM_NAME, &platform_info); - out << "Platform name : " << platform_info << "\n"; - platforms[i].getInfo(CL_PLATFORM_VENDOR, &platform_info); - out << "Platform vendor : " << platform_info << "\n"; - platforms[i].getInfo(CL_PLATFORM_VERSION, &platform_info); - out << "Platform version : " << platform_info << "\n"; - platforms[i].getInfo(CL_PLATFORM_PROFILE, &platform_info); - out << "Platform profile : " << platform_info << "\n"; - platforms[i].getInfo(CL_PLATFORM_EXTENSIONS, &platform_info); - out << "Platform extensions: " << platform_info << "\n\n"; - } - } - OpmLog::info(out.str()); - out.str(""); - out.clear(); - - if (platforms.size() <= platformID) { - OPM_THROW(std::logic_error, "Error chosen too high OpenCL platform ID"); - } else { - std::string platform_info; - out << "Chosen:\n"; - platforms[platformID].getInfo(CL_PLATFORM_NAME, &platform_info); - out << "Platform name : " << platform_info << "\n"; - platforms[platformID].getInfo(CL_PLATFORM_VERSION, &platform_info); - out << "Platform version : " << platform_info << "\n"; - OpmLog::info(out.str()); - out.str(""); - out.clear(); - } - - cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[platformID])(), 0}; - context.reset(new cl::Context(CL_DEVICE_TYPE_GPU, properties)); - - std::vector devices = context->getInfo(); - if (devices.size() == 0){ - OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL devices are found"); - } - out << "Found " << devices.size() << " OpenCL devices" << "\n"; - - if (verbosity >= 1) { - for (unsigned int i = 0; i < devices.size(); ++i) { - std::string device_info; - std::vector work_sizes; - std::vector partitions; - - devices[i].getInfo(CL_DEVICE_NAME, &device_info); - out << "CL_DEVICE_NAME : " << device_info << "\n"; - devices[i].getInfo(CL_DEVICE_VENDOR, &device_info); - out << "CL_DEVICE_VENDOR : " << device_info << "\n"; - devices[i].getInfo(CL_DRIVER_VERSION, &device_info); - out << "CL_DRIVER_VERSION : " << device_info << "\n"; - devices[i].getInfo(CL_DEVICE_BUILT_IN_KERNELS, &device_info); - out << "CL_DEVICE_BUILT_IN_KERNELS: " << device_info << "\n"; - devices[i].getInfo(CL_DEVICE_PROFILE, &device_info); - out << "CL_DEVICE_PROFILE : " << device_info << "\n"; - devices[i].getInfo(CL_DEVICE_OPENCL_C_VERSION, &device_info); - out << "CL_DEVICE_OPENCL_C_VERSION: " << device_info << "\n"; - devices[i].getInfo(CL_DEVICE_EXTENSIONS, &device_info); - out << "CL_DEVICE_EXTENSIONS : " << device_info << "\n"; - - devices[i].getInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES, &work_sizes); - for (unsigned int j = 0; j < work_sizes.size(); ++j) { - out << "CL_DEVICE_MAX_WORK_ITEM_SIZES[" << j << "]: " << work_sizes[j] << "\n"; - } - devices[i].getInfo(CL_DEVICE_PARTITION_PROPERTIES, &partitions); - for (unsigned int j = 0; j < partitions.size(); ++j) { - out << "CL_DEVICE_PARTITION_PROPERTIES[" << j << "]: " << partitions[j] << "\n"; - } - partitions.clear(); - devices[i].getInfo(CL_DEVICE_PARTITION_TYPE, &partitions); - for (unsigned int j = 0; j < partitions.size(); ++j) { - out << "CL_DEVICE_PARTITION_PROPERTIES[" << j << "]: " << partitions[j] << "\n"; - } - - // C-style properties - cl_device_id tmp_id = devices[i](); - cl_ulong size; - clGetDeviceInfo(tmp_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &size, 0); - out << "CL_DEVICE_LOCAL_MEM_SIZE : " << size / 1024 << " KB\n"; - clGetDeviceInfo(tmp_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &size, 0); - out << "CL_DEVICE_GLOBAL_MEM_SIZE : " << size / 1024 / 1024 / 1024 << " GB\n"; - clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_ulong), &size, 0); - out << "CL_DEVICE_MAX_COMPUTE_UNITS : " << size << "\n"; - clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &size, 0); - out << "CL_DEVICE_MAX_MEM_ALLOC_SIZE : " << size / 1024 / 1024 << " MB\n"; - clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(cl_ulong), &size, 0); - out << "CL_DEVICE_MAX_WORK_GROUP_SIZE : " << size << "\n"; - clGetDeviceInfo(tmp_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &size, 0); - out << "CL_DEVICE_GLOBAL_MEM_SIZE : " << size / 1024 / 1024 / 1024 << " GB\n\n"; - } - } - OpmLog::info(out.str()); - out.str(""); - out.clear(); - - if (devices.size() <= deviceID){ - OPM_THROW(std::logic_error, "Error chosen too high OpenCL device ID"); - } else { - std::string device_info; - out << "Chosen:\n"; - devices[deviceID].getInfo(CL_DEVICE_NAME, &device_info); - out << "CL_DEVICE_NAME : " << device_info << "\n"; - devices[deviceID].getInfo(CL_DEVICE_VERSION, &device_info); - out << "CL_DEVICE_VERSION : " << device_info << "\n"; - OpmLog::info(out.str()); - out.str(""); - out.clear(); - } - cl::Program::Sources source(1, std::make_pair(axpy_s, strlen(axpy_s))); // what does this '1' mean? cl::Program::Sources is of type 'std::vector >' source.emplace_back(std::make_pair(dot_1_s, strlen(dot_1_s))); source.emplace_back(std::make_pair(norm_s, strlen(norm_s))); @@ -457,16 +480,10 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, doub source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s))); source.emplace_back(std::make_pair(stdwell_apply_s, strlen(stdwell_apply_s))); program = cl::Program(*context, source); - program.build(devices); - cl::Event event; - queue.reset(new cl::CommandQueue(*context, devices[deviceID], 0, &err)); - prec->setOpenCLContext(context.get()); prec->setOpenCLQueue(queue.get()); - wcontainer->setOpenCLContext(context.get()); - wcontainer->setOpenCLQueue(queue.get()); rb = new double[N]; tmp = new double[N]; @@ -491,7 +508,7 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, doub d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb); d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1)); - wcontainer->init(wellContribs, N, Nb); + d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb); // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking @@ -510,7 +527,6 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, doub cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply"))); prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get()); - wcontainer->setKernel(stdwell_apply_k.get()); } catch (const cl::Error& error) { std::ostringstream oss; @@ -534,12 +550,10 @@ void openclSolverBackend::finalize() { delete[] vals_contiguous; #endif delete prec; - delete wcontainer; } // end finalize() - template -void openclSolverBackend::copy_system_to_gpu(WellContributions &wellContribs) { +void openclSolverBackend::copy_system_to_gpu() { Timer t; cl::Event event; @@ -558,11 +572,10 @@ void openclSolverBackend::copy_system_to_gpu(WellContributions &well queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices); queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers); queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb); + queue->enqueueWriteBuffer(d_toOrder, CL_TRUE, 0, sizeof(int) * Nb, toOrder); queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event); event.wait(); - wcontainer->copy_to_gpu(wellContribs, toOrder); - if (verbosity > 2) { std::ostringstream out; out << "openclSolver::copy_system_to_gpu(): " << t.stop() << " s"; @@ -572,7 +585,7 @@ void openclSolverBackend::copy_system_to_gpu(WellContributions &well // don't copy rowpointers and colindices, they stay the same template -void openclSolverBackend::update_system_on_gpu(WellContributions &wellContribs) { +void openclSolverBackend::update_system_on_gpu() { Timer t; cl::Event event; @@ -592,8 +605,6 @@ void openclSolverBackend::update_system_on_gpu(WellContributions &we queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event); event.wait(); - wcontainer->update_on_gpu(wellContribs); - if (verbosity > 2) { std::ostringstream out; out << "openclSolver::update_system_on_gpu(): " << t.stop() << " s"; @@ -660,12 +671,12 @@ bool openclSolverBackend::create_preconditioner() { template -void openclSolverBackend::solve_system(BdaResult &res) { +void openclSolverBackend::solve_system(WellContributions &wellContribs, BdaResult &res) { Timer t; // actually solve try { - gpu_pbicgstab(res); + gpu_pbicgstab(wellContribs, res); } catch (const cl::Error& error) { std::ostringstream oss; oss << "openclSolverBackend::solve_system error: " << error.what() << "(" << error.err() << ")\n"; @@ -706,7 +717,7 @@ void openclSolverBackend::get_result(double *x) { template SolverStatus openclSolverBackend::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) { if (initialized == false) { - initialize(N_, nnz_, dim, vals, rows, cols, wellContribs); + initialize(N_, nnz_, dim, vals, rows, cols); if (analysis_done == false) { if (!analyse_matrix()) { return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED; @@ -716,15 +727,15 @@ SolverStatus openclSolverBackend::solve_system(int N_, int nnz_, int if (!create_preconditioner()) { return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; } - copy_system_to_gpu(wellContribs); + copy_system_to_gpu(); } else { update_system(vals, b); if (!create_preconditioner()) { return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; } - update_system_on_gpu(wellContribs); + update_system_on_gpu(); } - solve_system(res); + solve_system(wellContribs, res); return SolverStatus::BDA_SOLVER_SUCCESS; } diff --git a/opm/simulators/linalg/bda/openclSolverBackend.hpp b/opm/simulators/linalg/bda/openclSolverBackend.hpp index 6981c2801..1c36fefcb 100644 --- a/opm/simulators/linalg/bda/openclSolverBackend.hpp +++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp @@ -25,9 +25,10 @@ #include #include #include -#include #include +#include + namespace bda { @@ -37,7 +38,6 @@ class openclSolverBackend : public BdaSolver { typedef BdaSolver Base; typedef BILU0 Preconditioner; - typedef WellContributionsOCLContainer WContainer; using Base::N; using Base::Nb; @@ -54,19 +54,17 @@ private: double *rb = nullptr; // reordered b vector, the matrix is reordered, so b must also be double *vals_contiguous = nullptr; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp - bool analysis_done = false; - // OpenCL variables must be reusable, they are initialized in initialize() cl::Buffer d_Avals, d_Acols, d_Arows; // (reordered) matrix in BSR format on GPU cl::Buffer d_x, d_b, d_rb, d_r, d_rw, d_p; // vectors, used during linear solve cl::Buffer d_pw, d_s, d_t, d_v; // vectors, used during linear solve cl::Buffer d_tmp; // used as tmp GPU buffer for dot() and norm() + cl::Buffer d_toOrder; double *tmp = nullptr; // used as tmp CPU buffer for dot() and norm() // shared pointers are also passed to other objects + std::vector devices; cl::Program program; - std::shared_ptr context; - std::shared_ptr queue; std::unique_ptr > dot_k; std::unique_ptr > norm_k; std::unique_ptr > axpy_k; @@ -80,8 +78,8 @@ private: cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > stdwell_apply_k; Preconditioner *prec = nullptr; // only supported preconditioner is BILU0 - WContainer *wcontainer = nullptr; int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings + bool analysis_done = false; std::unique_ptr > mat = nullptr; // original matrix BlockedMatrix *rmat = nullptr; // reordered matrix, used for spmv @@ -133,7 +131,7 @@ private: /// Solve linear system using ilu0-bicgstab /// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A /// \param[inout] res summary of solver result - void gpu_pbicgstab(BdaResult& res); + void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res); /// Initialize GPU and allocate memory /// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks @@ -142,13 +140,13 @@ private: /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values /// \param[in] rows array of rowPointers, contains N/dim+1 values /// \param[in] cols array of columnIndices, contains nnz values - void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols, WellContributions &wellContribs); + void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols); /// Clean memory void finalize(); /// Copy linear system to GPU - void copy_system_to_gpu(WellContributions &wellContribs); + void copy_system_to_gpu(); /// Reorder the linear system so it corresponds with the coloring /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values @@ -156,7 +154,7 @@ private: void update_system(double *vals, double *b); /// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same - void update_system_on_gpu(WellContributions &wellContribs); + void update_system_on_gpu(); /// Analyse sparsity pattern to extract parallelism /// \return true iff analysis was successful @@ -169,9 +167,11 @@ private: /// Solve linear system /// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A /// \param[inout] res summary of solver result - void solve_system(BdaResult &res); + void solve_system(WellContributions &wellContribs, BdaResult &res); public: + std::shared_ptr context; + std::shared_ptr queue; /// Construct a openclSolver /// \param[in] linear_solver_verbosity verbosity of openclSolver diff --git a/opm/simulators/wells/BlackoilWellModel_impl.hpp b/opm/simulators/wells/BlackoilWellModel_impl.hpp index ab0a94f20..089895721 100644 --- a/opm/simulators/wells/BlackoilWellModel_impl.hpp +++ b/opm/simulators/wells/BlackoilWellModel_impl.hpp @@ -950,7 +950,6 @@ namespace Opm { // prepare for StandardWells wellContribs.setBlockSize(StandardWell::numEq, StandardWell::numStaticWellEq); -#if HAVE_CUDA for(unsigned int i = 0; i < well_container_.size(); i++){ auto& well = well_container_[i]; std::shared_ptr > derived = std::dynamic_pointer_cast >(well); @@ -963,7 +962,6 @@ namespace Opm { // allocate memory for data from StandardWells wellContribs.alloc(); -#endif for(unsigned int i = 0; i < well_container_.size(); i++){ auto& well = well_container_[i]; From 09e38a38e944a7f807e853bb0a67bff061d611a7 Mon Sep 17 00:00:00 2001 From: Jose Eduardo Bueno Date: Thu, 19 Nov 2020 15:41:59 -0300 Subject: [PATCH 2/2] Fixed CMakeLists_files.cmake --- CMakeLists_files.cmake | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake index 852bbe082..f9312ede4 100644 --- a/CMakeLists_files.cmake +++ b/CMakeLists_files.cmake @@ -171,7 +171,6 @@ list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/bda/openclSolverBackend.hpp opm/simulators/linalg/bda/MultisegmentWellContribution.hpp opm/simulators/linalg/bda/WellContributions.hpp - opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp opm/simulators/linalg/amgcpr.hh opm/simulators/linalg/twolevelmethodcpr.hh opm/simulators/linalg/ExtractParallelGridInformationToISTL.hpp