diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake index 5515fc67c..790119b06 100644 --- a/CMakeLists_files.cmake +++ b/CMakeLists_files.cmake @@ -100,6 +100,7 @@ if(OPENCL_FOUND) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BILU0.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Reorder.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/ChowPatelIlu.cpp) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/CPR.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclKernels.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp) @@ -107,11 +108,11 @@ if(OPENCL_FOUND) endif() if(CUDA_FOUND OR OPENCL_FOUND OR HAVE_FPGA OR HAVE_AMGCL) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Matrix.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp) endif() if(HAVE_FPGA) - list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGAMatrix.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGABILU0.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGASolverBackend.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGAUtils.cpp) @@ -247,10 +248,10 @@ list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/bda/BdaSolver.hpp opm/simulators/linalg/bda/BILU0.hpp opm/simulators/linalg/bda/BlockedMatrix.hpp + opm/simulators/linalg/bda/CPR.hpp opm/simulators/linalg/bda/cuda_header.hpp opm/simulators/linalg/bda/cusparseSolverBackend.hpp opm/simulators/linalg/bda/ChowPatelIlu.hpp - opm/simulators/linalg/bda/FPGAMatrix.hpp opm/simulators/linalg/bda/FPGABILU0.hpp opm/simulators/linalg/bda/FPGASolverBackend.hpp opm/simulators/linalg/bda/FPGAUtils.hpp @@ -260,6 +261,7 @@ list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/bda/openclKernels.hpp opm/simulators/linalg/bda/openclSolverBackend.hpp opm/simulators/linalg/bda/openclWellContributions.hpp + opm/simulators/linalg/bda/Matrix.hpp opm/simulators/linalg/bda/MultisegmentWellContribution.hpp opm/simulators/linalg/bda/WellContributions.hpp opm/simulators/linalg/amgcpr.hh diff --git a/opm/simulators/linalg/ISTLSolverEbos.hpp b/opm/simulators/linalg/ISTLSolverEbos.hpp index a5020326b..a77737b87 100644 --- a/opm/simulators/linalg/ISTLSolverEbos.hpp +++ b/opm/simulators/linalg/ISTLSolverEbos.hpp @@ -146,7 +146,8 @@ namespace Opm const std::string opencl_ilu_reorder = EWOMS_GET_PARAM(TypeTag, std::string, OpenclIluReorder); const int linear_solver_verbosity = parameters_.linear_solver_verbosity_; std::string fpga_bitstream = EWOMS_GET_PARAM(TypeTag, std::string, FpgaBitstream); - bdaBridge.reset(new BdaBridge(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder)); + std::string linsolver = EWOMS_GET_PARAM(TypeTag, std::string, Linsolver); + bdaBridge.reset(new BdaBridge(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder, linsolver)); } #else if (EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode) != "none") { diff --git a/opm/simulators/linalg/PreconditionerFactory.hpp b/opm/simulators/linalg/PreconditionerFactory.hpp index 9b2c35a91..d90f180dc 100644 --- a/opm/simulators/linalg/PreconditionerFactory.hpp +++ b/opm/simulators/linalg/PreconditionerFactory.hpp @@ -122,7 +122,6 @@ public: instance().doAddCreator(type, creator); } -private: using CriterionBase = Dune::Amg::AggregationCriterion>; using Criterion = Dune::Amg::CoarsenCriterion; @@ -150,6 +149,7 @@ private: criterion.setMinAggregateSize(prm.get("minaggsize", 4)); return criterion; } +private: /// Helper struct to explicitly overload amgSmootherArgs() version for /// ParallelOverlappingILU0, since in-class specialization is not allowed. diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp index 7b099b0ca..1da2611ed 100644 --- a/opm/simulators/linalg/bda/BILU0.cpp +++ b/opm/simulators/linalg/bda/BILU0.cpp @@ -205,7 +205,7 @@ BILU0::~BILU0() Timer t_copyToGpu; events.resize(1); - queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]); + err = queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]); std::call_once(pattern_uploaded, [&](){ // find the positions of each diagonal block @@ -219,9 +219,9 @@ BILU0::~BILU0() diagIndex[row] = candidate - LUmat->colIndices; } events.resize(4); - queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]); - queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]); - queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]); + err |= queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]); + err |= queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]); + err |= queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]); }); cl::WaitForEvents(events); diff --git a/opm/simulators/linalg/bda/BdaBridge.cpp b/opm/simulators/linalg/bda/BdaBridge.cpp index bac50c505..c6f46c0a3 100644 --- a/opm/simulators/linalg/bda/BdaBridge.cpp +++ b/opm/simulators/linalg/bda/BdaBridge.cpp @@ -63,7 +63,8 @@ BdaBridge::BdaBridge(std::string acceler double tolerance, [[maybe_unused]] unsigned int platformID, unsigned int deviceID, - [[maybe_unused]] std::string opencl_ilu_reorder) + [[maybe_unused]] std::string opencl_ilu_reorder, + [[maybe_unused]] std::string linsolver) : verbosity(linear_solver_verbosity), accelerator_mode(accelerator_mode_) { if (accelerator_mode.compare("cusparse") == 0) { @@ -88,7 +89,7 @@ BdaBridge::BdaBridge(std::string acceler } else { OPM_THROW(std::logic_error, "Error invalid argument for --opencl-ilu-reorder, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring]'"); } - backend.reset(new Opm::Accelerator::openclSolverBackend(linear_solver_verbosity, maxit, tolerance, platformID, deviceID, ilu_reorder)); + backend.reset(new Opm::Accelerator::openclSolverBackend(linear_solver_verbosity, maxit, tolerance, platformID, deviceID, ilu_reorder, linsolver)); #else OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake"); #endif @@ -168,27 +169,28 @@ int checkZeroDiagonal(BridgeMatrix& mat) { // iterate sparsity pattern from Matrix and put colIndices and rowPointers in arrays // sparsity pattern should stay the same // this could be removed if Dune::BCRSMatrix features an API call that returns colIndices and rowPointers -template -void getSparsityPattern(BridgeMatrix& mat, std::vector &h_rows, std::vector &h_cols) { +template +void BdaBridge::getSparsityPattern(const BridgeMatrix& mat, std::vector &h_rows, std::vector &h_cols) { int sum_nnzs = 0; - // convert colIndices and rowPointers - if (h_rows.empty()) { - h_rows.emplace_back(0); - for (typename BridgeMatrix::const_iterator r = mat.begin(); r != mat.end(); ++r) { - int size_row = 0; - for (auto c = r->begin(); c != r->end(); ++c) { - h_cols.emplace_back(c.index()); - size_row++; - } - sum_nnzs += size_row; - h_rows.emplace_back(sum_nnzs); - } + h_rows.clear(); + h_cols.clear(); - // h_rows and h_cols could be changed to 'unsigned int', but cusparse expects 'int' - if (static_cast(h_rows[mat.N()]) != mat.nonzeroes()) { - OPM_THROW(std::logic_error, "Error size of rows do not sum to number of nonzeroes in BdaBridge::getSparsityPattern()"); + // convert colIndices and rowPointers + h_rows.emplace_back(0); + for (typename BridgeMatrix::const_iterator r = mat.begin(); r != mat.end(); ++r) { + int size_row = 0; + for (auto c = r->begin(); c != r->end(); ++c) { + h_cols.emplace_back(c.index()); + size_row++; } + sum_nnzs += size_row; + h_rows.emplace_back(sum_nnzs); + } + + // h_rows and h_cols could be changed to 'unsigned int', but cusparse expects 'int' + if (static_cast(h_rows[mat.N()]) != mat.nonzeroes()) { + OPM_THROW(std::logic_error, "Error size of rows do not sum to number of nonzeroes in BdaBridge::getSparsityPattern()"); } } // end getSparsityPattern() diff --git a/opm/simulators/linalg/bda/BdaBridge.hpp b/opm/simulators/linalg/bda/BdaBridge.hpp index f4b918ce9..4c8a573c4 100644 --- a/opm/simulators/linalg/bda/BdaBridge.hpp +++ b/opm/simulators/linalg/bda/BdaBridge.hpp @@ -54,7 +54,9 @@ public: /// \param[in] platformID the OpenCL platform ID to be used /// \param[in] deviceID the device ID to be used by the cusparse- and openclSolvers, too high values could cause runtime errors /// \param[in] opencl_ilu_reorder select either level_scheduling or graph_coloring, see ILUReorder.hpp for explanation - BdaBridge(std::string accelerator_mode, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder); + /// \param[in] linsolver copy of cmdline argument --linsolver + BdaBridge(std::string accelerator_mode, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, + unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder, std::string linsolver); /// Solve linear system, A*x = b @@ -75,6 +77,12 @@ public: return use_gpu; } + /// Store sparsity pattern into vectors + /// \param[in] mat input matrix, probably BCRSMatrix + /// \param[out] h_rows rowpointers + /// \param[out] h_cols columnindices + static void getSparsityPattern(const BridgeMatrix& mat, std::vector& h_rows, std::vector& h_cols); + /// Initialize the WellContributions object with opencl context and queue /// those must be set before calling BlackOilWellModel::getWellContributions() in ISTL /// \param[in] wellContribs container to hold all WellContributions diff --git a/opm/simulators/linalg/bda/BdaSolver.hpp b/opm/simulators/linalg/bda/BdaSolver.hpp index c79ba3271..d84652d28 100644 --- a/opm/simulators/linalg/bda/BdaSolver.hpp +++ b/opm/simulators/linalg/bda/BdaSolver.hpp @@ -76,6 +76,7 @@ namespace Accelerator { /// \param[in] tolerance required relative tolerance for solver /// \param[in] platformID the OpenCL platform to be used, only used in openclSolver /// \param[in] deviceID the device to be used + BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_) {}; BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), deviceID(deviceID_) {}; BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), platformID(platformID_), deviceID(deviceID_) {}; BdaSolver(std::string fpga_bitstream, int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), bitstream(fpga_bitstream) {}; diff --git a/opm/simulators/linalg/bda/BlockedMatrix.cpp b/opm/simulators/linalg/bda/BlockedMatrix.cpp index c78466140..cf6d9115d 100644 --- a/opm/simulators/linalg/bda/BlockedMatrix.cpp +++ b/opm/simulators/linalg/bda/BlockedMatrix.cpp @@ -26,6 +26,7 @@ #include #include +#include #include namespace Opm @@ -502,6 +503,8 @@ INSTANTIATE_BDA_FPGA_FUNCTIONS(1); INSTANTIATE_BDA_FPGA_FUNCTIONS(2); INSTANTIATE_BDA_FPGA_FUNCTIONS(3); INSTANTIATE_BDA_FPGA_FUNCTIONS(4); +INSTANTIATE_BDA_FPGA_FUNCTIONS(5); +INSTANTIATE_BDA_FPGA_FUNCTIONS(6); #undef INSTANTIATE_BDA_FPGA_FUNCTIONS #endif // HAVE_FPGA diff --git a/opm/simulators/linalg/bda/BlockedMatrix.hpp b/opm/simulators/linalg/bda/BlockedMatrix.hpp index 6e2062659..3104db045 100644 --- a/opm/simulators/linalg/bda/BlockedMatrix.hpp +++ b/opm/simulators/linalg/bda/BlockedMatrix.hpp @@ -22,9 +22,15 @@ #if HAVE_FPGA #include +namespace Opm +{ +namespace Accelerator +{ + class Matrix; +} +} #endif -#include namespace Opm { diff --git a/opm/simulators/linalg/bda/CPR.cpp b/opm/simulators/linalg/bda/CPR.cpp new file mode 100644 index 000000000..fbdb72cd2 --- /dev/null +++ b/opm/simulators/linalg/bda/CPR.cpp @@ -0,0 +1,519 @@ +/* + Copyright 2021 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#include + +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + + +namespace Opm +{ +namespace Accelerator +{ + +using Opm::OpmLog; +using Dune::Timer; + +template +CPR::CPR(int verbosity_, ILUReorder opencl_ilu_reorder_) : + verbosity(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_) +{} + + +template +void CPR::init(int Nb_, int nnzb_, std::shared_ptr& context_, std::shared_ptr& queue_) +{ + this->Nb = Nb_; + this->nnzb = nnzb_; + this->N = Nb_ * block_size; + this->nnz = nnzb_ * block_size * block_size; + + context = context_; + queue = queue_; + + coarse_vals.resize(nnzb); + coarse_x.resize(Nb); + coarse_y.resize(Nb); + weights.resize(N); + diagIndices.resize(1); +} + + +double get_absmax(const double *data, const int B) { + double abs_max = 0.0; + for (int i = 0; i < B; ++i) { + if (std::fabs(data[i]) > abs_max) { + abs_max = std::fabs(data[i]); + } + } + return abs_max; +} + + +// solve A^T * x = b +void solve_transposed_3x3(const double *A, const double *b, double *x) { + const int B = 3; + // from dune-common/densematrix.hh, but transposed, so replace [r*B+c] with [r+c*B] + double t4 = A[0+0*B] * A[1+1*B]; + double t6 = A[0+0*B] * A[1+2*B]; + double t8 = A[0+1*B] * A[1+0*B]; + double t10 = A[0+2*B] * A[1+0*B]; + double t12 = A[0+1*B] * A[2+0*B]; + double t14 = A[0+2*B] * A[2+0*B]; + + double d = (t4*A[2+2*B]-t6*A[2+1*B]-t8*A[2+2*B]+ + t10*A[2+1*B]+t12*A[1+2*B]-t14*A[1+1*B]); //determinant + + x[0] = (b[0]*A[1+1*B]*A[2+2*B] - b[0]*A[2+1*B]*A[1+2*B] + - b[1] *A[0+1*B]*A[2+2*B] + b[1]*A[2+1*B]*A[0+2*B] + + b[2] *A[0+1*B]*A[1+2*B] - b[2]*A[1+1*B]*A[0+2*B]) / d; + + x[1] = (A[0+0*B]*b[1]*A[2+2*B] - A[0+0*B]*b[2]*A[1+2*B] + - A[1+0*B] *b[0]*A[2+2*B] + A[1+0*B]*b[2]*A[0+2*B] + + A[2+0*B] *b[0]*A[1+2*B] - A[2+0*B]*b[1]*A[0+2*B]) / d; + + x[2] = (A[0+0*B]*A[1+1*B]*b[2] - A[0+0*B]*A[2+1*B]*b[1] + - A[1+0*B] *A[0+1*B]*b[2] + A[1+0*B]*A[2+1*B]*b[0] + + A[2+0*B] *A[0+1*B]*b[1] - A[2+0*B]*A[1+1*B]*b[0]) / d; +} + + +template +void CPR::create_preconditioner(BlockedMatrix *mat_) { + this->mat = mat_; + + try{ + double rhs[] = {0, 0, 0}; + rhs[pressure_idx] = 1; + + // find diagonal index for each row + if (diagIndices[0].empty()) { + diagIndices[0].resize(Nb); + for (int row = 0; row < Nb; ++row) { + int start = mat->rowPointers[row]; + int end = mat->rowPointers[row + 1]; + auto candidate = std::find(mat->colIndices + start, mat->colIndices + end, row); + assert(candidate != mat->colIndices + end); + diagIndices[0][row] = candidate - mat->colIndices; + } + } + + // calculate weights for each row + for (int row = 0; row < Nb; ++row) { + // solve to find weights + double *row_weights = weights.data() + block_size * row; // weights for this row + solve_transposed_3x3(mat->nnzValues + block_size * block_size * diagIndices[0][row], rhs, row_weights); + + // normalize weights for this row + double abs_max = get_absmax(row_weights, block_size); + for(unsigned int i = 0; i < block_size; i++){ + row_weights[i] /= abs_max; + } + } + + // transform blocks to scalars + for (int row = 0; row < Nb; ++row) { + int start = mat->rowPointers[row]; + int end = mat->rowPointers[row + 1]; + for (int idx = start; idx < end; ++idx) { + double *block = mat->nnzValues + idx * block_size * block_size; + double *row_weights = weights.data() + block_size * row; + double value = 0.0; + for (unsigned int i = 0; i < block_size; ++i) { + value += block[block_size * i + pressure_idx] * row_weights[i]; + } + coarse_vals[idx] = value; + } + } + + using Communication = Dune::OwnerOverlapCopyCommunication; + using OverlapFlags = Dune::NegateSet; + if (recalculate_aggregates) { + dune_coarse = std::make_unique(Nb, Nb, nnzb, DuneMat::row_wise); + + typedef DuneMat::CreateIterator Iter; + + // setup sparsity pattern + for(Iter row = dune_coarse->createbegin(); row != dune_coarse->createend(); ++row){ + int start = mat->rowPointers[row.index()]; + int end = mat->rowPointers[row.index() + 1]; + for (int idx = start; idx < end; ++idx) { + int col = mat->colIndices[idx]; + row.insert(col); + } + } + + // set values + for (int row = 0; row < Nb; ++row) { + int start = mat->rowPointers[row]; + int end = mat->rowPointers[row + 1]; + for (int idx = start; idx < end; ++idx) { + int col = mat->colIndices[idx]; + (*dune_coarse)[row][col] = coarse_vals[idx]; + } + } + + dune_op = std::make_shared(*dune_coarse); + Dune::Amg::SequentialInformation seqinfo; + dune_amg = std::make_unique(dune_op, Dune::stackobject_to_shared_ptr(seqinfo)); + + Opm::PropertyTree property_tree; + property_tree.put("alpha", 0.333333333333); + + // The matrix has a symmetric sparsity pattern, but the values are not symmetric + // Yet a SymmetricDependency is used in AMGCPR + // An UnSymmetricCriterion is also available + // using Criterion = Dune::Amg::CoarsenCriterion >; + using CriterionBase = Dune::Amg::AggregationCriterion>; + using Criterion = Dune::Amg::CoarsenCriterion; + const Criterion c = Opm::PreconditionerFactory::amgCriterion(property_tree); + + dune_amg->build(c); + + analyzeHierarchy(); + analyzeAggregateMaps(); + + recalculate_aggregates = false; + } else { + // update values of coarsest level in AMG + // this works because that level is actually a reference to the DuneMat held by dune_coarse + for (int row = 0; row < Nb; ++row) { + int start = mat->rowPointers[row]; + int end = mat->rowPointers[row + 1]; + for (int idx = start; idx < end; ++idx) { + int col = mat->colIndices[idx]; + (*dune_coarse)[row][col] = coarse_vals[idx]; + } + } + + // update the rest of the AMG hierarchy + dune_amg->recalculateGalerkin(OverlapFlags()); + analyzeHierarchy(); + } + + // initialize OpenclMatrices and Buffers if needed + std::call_once(opencl_buffers_allocated, [&](){ + d_Amatrices.reserve(num_levels); + d_Pmatrices.reserve(num_levels - 1); + d_Rmatrices.reserve(num_levels - 1); + d_invDiags.reserve(num_levels-1); + for (Matrix& m : Amatrices) { + d_Amatrices.emplace_back(context.get(), m.N, m.M, m.nnzs); + } + for (Matrix& m : Pmatrices) { + d_Pmatrices.emplace_back(context.get(), m.N, m.M, m.nnzs); + d_invDiags.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.N); // create a cl::Buffer + d_t.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.N); + } + for (Matrix& m : Rmatrices) { + d_Rmatrices.emplace_back(context.get(), m.N, m.M, m.nnzs); + d_f.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.N); + d_u.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.N); + } + d_weights = std::make_unique(*context, CL_MEM_READ_WRITE, sizeof(double) * N); + d_rs = std::make_unique(*context, CL_MEM_READ_WRITE, sizeof(double) * N); + d_mat = std::make_unique >(context.get(), Nb, Nb, nnzb); + d_coarse_y = std::make_unique(*context, CL_MEM_READ_WRITE, sizeof(double) * Nb); + d_coarse_x = std::make_unique(*context, CL_MEM_READ_WRITE, sizeof(double) * Nb); + }); + + // upload matrices and vectors to GPU + d_mat->upload(queue.get(), mat); + + err = CL_SUCCESS; + events.resize(Pmatrices.size() + 1); + for (unsigned int i = 0; i < Pmatrices.size(); ++i) { + d_Amatrices[i].upload(queue.get(), &Amatrices[i]); + + err |= queue->enqueueWriteBuffer(d_invDiags[i], CL_FALSE, 0, sizeof(double) * Amatrices[i].N, invDiags[i].data(), nullptr, &events[i]); + } + err |= queue->enqueueWriteBuffer(*d_weights, CL_FALSE, 0, sizeof(double) * N, weights.data(), nullptr, &events[Pmatrices.size()]); + cl::WaitForEvents(events); + events.clear(); + if (err != CL_SUCCESS) { + // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL + OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error"); + } + for (unsigned int i = 0; i < Pmatrices.size(); ++i) { + d_Pmatrices[i].upload(queue.get(), &Pmatrices[i]); + } + for (unsigned int i = 0; i < Rmatrices.size(); ++i) { + d_Rmatrices[i].upload(queue.get(), &Rmatrices[i]); + } + + } catch (const std::exception& ex) { + std::cerr << "Caught exception: " << ex.what() << std::endl; + throw ex; + } +} + + +template +void CPR::analyzeHierarchy() { + const DuneAmg::ParallelMatrixHierarchy& matrixHierarchy = dune_amg->matrices(); + + num_levels = dune_amg->levels(); + level_sizes.resize(num_levels); + diagIndices.resize(num_levels); + + Amatrices.reserve(num_levels); + Pmatrices.reserve(num_levels - 1); // coarsest level does not need one + Rmatrices.reserve(num_levels - 1); + invDiags.reserve(num_levels); + + Amatrices.clear(); + invDiags.clear(); + + // matrixIter.dereference() returns MatrixAdapter + // matrixIter.dereference().getmat() returns BCRSMat + DuneAmg::ParallelMatrixHierarchy::ConstIterator matrixIter = matrixHierarchy.finest(); + for(int level = 0; level < num_levels; ++matrixIter, ++level) { + const auto& A = matrixIter.dereference().getmat(); + level_sizes[level] = A.N(); + diagIndices[level].reserve(A.N()); + + // extract matrix A + Amatrices.emplace_back(A.N(), A.nonzeroes()); + // contiguous copy is not possible + // std::copy(&(A[0][0][0][0]), &(A[0][0][0][0]) + A.nonzeroes(), Amatrices.back().nnzValues.data()); + // also update diagonal indices if needed, level 0 is already filled in create_preconditioner() + int nnz_idx = 0; + const bool fillDiagIndices = diagIndices[level].empty(); + for (typename DuneMat::const_iterator r = A.begin(); r != A.end(); ++r) { + for (auto c = r->begin(); c != r->end(); ++c) { + Amatrices.back().nnzValues[nnz_idx] = A[r.index()][c.index()]; + if (fillDiagIndices && r.index() == c.index()) { + diagIndices[level].emplace_back(nnz_idx); + } + nnz_idx++; + } + } + + Opm::BdaBridge::getSparsityPattern(A, Amatrices.back().rowPointers, Amatrices.back().colIndices); + + // compute inverse diagonal values for current level + invDiags.emplace_back(A.N()); + for (unsigned int row = 0; row < A.N(); ++row) { + invDiags.back()[row] = 1 / Amatrices.back().nnzValues[diagIndices[level][row]]; + } + } +} + + +template +void CPR::analyzeAggregateMaps() { + + Pmatrices.clear(); + Rmatrices.clear(); + + const DuneAmg::AggregatesMapList& aggregatesMaps = dune_amg->aggregatesMaps(); + + DuneAmg::AggregatesMapList::const_iterator mapIter = aggregatesMaps.begin(); + for(int level = 0; level < num_levels - 1; ++mapIter, ++level) { + DuneAmg::AggregatesMap *map = *mapIter; + + // get indices for each row of P and R + std::vector > indicesP(level_sizes[level]); + std::vector > indicesR(level_sizes[level+1]); + using AggregateIterator = DuneAmg::AggregatesMap::const_iterator; + for(AggregateIterator ai = map->begin(); ai != map->end(); ++ai){ + if (*ai != DuneAmg::AggregatesMap::ISOLATED) { + const long int diff = ai - map->begin(); + indicesP[diff].insert(*ai); + indicesR[*ai].insert(diff); + } + } + + Pmatrices.emplace_back(level_sizes[level], level_sizes[level+1], level_sizes[level]); + Rmatrices.emplace_back(level_sizes[level+1], level_sizes[level], level_sizes[level]); + std::fill(Pmatrices.back().nnzValues.begin(), Pmatrices.back().nnzValues.end(), 1.0); + std::fill(Rmatrices.back().nnzValues.begin(), Rmatrices.back().nnzValues.end(), 1.0); + + // set sparsity pattern of P + int col_idx = 0; + Pmatrices.back().rowPointers[0] = 0; + for (unsigned int i = 0; i < indicesP.size(); ++i) { + Pmatrices.back().rowPointers[i + 1] = Pmatrices.back().rowPointers[i] + indicesP[i].size(); + for (auto it = indicesP[i].begin(); it != indicesP[i].end(); ++it) { + Pmatrices.back().colIndices[col_idx] = *it; + col_idx++; + } + } + // set sparsity pattern of R + col_idx = 0; + Rmatrices.back().rowPointers[0] = 0; + for (unsigned int i = 0; i < indicesR.size(); ++i) { + Rmatrices.back().rowPointers[i + 1] = Rmatrices.back().rowPointers[i] + indicesR[i].size(); + for (auto it = indicesR[i].begin(); it != indicesR[i].end(); ++it) { + Rmatrices.back().colIndices[col_idx] = *it; + col_idx++; + } + } + } + +} + + +void solve_coarse_umfpack(const Matrix *A, std::vector &y, std::vector &x) { + const int N = A->N; + const int nnzs = A->nnzs; + + using DuneMat = Dune::BCRSMatrix >; + DuneMat B(N, N, nnzs, DuneMat::row_wise); + + typedef DuneMat::CreateIterator Iter; + + // setup sparsity pattern + for(Iter row = B.createbegin(); row != B.createend(); ++row){ + int start = A->rowPointers[row.index()]; + int end = A->rowPointers[row.index() + 1]; + for (int idx = start; idx < end; ++idx) { + int col = A->colIndices[idx]; + row.insert(col); + } + } + + // set values + for (int row = 0; row < N; ++row) { + int start = A->rowPointers[row]; + int end = A->rowPointers[row + 1]; + for (int idx = start; idx < end; ++idx) { + int col = A->colIndices[idx]; + B[row][col] = A->nnzValues[idx]; + } + } + + // create umfpack object + Dune::UMFPack umfpack(B, 0); + + umfpack.apply(x.data(), y.data()); +} + + +template +void CPR::amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &x) { + OpenclMatrix<1> *A = &d_Amatrices[level]; + OpenclMatrix<1> *P = &d_Pmatrices[level]; + OpenclMatrix<1> *R = &d_Rmatrices[level]; + int Ncur = A->Nb; + + if (level == num_levels - 1) { + // solve coarsest level + std::vector h_y(Ncur), h_x(Ncur, 0); + + events.resize(1); + err = queue->enqueueReadBuffer(y, CL_FALSE, 0, sizeof(double) * Ncur, h_y.data(), nullptr, &events[0]); + cl::WaitForEvents(events); + events.clear(); + if (err != CL_SUCCESS) { + // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL + OPM_THROW(std::logic_error, "CPR OpenCL enqueueReadBuffer error"); + } + + solve_coarse_umfpack(&Amatrices[level], h_y, h_x); + + events.resize(1); + err = queue->enqueueWriteBuffer(x, CL_FALSE, 0, sizeof(double) * Ncur, h_x.data(), nullptr, &events[0]); + cl::WaitForEvents(events); + events.clear(); + if (err != CL_SUCCESS) { + // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL + OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error"); + } + return; + } + int Nnext = d_Amatrices[level+1].Nb; + + cl::Buffer& t = d_t[level]; + cl::Buffer& f = d_f[level]; + cl::Buffer& u = d_u[level]; // u was 0-initialized earlier + + // presmooth + double jacobi_damping = 0.72; // default value in amgcl: 0.72 + OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1); + OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur); + + // move to coarser level + OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1); + OpenclKernels::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true); + amg_cycle_gpu(level + 1, f, u); + OpenclKernels::spmv(P->nnzValues, P->colIndices, P->rowPointers, u, x, Ncur, 1, false); + + // postsmooth + OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1); + OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur); +} + + +// x = prec(y) +template +void CPR::apply(const cl::Buffer& y, cl::Buffer& x) { + // 0-initialize u and x vectors + events.resize(d_u.size() + 1); + err = queue->enqueueFillBuffer(*d_coarse_x, 0, 0, sizeof(double) * Nb, nullptr, &events[0]); + for (unsigned int i = 0; i < d_u.size(); ++i) { + err |= queue->enqueueFillBuffer(d_u[i], 0, 0, sizeof(double) * Rmatrices[i].N, nullptr, &events[i + 1]); + } + cl::WaitForEvents(events); + events.clear(); + if (err != CL_SUCCESS) { + // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL + OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error"); + } + + OpenclKernels::residual(d_mat->nnzValues, d_mat->colIndices, d_mat->rowPointers, x, y, *d_rs, Nb, block_size); + OpenclKernels::move_to_coarse(*d_rs, *d_weights, *d_coarse_y, Nb); + + amg_cycle_gpu(0, *d_coarse_y, *d_coarse_x); + + OpenclKernels::move_to_fine(*d_coarse_x, x, pressure_idx, Nb); +} + + + +#define INSTANTIATE_BDA_FUNCTIONS(n) \ +template CPR::CPR(int, ILUReorder); \ +template void CPR::init(int, int, std::shared_ptr&, std::shared_ptr&); \ +template void CPR::apply(const cl::Buffer&, cl::Buffer&); \ +template void CPR::create_preconditioner(BlockedMatrix *mat); + +INSTANTIATE_BDA_FUNCTIONS(1); +INSTANTIATE_BDA_FUNCTIONS(2); +INSTANTIATE_BDA_FUNCTIONS(3); +INSTANTIATE_BDA_FUNCTIONS(4); +INSTANTIATE_BDA_FUNCTIONS(5); +INSTANTIATE_BDA_FUNCTIONS(6); + +#undef INSTANTIATE_BDA_FUNCTIONS + +} // namespace Accelerator +} // namespace Opm + + diff --git a/opm/simulators/linalg/bda/CPR.hpp b/opm/simulators/linalg/bda/CPR.hpp new file mode 100644 index 000000000..24745ceb7 --- /dev/null +++ b/opm/simulators/linalg/bda/CPR.hpp @@ -0,0 +1,117 @@ +/* + Copyright 2021 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#ifndef CPR_HPP +#define CPR_HPP + +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include + +namespace Opm +{ +namespace Accelerator +{ + +template +class openclSolverBackend; + +/// This class implements a Constrained Pressure Residual (CPR) preconditioner +template +class CPR +{ + +private: + int N; // number of rows of the matrix + int Nb; // number of blockrows of the matrix + int nnz; // number of nonzeroes of the matrix (scalar) + int nnzb; // number of blocks of the matrix + int verbosity; + + int num_levels; + std::vector weights, coarse_vals, coarse_x, coarse_y; + std::vector Amatrices, Pmatrices, Rmatrices; // scalar matrices that represent the AMG hierarchy + std::vector > d_Amatrices, d_Pmatrices, d_Rmatrices; // scalar matrices that represent the AMG hierarchy + std::vector > invDiags; // inverse of diagonal of Amatrices + std::vector d_invDiags; + std::vector d_t, d_f, d_u; // intermediate vectors used during amg cycle + std::unique_ptr d_rs; // use before extracting the pressure + std::unique_ptr d_weights; // the quasiimpes weights, used to extract pressure + std::unique_ptr > d_mat; // stores blocked matrix + std::unique_ptr d_coarse_y, d_coarse_x; // stores the scalar vectors + std::once_flag opencl_buffers_allocated; // only allocate OpenCL Buffers once + + BlockedMatrix *mat = nullptr; // input matrix, blocked + using DuneMat = Dune::BCRSMatrix >; + using DuneVec = Dune::BlockVector >; + using MatrixOperator = Dune::MatrixAdapter; + using DuneAmg = Dune::Amg::MatrixHierarchy; + std::unique_ptr dune_amg; + std::unique_ptr dune_coarse; // extracted pressure matrix, finest level in AMG hierarchy + std::shared_ptr dune_op; // operator, input to Dune AMG + std::vector level_sizes; // size of each level in the AMG hierarchy + std::vector > diagIndices; // index of diagonal value for each level + bool always_recalculate_aggregates = false; // OPM always reuses the aggregates by default + bool recalculate_aggregates = true; // only rerecalculate if true + const int pressure_idx = 1; // hardcoded to mimic OPM + + std::unique_ptr > coarse_solver; // coarse solver is scalar + ILUReorder opencl_ilu_reorder; // reordering strategy for ILU0 in coarse solver + + std::shared_ptr context; + std::shared_ptr queue; + std::vector events; + cl_int err; + + // Analyze the AMG hierarchy build by Dune + void analyzeHierarchy(); + + // Analyze the aggregateMaps from the AMG hierarchy + // These can be reused, so only use when recalculate_aggregates is true + void analyzeAggregateMaps(); + + void amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &x); + +public: + + CPR(int verbosity, ILUReorder opencl_ilu_reorder); + + void init(int Nb, int nnzb, std::shared_ptr& context, std::shared_ptr& queue); + + // apply preconditioner, x = prec(y) + void apply(const cl::Buffer& y, cl::Buffer& x); + + void create_preconditioner(BlockedMatrix *mat); + +}; + +} // namespace Accelerator +} // namespace Opm + +#endif + diff --git a/opm/simulators/linalg/bda/FPGASolverBackend.cpp b/opm/simulators/linalg/bda/FPGASolverBackend.cpp index b03e26f93..82a9577cf 100644 --- a/opm/simulators/linalg/bda/FPGASolverBackend.cpp +++ b/opm/simulators/linalg/bda/FPGASolverBackend.cpp @@ -20,6 +20,7 @@ #include #include +#include #include #include diff --git a/opm/simulators/linalg/bda/FPGAMatrix.cpp b/opm/simulators/linalg/bda/Matrix.cpp similarity index 86% rename from opm/simulators/linalg/bda/FPGAMatrix.cpp rename to opm/simulators/linalg/bda/Matrix.cpp index f8b4193ff..389a026cf 100644 --- a/opm/simulators/linalg/bda/FPGAMatrix.cpp +++ b/opm/simulators/linalg/bda/Matrix.cpp @@ -17,10 +17,13 @@ along with OPM. If not, see . */ +#include + #include #include -#include +#include +#include #include namespace Opm @@ -28,6 +31,32 @@ namespace Opm namespace Accelerator { +template +void OpenclMatrix::upload(cl::CommandQueue *queue, double *vals, int *cols, int *rows) { + std::vector events(3); + + cl_int err = queue->enqueueWriteBuffer(nnzValues, CL_FALSE, 0, sizeof(double) * block_size * block_size * nnzbs, vals, nullptr, &events[0]); + err |= queue->enqueueWriteBuffer(colIndices, CL_FALSE, 0, sizeof(int) * nnzbs, cols, nullptr, &events[1]); + err |= queue->enqueueWriteBuffer(rowPointers, CL_FALSE, 0, sizeof(int) * (Nb + 1), rows, nullptr, &events[2]); + + cl::WaitForEvents(events); + events.clear(); + if (err != CL_SUCCESS) { + // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL + OPM_THROW(std::logic_error, "OpenclMatrix OpenCL enqueueWriteBuffer error"); + } +} + +template +void OpenclMatrix::upload(cl::CommandQueue *queue, Matrix *matrix) { + upload(queue, matrix->nnzValues.data(), matrix->colIndices.data(), matrix->rowPointers.data()); +} + +template +void OpenclMatrix::upload(cl::CommandQueue *queue, BlockedMatrix *matrix) { + upload(queue, matrix->nnzValues, matrix->colIndices, matrix->rowPointers); +} + /*Sort a row of matrix elements from a CSR-format.*/ void sortRow(int *colIndices, double *data, int left, int right) { int l = left; @@ -57,6 +86,7 @@ void sortRow(int *colIndices, double *data, int left, int right) { } +#if HAVE_FPGA /* * Write all data used by the VHDL testbenches to raw data arrays. The arrays are as follows: * - The "colorSizes" array, which first contains the number of rows, columns, non-zero values @@ -247,6 +277,20 @@ int Matrix::toRDF(int numColors, std::vector& nodesPerColor, return 0; } +#endif + +#define INSTANTIATE_BDA_FUNCTIONS(n) \ +template class OpenclMatrix; + + +INSTANTIATE_BDA_FUNCTIONS(1); +INSTANTIATE_BDA_FUNCTIONS(2); +INSTANTIATE_BDA_FUNCTIONS(3); +INSTANTIATE_BDA_FUNCTIONS(4); +INSTANTIATE_BDA_FUNCTIONS(5); +INSTANTIATE_BDA_FUNCTIONS(6); + +#undef INSTANTIATE_BDA_FUNCTIONS } // namespace Accelerator } // namespace Opm diff --git a/opm/simulators/linalg/bda/FPGAMatrix.hpp b/opm/simulators/linalg/bda/Matrix.hpp similarity index 60% rename from opm/simulators/linalg/bda/FPGAMatrix.hpp rename to opm/simulators/linalg/bda/Matrix.hpp index 93d3f0253..bd2f41469 100644 --- a/opm/simulators/linalg/bda/FPGAMatrix.hpp +++ b/opm/simulators/linalg/bda/Matrix.hpp @@ -22,36 +22,77 @@ #include +#include +#include + namespace Opm { namespace Accelerator { +class Matrix; + +/// This struct resembles a csr matrix, only doubles are supported +/// The matrix data is stored in OpenCL Buffers +template +class OpenclMatrix { +public: + + OpenclMatrix(cl::Context *context, int Nb_, int Mb_, int nnzbs_) + : Nb(Nb_), + Mb(Mb_), + nnzbs(nnzbs_) + { + nnzValues = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * block_size * block_size * nnzbs); + colIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzbs); + rowPointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1)); + } + + void upload(cl::CommandQueue *queue, double *vals, int *cols, int *rows); + void upload(cl::CommandQueue *queue, Matrix *matrix); + void upload(cl::CommandQueue *queue, BlockedMatrix *matrix); + + cl::Buffer nnzValues; + cl::Buffer colIndices; + cl::Buffer rowPointers; + int Nb, Mb; + int nnzbs; +}; + + /// This struct resembles a csr matrix, only doubles are supported /// The data is stored in contiguous memory, such that they can be copied to a device in one transfer. class Matrix { public: - /// Allocate Matrix and data arrays with given sizes + /// Allocate square Matrix and data arrays with given sizes /// \param[in] N number of rows /// \param[in] nnzs number of nonzeros Matrix(int N_, int nnzs_) - : nnzValues(new double[nnzs_]), - colIndices(new int[nnzs_]), - rowPointers(new int[N_+1]), - N(N_), + : N(N_), nnzs(nnzs_) - {} - - /// All constructors allocate new memory, so always delete here - ~Matrix(){ - delete[] nnzValues; - delete[] colIndices; - delete[] rowPointers; + { + nnzValues.resize(nnzs); + colIndices.resize(nnzs); + rowPointers.resize(N+1); } + /// Allocate rectangular Matrix and data arrays with given sizes + /// \param[in] N number of rows + /// \param[in] M number of columns + /// \param[in] nnzs number of nonzeros + Matrix(int N_, int M_, int nnzs_) + : N(N_), + M(M_), + nnzs(nnzs_) + { + nnzValues.resize(nnzs); + colIndices.resize(nnzs); + rowPointers.resize(N+1); + } +#if HAVE_FPGA /// Converts this matrix to the dataformat used by the FPGA. /// The FPGA uses a new data format called CSRO (Compressed Sparse Row Offset). /// The purpose of this format is to allow the data to be streamable. @@ -71,11 +112,12 @@ public: int toRDF(int numColors, std::vector& nodesPerColor, std::vector >& colIndicesInColor, int nnzsPerRowLimit, std::vector >& ubNnzValues, short int *ubColIndices, int *nnzValsSizes, unsigned char *NROffsets, int *colorSizes); +#endif - double *nnzValues; - int *colIndices; - int *rowPointers; - int N; + std::vector nnzValues; + std::vector colIndices; + std::vector rowPointers; + int N, M; int nnzs; }; diff --git a/opm/simulators/linalg/bda/openclKernels.cpp b/opm/simulators/linalg/bda/openclKernels.cpp index 73871e1ff..df449fc29 100644 --- a/opm/simulators/linalg/bda/openclKernels.cpp +++ b/opm/simulators/linalg/bda/openclKernels.cpp @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -45,8 +46,15 @@ std::unique_ptr > OpenclKernels::norm_k; std::unique_ptr > OpenclKernels::axpy_k; std::unique_ptr > OpenclKernels::scale_k; +std::unique_ptr > OpenclKernels::vmul_k; std::unique_ptr > OpenclKernels::custom_k; -std::unique_ptr OpenclKernels::spmv_blocked_k; +std::unique_ptr > OpenclKernels::move_to_coarse_k; +std::unique_ptr > OpenclKernels::move_to_fine_k; +std::unique_ptr OpenclKernels::spmv_blocked_k; +std::unique_ptr OpenclKernels::spmv_k; +std::unique_ptr OpenclKernels::spmv_noreset_k; +std::unique_ptr OpenclKernels::residual_blocked_k; +std::unique_ptr OpenclKernels::residual_k; std::unique_ptr OpenclKernels::ILU_apply1_k; std::unique_ptr OpenclKernels::ILU_apply2_k; std::unique_ptr OpenclKernels::stdwell_apply_k; @@ -79,14 +87,28 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve add_kernel_string(sources, axpy_s); const std::string& scale_s = get_scale_string(); add_kernel_string(sources, scale_s); + const std::string& vmul_s = get_vmul_string(); + add_kernel_string(sources, vmul_s); const std::string& dot_1_s = get_dot_1_string(); add_kernel_string(sources, dot_1_s); const std::string& norm_s = get_norm_string(); add_kernel_string(sources, norm_s); const std::string& custom_s = get_custom_string(); add_kernel_string(sources, custom_s); - const std::string& spmv_blocked_s = get_spmv_blocked_string(); + const std::string& move_to_coarse_s = get_move_to_coarse_string(); + add_kernel_string(sources, move_to_coarse_s); + const std::string& move_to_fine_s = get_move_to_fine_string(); + add_kernel_string(sources, move_to_fine_s); + const std::string& spmv_blocked_s = get_blocked_matrix_operation_string(matrix_operation::spmv_op); add_kernel_string(sources, spmv_blocked_s); + const std::string& spmv_s = get_matrix_operation_string(matrix_operation::spmv_op, true); + add_kernel_string(sources, spmv_s); + const std::string& spmv_noreset_s = get_matrix_operation_string(matrix_operation::spmv_op, false); + add_kernel_string(sources, spmv_noreset_s); + const std::string& residual_blocked_s = get_blocked_matrix_operation_string(matrix_operation::residual_op); + add_kernel_string(sources, residual_blocked_s); + const std::string& residual_s = get_matrix_operation_string(matrix_operation::residual_op); + add_kernel_string(sources, residual_s); #if CHOW_PATEL bool ilu_operate_on_full_matrix = false; #else @@ -114,8 +136,15 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve norm_k.reset(new cl::KernelFunctor(cl::Kernel(program, "norm"))); axpy_k.reset(new cl::KernelFunctor(cl::Kernel(program, "axpy"))); scale_k.reset(new cl::KernelFunctor(cl::Kernel(program, "scale"))); + vmul_k.reset(new cl::KernelFunctor(cl::Kernel(program, "vmul"))); custom_k.reset(new cl::KernelFunctor(cl::Kernel(program, "custom"))); - spmv_blocked_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_blocked"))); + move_to_coarse_k.reset(new cl::KernelFunctor(cl::Kernel(program, "move_to_coarse"))); + move_to_fine_k.reset(new cl::KernelFunctor(cl::Kernel(program, "move_to_fine"))); + spmv_blocked_k.reset(new spmv_blocked_kernel_type(cl::Kernel(program, "spmv_blocked"))); + spmv_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv"))); + spmv_noreset_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_noreset"))); + residual_blocked_k.reset(new residual_blocked_kernel_type(cl::Kernel(program, "residual_blocked"))); + residual_k.reset(new residual_kernel_type(cl::Kernel(program, "residual"))); ILU_apply1_k.reset(new ilu_apply1_kernel_type(cl::Kernel(program, "ILU_apply1"))); ILU_apply2_k.reset(new ilu_apply2_kernel_type(cl::Kernel(program, "ILU_apply2"))); stdwell_apply_k.reset(new stdwell_apply_kernel_type(cl::Kernel(program, "stdwell_apply"))); @@ -219,6 +248,23 @@ void OpenclKernels::scale(cl::Buffer& in, const double a, int N) } } +void OpenclKernels::vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N) +{ + const unsigned int work_group_size = 32; + const unsigned int num_work_groups = ceilDivision(N, work_group_size); + const unsigned int total_work_items = num_work_groups * work_group_size; + Timer t_vmul; + + cl::Event event = (*vmul_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), alpha, in1, in2, out, N); + + if (verbosity >= 4) { + event.wait(); + std::ostringstream oss; + oss << std::scientific << "OpenclKernels vmul() time: " << t_vmul.stop() << " s"; + OpmLog::info(oss.str()); + } +} + void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N) { const unsigned int work_group_size = 32; @@ -236,15 +282,58 @@ void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const do } } -void OpenclKernels::spmv_blocked(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size) +void OpenclKernels::move_to_coarse(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb) +{ + const unsigned int work_group_size = 32; + const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); + const unsigned int total_work_items = num_work_groups * work_group_size; + Timer t; + + cl::Event event = (*move_to_coarse_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), fine_y, weights, coarse_y, Nb); + + if (verbosity >= 4) { + event.wait(); + std::ostringstream oss; + oss << std::scientific << "OpenclKernels move_to_coarse() time: " << t.stop() << " s"; + OpmLog::info(oss.str()); + } +} + +void OpenclKernels::move_to_fine(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb) +{ + const unsigned int work_group_size = 32; + const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); + const unsigned int total_work_items = num_work_groups * work_group_size; + Timer t; + + cl::Event event = (*move_to_fine_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), coarse_x, fine_x, pressure_idx, Nb); + + if (verbosity >= 4) { + event.wait(); + std::ostringstream oss; + oss << std::scientific << "OpenclKernels move_to_fine() time: " << t.stop() << " s"; + OpmLog::info(oss.str()); + } +} + +void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset) { const unsigned int work_group_size = 32; const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); const unsigned int total_work_items = num_work_groups * work_group_size; const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; Timer t_spmv; + cl::Event event; - cl::Event event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group)); + if (block_size > 1) { + event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group)); + } else { + if (reset) { + event = (*spmv_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group)); + } else { + event = (*spmv_noreset_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group)); + } + } if (verbosity >= 4) { event.wait(); @@ -254,6 +343,29 @@ void OpenclKernels::spmv_blocked(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& } } +void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, const cl::Buffer& rhs, cl::Buffer& out, int Nb, unsigned int block_size) +{ + const unsigned int work_group_size = 32; + const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); + const unsigned int total_work_items = num_work_groups * work_group_size; + const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; + Timer t_residual; + cl::Event event; + + if (block_size > 1) { + event = (*residual_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, rhs, out, block_size, cl::Local(lmem_per_work_group)); + } else { + event = (*residual_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, rhs, out, cl::Local(lmem_per_work_group)); + } + + if (verbosity >= 4) { + event.wait(); + std::ostringstream oss; + oss << std::scientific << "OpenclKernels residual_blocked() time: " << t_residual.stop() << " s"; + OpmLog::info(oss.str()); + } +} + void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size) { @@ -395,6 +507,27 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe )"; } + // multiply vector with another vector, element-wise + std::string OpenclKernels::get_vmul_string() { + return R"( + __kernel void vmul( + const double alpha, + __global double const *in1, + __global double const *in2, + __global double *out, + const int N) + { + unsigned int NUM_THREADS = get_global_size(0); + int idx = get_global_id(0); + + while(idx < N){ + out[idx] += alpha * in1[idx] * in2[idx]; + idx += NUM_THREADS; + } + } + )"; + } + // returns partial sums, instead of the final dot product std::string OpenclKernels::get_dot_1_string() { @@ -506,15 +639,77 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe )"; } - std::string OpenclKernels::get_spmv_blocked_string() { + + // transform blocked vector to scalar vector using pressure-weights + // every workitem handles one blockrow + std::string OpenclKernels::get_move_to_coarse_string() { return R"( - __kernel void spmv_blocked( - __global const double *vals, + __kernel void move_to_coarse( + __global const double *fine_y, + __global const double *weights, + __global double *coarse_y, + const unsigned int Nb) + { + const unsigned int NUM_THREADS = get_global_size(0); + const unsigned int block_size = 3; + unsigned int target_block_row = get_global_id(0); + + while(target_block_row < Nb){ + double sum = 0.0; + unsigned int idx = block_size * target_block_row; + for (unsigned int i = 0; i < block_size; ++i) { + sum += fine_y[idx + i] * weights[idx + i]; + } + coarse_y[target_block_row] = sum; + target_block_row += NUM_THREADS; + } + } + )"; + } + + // add the coarse pressure solution back to the finer, complete solution + // every workitem handles one blockrow + std::string OpenclKernels::get_move_to_fine_string() { + return R"( + __kernel void move_to_fine( + __global const double *coarse_x, + __global double *fine_x, + const unsigned int pressure_idx, + const unsigned int Nb) + { + const unsigned int NUM_THREADS = get_global_size(0); + const unsigned int block_size = 3; + unsigned int target_block_row = get_global_id(0); + + while(target_block_row < Nb){ + fine_x[target_block_row * block_size + pressure_idx] += coarse_x[target_block_row]; + target_block_row += NUM_THREADS; + } + } + )"; + } + + +/// either b = mat * x +/// or res = rhs - mat * x +std::string OpenclKernels::get_blocked_matrix_operation_string(matrix_operation op) { + std::string s; + if (op == matrix_operation::spmv_op) { + s += "__kernel void spmv_blocked("; + } else { + s += "__kernel void residual_blocked("; + } + s += R"(__global const double *vals, __global const int *cols, __global const int *rows, const int Nb, __global const double *x, - __global double *b, + )"; + if (op == matrix_operation::residual_op) { + s += "__global const double *rhs,"; + } + s += R"( + __global double *out, const unsigned int block_size, __local double *tmp) { @@ -566,15 +761,97 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe if(lane < bs){ unsigned int row = target_block_row*bs + lane; - b[row] = tmp[lane]; + )"; + if (op == matrix_operation::spmv_op) { + s += " out[row] = tmp[lane];"; + } else { + s += " out[row] = rhs[row] - tmp[lane];"; + } + s += R"( } target_block_row += num_warps_in_grid; } } - )"; - } + )"; + return s; +} +/// either b = mat * x +/// or res = rhs - mat * x +std::string OpenclKernels::get_matrix_operation_string(matrix_operation op, bool spmv_reset) { + std::string s; + if (op == matrix_operation::spmv_op) { + if (spmv_reset) { + s += "__kernel void spmv("; + } else { + s += "__kernel void spmv_noreset("; + } + } else { + s += "__kernel void residual("; + } + s += R"(__global const double *vals, + __global const int *cols, + __global const int *rows, + const int N, + __global const double *x, + )"; + if (op == matrix_operation::residual_op) { + s += "__global const double *rhs,"; + } + s += R"( + __global double *out, + __local double *tmp) + { + const unsigned int bsize = get_local_size(0); + const unsigned int idx_b = get_global_id(0) / bsize; + const unsigned int idx_t = get_local_id(0); + const unsigned int num_workgroups = get_num_groups(0); + + int row = idx_b; + + while (row < N) { + int rowStart = rows[row]; + int rowEnd = rows[row+1]; + int rowLength = rowEnd - rowStart; + double local_sum = 0.0; + for (int j = rowStart + idx_t; j < rowEnd; j += bsize) { + int col = cols[j]; + local_sum += vals[j] * x[col]; + } + + tmp[idx_t] = local_sum; + barrier(CLK_LOCAL_MEM_FENCE); + + int offset = bsize / 2; + while(offset > 0) { + if (idx_t < offset) { + tmp[idx_t] += tmp[idx_t + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); + offset = offset / 2; + } + + if (idx_t == 0) { + )"; + if (op == matrix_operation::spmv_op) { + if (spmv_reset) { + s += " out[row] = tmp[idx_t];"; + } else { + s += " out[row] += tmp[idx_t];"; + } + } else { + s += " out[row] = rhs[row] - tmp[idx_t];"; + } + s += R"( + } + row += num_workgroups; + } + } + )"; + return s; +} + std::string OpenclKernels::get_ILU_apply1_string(bool full_matrix) { std::string s = R"( diff --git a/opm/simulators/linalg/bda/openclKernels.hpp b/opm/simulators/linalg/bda/openclKernels.hpp index c712595ff..1aaab6bd5 100644 --- a/opm/simulators/linalg/bda/openclKernels.hpp +++ b/opm/simulators/linalg/bda/openclKernels.hpp @@ -30,8 +30,14 @@ namespace Opm namespace Accelerator { -using spmv_kernel_type = cl::KernelFunctor; +using spmv_kernel_type = cl::KernelFunctor; +using residual_blocked_kernel_type = cl::KernelFunctor; +using residual_kernel_type = cl::KernelFunctor; using ilu_apply1_kernel_type = cl::KernelFunctor; using ilu_apply2_kernel_type = cl::KernelFunctor tmp; // used as tmp CPU buffer for dot() and norm() static bool initialized; + enum matrix_operation { + spmv_op, + residual_op + }; + static std::unique_ptr > dot_k; static std::unique_ptr > norm_k; static std::unique_ptr > axpy_k; static std::unique_ptr > scale_k; + static std::unique_ptr > vmul_k; static std::unique_ptr > custom_k; - static std::unique_ptr spmv_blocked_k; + static std::unique_ptr > move_to_coarse_k; + static std::unique_ptr > move_to_fine_k; + static std::unique_ptr spmv_blocked_k; + static std::unique_ptr spmv_k; + static std::unique_ptr spmv_noreset_k; + static std::unique_ptr residual_blocked_k; + static std::unique_ptr residual_k; static std::unique_ptr ILU_apply1_k; static std::unique_ptr ILU_apply2_k; static std::unique_ptr stdwell_apply_k; @@ -75,6 +93,10 @@ private: /// a = a * alpha static std::string get_scale_string(); + /// multiply vector with another vector and a scalar, element-wise + /// add result to a third vector + static std::string get_vmul_string(); + /// returns partial sums, instead of the final dot product /// partial sums are added on CPU static std::string get_dot_1_string(); @@ -88,11 +110,20 @@ private: /// p = (p - omega * v) * beta + r static std::string get_custom_string(); + /// Transform blocked vector to scalar vector using pressure-weights + static std::string get_move_to_coarse_string(); + + /// Add the coarse pressure solution back to the finer, complete solution + static std::string get_move_to_fine_string(); + /// b = mat * x /// algorithm based on: /// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures, /// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42 - static std::string get_spmv_blocked_string(); + /// or + /// res = rhs - (mat * x) + static std::string get_blocked_matrix_operation_string(matrix_operation op); + static std::string get_matrix_operation_string(matrix_operation op, bool spmv_reset = true); /// ILU apply part 1: forward substitution /// solves L*x=y where L is a lower triangular sparse blocked matrix @@ -127,8 +158,13 @@ public: static double norm(cl::Buffer& in, cl::Buffer& out, int N); static void axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N); static void scale(cl::Buffer& in, const double a, int N); + static void vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N); static void custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N); - static void spmv_blocked(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size); + static void move_to_coarse(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb); + static void move_to_fine(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb); + static void spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset = true); + static void residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, const cl::Buffer& rhs, cl::Buffer& out, int Nb, unsigned int block_size); + static void ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size); diff --git a/opm/simulators/linalg/bda/openclSolverBackend.cpp b/opm/simulators/linalg/bda/openclSolverBackend.cpp index c0fa025fb..b93f70bcc 100644 --- a/opm/simulators/linalg/bda/openclSolverBackend.cpp +++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp @@ -45,8 +45,20 @@ using Opm::OpmLog; using Dune::Timer; template -openclSolverBackend::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder_) : BdaSolver(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_reorder(opencl_ilu_reorder_) { - prec = new Preconditioner(opencl_ilu_reorder, verbosity_); +openclSolverBackend::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder_, std::string linsolver) : BdaSolver(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_reorder(opencl_ilu_reorder_) { + + if (linsolver.compare("ilu0") == 0) { + use_cpr = false; + } else if (linsolver.compare("cpr_quasiimpes") == 0) { + use_cpr = true; + } else { + OPM_THROW(std::logic_error, "Error unknown value for argument --linsolver, " + linsolver); + } + + bilu0 = std::make_unique >(opencl_ilu_reorder, verbosity_); + if (use_cpr) { + cpr = std::make_unique >(verbosity_, opencl_ilu_reorder); + } std::ostringstream out; try { @@ -191,6 +203,18 @@ openclSolverBackend::openclSolverBackend(int verbosity_, int maxit_, } } +template +openclSolverBackend::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, ILUReorder opencl_ilu_reorder_) : + BdaSolver(verbosity_, maxit_, tolerance_), use_cpr(false), opencl_ilu_reorder(opencl_ilu_reorder_) +{ + bilu0 = std::make_unique >(opencl_ilu_reorder, verbosity_); +} + +template +void openclSolverBackend::setOpencl(std::shared_ptr& context_, std::shared_ptr& queue_) { + context = context_; + queue = queue_; +} template openclSolverBackend::~openclSolverBackend() { @@ -252,12 +276,16 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr // pw = prec(p) t_prec.start(); - prec->apply(d_p, d_pw); + bilu0->apply(d_p, d_pw); t_prec.stop(); + if (use_cpr) { + cpr->apply(d_p, d_pw); + } + // v = A * pw t_spmv.start(); - OpenclKernels::spmv_blocked(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size); + OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size); t_spmv.stop(); // apply wellContributions @@ -283,12 +311,16 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr // s = prec(r) t_prec.start(); - prec->apply(d_r, d_s); + bilu0->apply(d_r, d_s); t_prec.stop(); + if (use_cpr) { + cpr->apply(d_r, d_s); + } + // t = A * s t_spmv.start(); - OpenclKernels::spmv_blocked(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size); + OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size); t_spmv.stop(); // apply wellContributions @@ -358,8 +390,12 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, doub out.clear(); try { - prec->setOpenCLContext(context.get()); - prec->setOpenCLQueue(queue.get()); + bilu0->setOpenCLContext(context.get()); + bilu0->setOpenCLQueue(queue.get()); + + if (use_cpr) { + cpr->init(Nb, nnzb, context, queue); + } #if COPY_ROW_BY_ROW vals_contiguous = new double[N]; @@ -411,7 +447,6 @@ void openclSolverBackend::finalize() { #if COPY_ROW_BY_ROW delete[] vals_contiguous; #endif - delete prec; } // end finalize() template @@ -492,14 +527,14 @@ template bool openclSolverBackend::analyse_matrix() { Timer t; - bool success = prec->init(mat.get()); + bool success = bilu0->init(mat.get()); if (opencl_ilu_reorder == ILUReorder::NONE) { rmat = mat.get(); } else { - toOrder = prec->getToOrder(); - fromOrder = prec->getFromOrder(); - rmat = prec->getRMat(); + toOrder = bilu0->getToOrder(); + fromOrder = bilu0->getFromOrder(); + rmat = bilu0->getRMat(); } if (verbosity > 2) { @@ -539,7 +574,14 @@ template bool openclSolverBackend::create_preconditioner() { Timer t; - bool result = prec->create_preconditioner(mat.get()); + bool result = bilu0->create_preconditioner(mat.get()); + if (use_cpr) { + if (opencl_ilu_reorder == ILUReorder::NONE) { + cpr->create_preconditioner(mat.get()); + } else { + cpr->create_preconditioner(bilu0->getRMat()); + } + } if (verbosity > 2) { std::ostringstream out; @@ -624,8 +666,11 @@ SolverStatus openclSolverBackend::solve_system(int N_, int nnz_, int } -#define INSTANTIATE_BDA_FUNCTIONS(n) \ -template openclSolverBackend::openclSolverBackend(int, int, double, unsigned int, unsigned int, ILUReorder); \ +#define INSTANTIATE_BDA_FUNCTIONS(n) \ +template openclSolverBackend::openclSolverBackend( \ + int, int, double, unsigned int, unsigned int, ILUReorder, std::string); \ +template openclSolverBackend::openclSolverBackend(int, int, double, ILUReorder); \ +template void openclSolverBackend::setOpencl(std::shared_ptr&, std::shared_ptr&); INSTANTIATE_BDA_FUNCTIONS(1); INSTANTIATE_BDA_FUNCTIONS(2); diff --git a/opm/simulators/linalg/bda/openclSolverBackend.hpp b/opm/simulators/linalg/bda/openclSolverBackend.hpp index 139cdb5f0..e23a3df33 100644 --- a/opm/simulators/linalg/bda/openclSolverBackend.hpp +++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -35,12 +36,14 @@ namespace Opm namespace Accelerator { +template +class CPR; + /// This class implements a opencl-based ilu0-bicgstab solver on GPU template class openclSolverBackend : public BdaSolver { typedef BdaSolver Base; - typedef BILU0 Preconditioner; using Base::N; using Base::Nb; @@ -66,7 +69,10 @@ private: std::vector devices; - Preconditioner *prec = nullptr; // only supported preconditioner is BILU0 + std::unique_ptr > bilu0; // Blocked ILU0 preconditioner + std::unique_ptr > cpr; // Constrained Pressure Residual preconditioner + bool is_root; // allow for nested solvers, the root solver is called by BdaBridge + bool use_cpr; // allow to enable CPR int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings bool analysis_done = false; std::unique_ptr > mat = nullptr; // original matrix @@ -178,7 +184,13 @@ public: /// \param[in] platformID the OpenCL platform to be used /// \param[in] deviceID the device to be used /// \param[in] opencl_ilu_reorder select either level_scheduling or graph_coloring, see BILU0.hpp for explanation - openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, ILUReorder opencl_ilu_reorder); + /// \param[in] linsolver indicating the preconditioner, equal to the --linsolver cmdline argument + /// only ilu0 and cpr_quasiimpes are supported + openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, + ILUReorder opencl_ilu_reorder, std::string linsolver); + + /// For the CPR coarse solver + openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder); /// Destroy a openclSolver, and free memory ~openclSolverBackend(); @@ -200,10 +212,20 @@ public: /// \return status code SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override; + /// Solve scalar linear system, for example a coarse system of an AMG preconditioner + /// Data is already on the GPU + // SolverStatus solve_system(BdaResult &res); + /// Get result after linear solve, and peform postprocessing if necessary /// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array void get_result(double *x) override; + /// Set OpenCL objects + /// This class either creates them based on platformID and deviceID or receives them through this function + /// \param[in] context the opencl context to be used + /// \param[in] queue the opencl queue to be used + void setOpencl(std::shared_ptr& context, std::shared_ptr& queue); + }; // end class openclSolverBackend } // namespace Accelerator