From 98ddf47b4497a96a51c2a1e9f3c2aee2d6186ef9 Mon Sep 17 00:00:00 2001 From: "T.D. (Tongdong) Qiu" Date: Wed, 24 Jun 2020 20:09:14 +0200 Subject: [PATCH] Added block_size template to BdaSolvers and BILU0 --- CMakeLists_files.cmake | 1 + opm/simulators/linalg/bda/BILU0.cpp | 51 +++++++--- opm/simulators/linalg/bda/BILU0.hpp | 4 +- opm/simulators/linalg/bda/BdaBridge.cpp | 18 ++-- opm/simulators/linalg/bda/BdaBridge.hpp | 2 +- opm/simulators/linalg/bda/BdaSolver.hpp | 14 +-- opm/simulators/linalg/bda/BdaSolverStatus.hpp | 42 +++++++++ .../linalg/bda/cusparseSolverBackend.cu | 61 ++++++++---- .../linalg/bda/cusparseSolverBackend.hpp | 18 +++- .../linalg/bda/openclSolverBackend.cpp | 92 ++++++++++++------- .../linalg/bda/openclSolverBackend.hpp | 20 +++- 11 files changed, 232 insertions(+), 91 deletions(-) create mode 100644 opm/simulators/linalg/bda/BdaSolverStatus.hpp diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake index 7dfddd12c..d524dc96b 100644 --- a/CMakeLists_files.cmake +++ b/CMakeLists_files.cmake @@ -150,6 +150,7 @@ list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/bda/BdaBridge.hpp opm/simulators/linalg/bda/BdaResult.hpp opm/simulators/linalg/bda/BdaSolver.hpp + opm/simulators/linalg/bda/BdaSolverStatus.hpp opm/simulators/linalg/bda/BILU0.hpp opm/simulators/linalg/bda/BlockedMatrix.hpp opm/simulators/linalg/bda/cuda_header.hpp diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp index e42ddbe7f..632a758ea 100644 --- a/opm/simulators/linalg/bda/BILU0.cpp +++ b/opm/simulators/linalg/bda/BILU0.cpp @@ -35,12 +35,12 @@ namespace bda using Opm::OpmLog; - BILU0::BILU0(bool level_scheduling_, bool graph_coloring_, int verbosity_) : - // define 'second' as 'BdaSolver<>::second', this allows usage of the second() function for timing // typedefs cannot handle templates const auto second = BdaSolver<>::second; + template + BILU0::BILU0(bool level_scheduling_, bool graph_coloring_, int verbosity_) : level_scheduling(level_scheduling_), graph_coloring(graph_coloring_), verbosity(verbosity_) { if (level_scheduling == graph_coloring) { @@ -49,7 +49,8 @@ namespace bda double t1 = second(); } - BILU0::~BILU0() + template + BILU0::~BILU0() { delete[] invDiagVals; delete[] diagIndex; @@ -62,7 +63,8 @@ namespace bda freeBlockedMatrix(&rMat); } - bool BILU0::init(BlockedMatrix *mat, unsigned int block_size) + template + bool BILU0::init(BlockedMatrix *mat) { double t1 = 0.0, t2 = 0.0; BlockedMatrix *CSCmat = nullptr; @@ -71,7 +73,6 @@ namespace bda this->Nb = mat->Nb; this->nnz = mat->nnzbs * block_size * block_size; this->nnzbs = mat->nnzbs; - this->block_size = block_size; toOrder = new int[N]; fromOrder = new int[N]; @@ -160,7 +161,8 @@ namespace bda } // end init() - bool BILU0::create_preconditioner(BlockedMatrix *mat) + template + bool BILU0::create_preconditioner(BlockedMatrix *mat) { double t1 = 0.0, t2 = 0.0; if (verbosity >= 3){ @@ -306,13 +308,13 @@ namespace bda // kernels are blocking on an NVIDIA GPU, so waiting for events is not needed // however, if individual kernel calls are timed, waiting for events is needed // behavior on other GPUs is untested - void BILU0::apply(cl::Buffer& x, cl::Buffer& y) + template + void BILU0::apply(cl::Buffer& x, cl::Buffer& y) { double t1 = 0.0, t2 = 0.0; if (verbosity >= 3) { t1 = second(); } - const unsigned int block_size = 3; cl::Event event; for(unsigned int color = 0; color < numColors; ++color){ @@ -334,18 +336,22 @@ namespace bda } - void BILU0::setOpenCLContext(cl::Context *context){ + template + void BILU0::setOpenCLContext(cl::Context *context){ this->context = context; } - void BILU0::setOpenCLQueue(cl::CommandQueue *queue){ + template + void BILU0::setOpenCLQueue(cl::CommandQueue *queue){ this->queue = queue; } - void BILU0::setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group){ + template + void BILU0::setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group){ this->work_group_size = work_group_size; this->total_work_items = total_work_items; this->lmem_per_work_group = lmem_per_work_group; } - void BILU0::setKernels( + template + void BILU0::setKernels( cl::make_kernel *ILU_apply1_, cl::make_kernel *ILU_apply2_ ){ @@ -353,6 +359,27 @@ namespace bda this->ILU_apply2 = ILU_apply2_; } + +#define INSTANTIATE_BDA_FUNCTIONS(n) \ +template BILU0::BILU0(bool, bool, int); \ +template bool BILU0::init(BlockedMatrix*); \ +template bool BILU0::create_preconditioner(BlockedMatrix*); \ +template void BILU0::apply(cl::Buffer& x, cl::Buffer& y); \ +template void BILU0::setOpenCLContext(cl::Context*); \ +template void BILU0::setOpenCLQueue(cl::CommandQueue*); \ +template void BILU0::setKernelParameters(unsigned int, unsigned int, unsigned int); \ +template void BILU0::setKernels( \ + cl::make_kernel *, \ + cl::make_kernel * \ + ); + +INSTANTIATE_BDA_FUNCTIONS(1); +INSTANTIATE_BDA_FUNCTIONS(2); +INSTANTIATE_BDA_FUNCTIONS(3); +INSTANTIATE_BDA_FUNCTIONS(4); + +#undef INSTANTIATE_BDA_FUNCTIONS + } // end namespace bda diff --git a/opm/simulators/linalg/bda/BILU0.hpp b/opm/simulators/linalg/bda/BILU0.hpp index 4589108a9..1bebab3ad 100644 --- a/opm/simulators/linalg/bda/BILU0.hpp +++ b/opm/simulators/linalg/bda/BILU0.hpp @@ -36,6 +36,7 @@ namespace bda /// This class implementa a Blocked ILU0 preconditioner /// The decomposition is done on CPU, and reorders the rows of the matrix + template class BILU0 { @@ -44,7 +45,6 @@ namespace bda int Nb; // number of blockrows of the matrix int nnz; // number of nonzeroes of the matrix (scalar) int nnzbs; // number of blocks of the matrix - unsigned int block_size; BlockedMatrix *LMat, *UMat, *LUMat; BlockedMatrix *rMat = nullptr; // only used with PAR_SIM Block *invDiagVals; @@ -79,7 +79,7 @@ namespace bda ~BILU0(); // analysis - bool init(BlockedMatrix *mat, unsigned int block_size); + bool init(BlockedMatrix *mat); // ilu_decomposition bool create_preconditioner(BlockedMatrix *mat); diff --git a/opm/simulators/linalg/bda/BdaBridge.cpp b/opm/simulators/linalg/bda/BdaBridge.cpp index 134b07322..9ed43031f 100644 --- a/opm/simulators/linalg/bda/BdaBridge.cpp +++ b/opm/simulators/linalg/bda/BdaBridge.cpp @@ -26,6 +26,7 @@ #include #include +#include #include #define PRINT_TIMERS_BRIDGE 0 @@ -37,6 +38,7 @@ namespace Opm using bda::BdaResult; using bda::BdaSolver; + using bda::BdaSolverStatus; template BdaBridge::BdaBridge(std::string gpu_mode, int linear_solver_verbosity, int maxit, double tolerance) @@ -44,14 +46,14 @@ BdaBridge::BdaBridge(std::string gpu_mod if (gpu_mode.compare("cusparse") == 0) { #if HAVE_CUDA use_gpu = true; - backend.reset(new bda::cusparseSolverBackend(linear_solver_verbosity, maxit, tolerance)); + backend.reset(new bda::cusparseSolverBackend(linear_solver_verbosity, maxit, tolerance)); #else OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake"); #endif } else if (gpu_mode.compare("opencl") == 0) { #if HAVE_OPENCL use_gpu = true; - backend.reset(new bda::openclSolverBackend(linear_solver_verbosity, maxit, tolerance)); + backend.reset(new bda::openclSolverBackend(linear_solver_verbosity, maxit, tolerance)); #else OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake"); #endif @@ -178,17 +180,17 @@ void BdaBridge::solve_system(BridgeMatri ///////////////////////// // actually solve - typedef BdaSolver::BdaSolverStatus BdaSolverStatus; + typedef BdaSolverStatus::Status Status; // assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, cusparseSolver is expected to perform undefined behaviour - BdaSolverStatus status = backend->solve_system(N, nnz, dim, static_cast(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast(&(b[0][0])), wellContribs, result); + Status status = backend->solve_system(N, nnz, dim, static_cast(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast(&(b[0][0])), wellContribs, result); switch(status) { - case BdaSolverStatus::BDA_SOLVER_SUCCESS: + case Status::BDA_SOLVER_SUCCESS: //OpmLog::info("BdaSolver converged"); break; - case BdaSolverStatus::BDA_SOLVER_ANALYSIS_FAILED: + case Status::BDA_SOLVER_ANALYSIS_FAILED: OpmLog::warning("BdaSolver could not analyse level information of matrix, perhaps there is still a 0.0 on the diagonal of a block on the diagonal"); break; - case BdaSolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED: + case Status::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED: OpmLog::warning("BdaSolver could not create preconditioner, perhaps there is still a 0.0 on the diagonal of a block on the diagonal"); break; default: @@ -238,6 +240,6 @@ INSTANTIATE_BDA_FUNCTIONS(4); #undef INSTANTIATE_BDA_FUNCTIONS -} +} // namespace Opm diff --git a/opm/simulators/linalg/bda/BdaBridge.hpp b/opm/simulators/linalg/bda/BdaBridge.hpp index cd5145746..706394ac5 100644 --- a/opm/simulators/linalg/bda/BdaBridge.hpp +++ b/opm/simulators/linalg/bda/BdaBridge.hpp @@ -47,7 +47,7 @@ template class BdaBridge { private: - std::unique_ptr backend; + std::unique_ptr > backend; bool use_gpu = false; public: diff --git a/opm/simulators/linalg/bda/BdaSolver.hpp b/opm/simulators/linalg/bda/BdaSolver.hpp index 8480c7d61..e812b5e34 100644 --- a/opm/simulators/linalg/bda/BdaSolver.hpp +++ b/opm/simulators/linalg/bda/BdaSolver.hpp @@ -26,15 +26,19 @@ #include #include +#include #include namespace bda { using Opm::WellContributions; + typedef BdaSolverStatus::Status Status; /// This class serves to simplify choosing between different backend solvers, such as cusparseSolver and openclSolver /// This class is abstract, no instantiations can of it can be made, only of its children + /// Without a default block_size value, the BILU0 class cannot use BdaSolver::second() + template class BdaSolver { @@ -55,26 +59,18 @@ namespace bda int Nb; // number of blocked rows (Nb*block_size == N) int nnz; // number of nonzeroes (scalars) int nnzb; // number of nonzero blocks (nnzb*block_size*block_size == nnz) - int block_size; // size of block bool initialized = false; public: - enum class BdaSolverStatus { - BDA_SOLVER_SUCCESS, - BDA_SOLVER_ANALYSIS_FAILED, - BDA_SOLVER_CREATE_PRECONDITIONER_FAILED, - BDA_SOLVER_UNKNOWN_ERROR - }; - BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_) {}; /// Define virtual destructor, so that the derivedclass destructor will be called virtual ~BdaSolver() {}; /// Define as pure virtual functions, so derivedclass must implement them - virtual BdaSolverStatus solve_system(int N, int nnz, int dim, + virtual Status solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) = 0; diff --git a/opm/simulators/linalg/bda/BdaSolverStatus.hpp b/opm/simulators/linalg/bda/BdaSolverStatus.hpp new file mode 100644 index 000000000..9c2776871 --- /dev/null +++ b/opm/simulators/linalg/bda/BdaSolverStatus.hpp @@ -0,0 +1,42 @@ +/* + Copyright 2019 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#ifndef OPM_BDASOLVERSTATUS_HEADER_INCLUDED +#define OPM_BDASOLVERSTATUS_HEADER_INCLUDED + +namespace bda +{ + + class BdaSolverStatus + { + + public: + + enum class Status { + BDA_SOLVER_SUCCESS, + BDA_SOLVER_ANALYSIS_FAILED, + BDA_SOLVER_CREATE_PRECONDITIONER_FAILED, + BDA_SOLVER_UNKNOWN_ERROR + }; + + }; // end class BdaSolverStatus + +} // end namespace bda + +#endif diff --git a/opm/simulators/linalg/bda/cusparseSolverBackend.cu b/opm/simulators/linalg/bda/cusparseSolverBackend.cu index 986afc9a2..53270ba1e 100644 --- a/opm/simulators/linalg/bda/cusparseSolverBackend.cu +++ b/opm/simulators/linalg/bda/cusparseSolverBackend.cu @@ -51,13 +51,17 @@ const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; const cusparseOperation_t operation = CUSPARSE_OPERATION_NON_TRANSPOSE; const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW; -cusparseSolverBackend::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_) : BdaSolver(verbosity_, maxit_, tolerance_) {} -cusparseSolverBackend::~cusparseSolverBackend() { +template +cusparseSolverBackend::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_) : BdaSolver(verbosity_, maxit_, tolerance_) {} + +template +cusparseSolverBackend::~cusparseSolverBackend() { finalize(); } -void cusparseSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) { +template +void cusparseSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) { double t_total1, t_total2; int n = N; double rho = 1.0, rhop; @@ -188,10 +192,10 @@ void cusparseSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaRe } -void cusparseSolverBackend::initialize(int N, int nnz, int dim) { +template +void cusparseSolverBackend::initialize(int N, int nnz, int dim) { this->N = N; this->nnz = nnz; - this->block_size = dim; this->nnzb = nnz / block_size / block_size; Nb = (N + dim - 1) / dim; std::ostringstream out; @@ -250,7 +254,8 @@ void cusparseSolverBackend::initialize(int N, int nnz, int dim) { initialized = true; } // end initialize() -void cusparseSolverBackend::finalize() { +template +void cusparseSolverBackend::finalize() { if (initialized) { cudaFree(d_x); cudaFree(d_b); @@ -283,7 +288,8 @@ void cusparseSolverBackend::finalize() { } // end finalize() -void cusparseSolverBackend::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b) { +template +void cusparseSolverBackend::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b) { double t1, t2; if (verbosity > 2) { @@ -318,7 +324,8 @@ void cusparseSolverBackend::copy_system_to_gpu(double *vals, int *rows, int *col // don't copy rowpointers and colindices, they stay the same -void cusparseSolverBackend::update_system_on_gpu(double *vals, int *rows, double *b) { +template +void cusparseSolverBackend::update_system_on_gpu(double *vals, int *rows, double *b) { double t1, t2; if (verbosity > 2) { @@ -350,12 +357,14 @@ void cusparseSolverBackend::update_system_on_gpu(double *vals, int *rows, double } // end update_system_on_gpu() -void cusparseSolverBackend::reset_prec_on_gpu() { +template +void cusparseSolverBackend::reset_prec_on_gpu() { cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream); } -bool cusparseSolverBackend::analyse_matrix() { +template +bool cusparseSolverBackend::analyse_matrix() { int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize; double t1, t2; @@ -436,7 +445,8 @@ bool cusparseSolverBackend::analyse_matrix() { return true; } // end analyse_matrix() -bool cusparseSolverBackend::create_preconditioner() { +template +bool cusparseSolverBackend::create_preconditioner() { double t1, t2; if (verbosity > 2) { @@ -468,7 +478,8 @@ bool cusparseSolverBackend::create_preconditioner() { } // end create_preconditioner() -void cusparseSolverBackend::solve_system(WellContributions& wellContribs, BdaResult &res) { +template +void cusparseSolverBackend::solve_system(WellContributions& wellContribs, BdaResult &res) { // actually solve gpu_pbicgstab(wellContribs, res); cudaStreamSynchronize(stream); @@ -478,7 +489,8 @@ void cusparseSolverBackend::solve_system(WellContributions& wellContribs, BdaRes // copy result to host memory // caller must be sure that x is a valid array -void cusparseSolverBackend::get_result(double *x) { +template +void cusparseSolverBackend::get_result(double *x) { double t1, t2; if (verbosity > 2) { @@ -497,9 +509,10 @@ void cusparseSolverBackend::get_result(double *x) { } // end get_result() -typedef BdaSolver::BdaSolverStatus BdaSolverStatus; +typedef BdaSolverStatus::Status Status; -BdaSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) { +template +Status cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) { if (initialized == false) { initialize(N, nnz, dim); copy_system_to_gpu(vals, rows, cols, b); @@ -508,19 +521,29 @@ BdaSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, dou } if (analysis_done == false) { if (!analyse_matrix()) { - return BdaSolverStatus::BDA_SOLVER_ANALYSIS_FAILED; + return Status::BDA_SOLVER_ANALYSIS_FAILED; } } reset_prec_on_gpu(); if (create_preconditioner()) { solve_system(wellContribs, res); } else { - return BdaSolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; + return Status::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; } - return BdaSolverStatus::BDA_SOLVER_SUCCESS; + return Status::BDA_SOLVER_SUCCESS; } -} +#define INSTANTIATE_BDA_FUNCTIONS(n) \ +template cusparseSolverBackend::cusparseSolverBackend(int, int, double); \ + +INSTANTIATE_BDA_FUNCTIONS(1); +INSTANTIATE_BDA_FUNCTIONS(2); +INSTANTIATE_BDA_FUNCTIONS(3); +INSTANTIATE_BDA_FUNCTIONS(4); + +#undef INSTANTIATE_BDA_FUNCTIONS + +} // namespace bda diff --git a/opm/simulators/linalg/bda/cusparseSolverBackend.hpp b/opm/simulators/linalg/bda/cusparseSolverBackend.hpp index 51dcfee6a..aa8df2170 100644 --- a/opm/simulators/linalg/bda/cusparseSolverBackend.hpp +++ b/opm/simulators/linalg/bda/cusparseSolverBackend.hpp @@ -32,7 +32,21 @@ namespace bda { /// This class implements a cusparse-based ilu0-bicgstab solver on GPU -class cusparseSolverBackend : public BdaSolver { +template +class cusparseSolverBackend : public BdaSolver { + + typedef BdaSolver Base; + + using Base::N; + using Base::Nb; + using Base::nnz; + using Base::nnzb; + using Base::verbosity; + using Base::maxit; + using Base::tolerance; + using Base::second; + using Base::initialized; + typedef BdaSolverStatus::Status Status; private: @@ -120,7 +134,7 @@ public: /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A /// \param[inout] res summary of solver result /// \return status code - BdaSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override; + Status solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override; /// Get resulting vector x after linear solve, also includes post processing if necessary /// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array diff --git a/opm/simulators/linalg/bda/openclSolverBackend.cpp b/opm/simulators/linalg/bda/openclSolverBackend.cpp index ed2897770..ceca68fc0 100644 --- a/opm/simulators/linalg/bda/openclSolverBackend.cpp +++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp @@ -51,31 +51,28 @@ namespace bda using Opm::OpmLog; -openclSolverBackend::openclSolverBackend(int verbosity_, int maxit_, double tolerance_) : BdaSolver(verbosity_, maxit_, tolerance_) { +template +openclSolverBackend::openclSolverBackend(int verbosity_, int maxit_, double tolerance_) : BdaSolver(verbosity_, maxit_, tolerance_) { prec = new Preconditioner(LEVEL_SCHEDULING, GRAPH_COLORING, verbosity_); } -openclSolverBackend::~openclSolverBackend() { +template +openclSolverBackend::~openclSolverBackend() { finalize(); } // divide A by B, and round up: return (int)ceil(A/B) -unsigned int openclSolverBackend::ceilDivision(const unsigned int A, const unsigned int B) +template +unsigned int openclSolverBackend::ceilDivision(const unsigned int A, const unsigned int B) { return A / B + (A % B > 0); } -// just for verifying and debugging -bool equal(float a, float b) -{ - const float tol_abs = 1e-2; - const float tol_rel = 1e-2; - return std::abs(a - b) <= std::max(tol_rel * std::max(std::abs(a), std::abs(b)), tol_abs); -} -double openclSolverBackend::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out) +template +double openclSolverBackend::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out) { double t1 = 0.0, t2 = 0.0; const unsigned int work_group_size = 1024; @@ -106,7 +103,8 @@ double openclSolverBackend::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out return gpu_sum; } -double openclSolverBackend::norm_w(cl::Buffer in, cl::Buffer out) +template +double openclSolverBackend::norm_w(cl::Buffer in, cl::Buffer out) { double t1 = 0.0, t2 = 0.0; const unsigned int work_group_size = 1024; @@ -138,7 +136,8 @@ double openclSolverBackend::norm_w(cl::Buffer in, cl::Buffer out) return gpu_norm; } -void openclSolverBackend::axpy_w(cl::Buffer in, const double a, cl::Buffer out) +template +void openclSolverBackend::axpy_w(cl::Buffer in, const double a, cl::Buffer out) { double t1 = 0.0, t2 = 0.0; const unsigned int work_group_size = 32; @@ -159,7 +158,8 @@ void openclSolverBackend::axpy_w(cl::Buffer in, const double a, cl::Buffer out) } } -void openclSolverBackend::custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, const double omega, const double beta) +template +void openclSolverBackend::custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, const double omega, const double beta) { double t1 = 0.0, t2 = 0.0; const unsigned int work_group_size = 32; @@ -180,7 +180,8 @@ void openclSolverBackend::custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, con } } -void openclSolverBackend::spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b) +template +void openclSolverBackend::spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b) { double t1 = 0.0, t2 = 0.0; const unsigned int work_group_size = 32; @@ -203,7 +204,8 @@ void openclSolverBackend::spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::B } -void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) { +template +void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) { float it; double rho, rhop, beta, alpha, omega, tmp1, tmp2; @@ -360,10 +362,10 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResu } -void openclSolverBackend::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) { +template +void openclSolverBackend::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) { this->N = N_; this->nnz = nnz_; - this->block_size = dim; this->nnzb = nnz_ / block_size / block_size; Nb = (N + dim - 1) / dim; @@ -542,7 +544,9 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, double *vals, in initialized = true; } // end initialize() -void openclSolverBackend::finalize() { + +template +void openclSolverBackend::finalize() { delete[] rb; delete[] tmp; #if COPY_ROW_BY_ROW @@ -551,7 +555,8 @@ void openclSolverBackend::finalize() { } // end finalize() -void openclSolverBackend::copy_system_to_gpu() { +template +void openclSolverBackend::copy_system_to_gpu() { double t1 = 0.0, t2 = 0.0; if (verbosity > 2) { @@ -588,7 +593,8 @@ void openclSolverBackend::copy_system_to_gpu() { // don't copy rowpointers and colindices, they stay the same -void openclSolverBackend::update_system_on_gpu() { +template +void openclSolverBackend::update_system_on_gpu() { double t1 = 0.0, t2 = 0.0; if (verbosity > 2) { @@ -622,7 +628,8 @@ void openclSolverBackend::update_system_on_gpu() { } // end update_system_on_gpu() -bool openclSolverBackend::analyse_matrix() { +template +bool openclSolverBackend::analyse_matrix() { double t1 = 0.0, t2 = 0.0; @@ -630,7 +637,7 @@ bool openclSolverBackend::analyse_matrix() { t1 = second(); } - bool success = prec->init(mat, block_size); + bool success = prec->init(mat); int work_group_size = 32; int num_work_groups = ceilDivision(N, work_group_size); int total_work_items = num_work_groups * work_group_size; @@ -654,7 +661,8 @@ bool openclSolverBackend::analyse_matrix() { } // end analyse_matrix() -void openclSolverBackend::update_system(double *vals, double *b) { +template +void openclSolverBackend::update_system(double *vals, double *b) { double t1 = 0.0, t2 = 0.0; if (verbosity > 2) { t1 = second(); @@ -673,7 +681,8 @@ void openclSolverBackend::update_system(double *vals, double *b) { } // end update_system() -bool openclSolverBackend::create_preconditioner() { +template +bool openclSolverBackend::create_preconditioner() { double t1 = 0.0, t2 = 0.0; if (verbosity > 2) { @@ -692,7 +701,8 @@ bool openclSolverBackend::create_preconditioner() { } // end create_preconditioner() -void openclSolverBackend::solve_system(WellContributions& wellContribs, BdaResult &res) { +template +void openclSolverBackend::solve_system(WellContributions& wellContribs, BdaResult &res) { // actually solve double t1 = 0.0, t2 = 0.0; if (verbosity > 2) { @@ -713,7 +723,8 @@ void openclSolverBackend::solve_system(WellContributions& wellContribs, BdaResul // copy result to host memory // caller must be sure that x is a valid array -void openclSolverBackend::get_result(double *x) { +template +void openclSolverBackend::get_result(double *x) { double t1 = 0.0, t2 = 0.0; if (verbosity > 2) { @@ -732,32 +743,43 @@ void openclSolverBackend::get_result(double *x) { } // end get_result() +typedef BdaSolverStatus::Status Status; -typedef BdaSolver::BdaSolverStatus BdaSolverStatus; - -BdaSolverStatus openclSolverBackend::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) { +template +Status openclSolverBackend::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) { if (initialized == false) { initialize(N_, nnz_, dim, vals, rows, cols); if (analysis_done == false) { if (!analyse_matrix()) { - return BdaSolverStatus::BDA_SOLVER_ANALYSIS_FAILED; + return Status::BDA_SOLVER_ANALYSIS_FAILED; } } update_system(vals, b); if (!create_preconditioner()) { - return BdaSolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; + return Status::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; } copy_system_to_gpu(); } else { update_system(vals, b); if (!create_preconditioner()) { - return BdaSolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; + return Status::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; } update_system_on_gpu(); } solve_system(wellContribs, res); - return BdaSolverStatus::BDA_SOLVER_SUCCESS; + return Status::BDA_SOLVER_SUCCESS; } -} + +#define INSTANTIATE_BDA_FUNCTIONS(n) \ +template openclSolverBackend::openclSolverBackend(int, int, double); \ + +INSTANTIATE_BDA_FUNCTIONS(1); +INSTANTIATE_BDA_FUNCTIONS(2); +INSTANTIATE_BDA_FUNCTIONS(3); +INSTANTIATE_BDA_FUNCTIONS(4); + +#undef INSTANTIATE_BDA_FUNCTIONS + +} // namespace bda diff --git a/opm/simulators/linalg/bda/openclSolverBackend.hpp b/opm/simulators/linalg/bda/openclSolverBackend.hpp index f6be94678..4ea70607a 100644 --- a/opm/simulators/linalg/bda/openclSolverBackend.hpp +++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp @@ -31,15 +31,29 @@ #include #include -typedef bda::BILU0 Preconditioner; namespace bda { /// This class implements a opencl-based ilu0-bicgstab solver on GPU -class openclSolverBackend : public BdaSolver +template +class openclSolverBackend : public BdaSolver { + typedef BdaSolver Base; + typedef BILU0 Preconditioner; + + using Base::N; + using Base::Nb; + using Base::nnz; + using Base::nnzb; + using Base::verbosity; + using Base::maxit; + using Base::tolerance; + using Base::second; + using Base::initialized; + typedef BdaSolverStatus::Status Status; + private: double *rb; // reordered b vector, the matrix is reordered, so b must also be @@ -182,7 +196,7 @@ public: /// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A /// \param[inout] res summary of solver result /// \return status code - BdaSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override; + Status solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override; /// Get result after linear solve, and peform postprocessing if necessary /// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array