Added block_size template to BdaSolvers and BILU0

This commit is contained in:
T.D. (Tongdong) Qiu 2020-06-24 20:09:14 +02:00
parent 2a48f5f63f
commit 98ddf47b44
11 changed files with 232 additions and 91 deletions

View File

@ -150,6 +150,7 @@ list (APPEND PUBLIC_HEADER_FILES
opm/simulators/linalg/bda/BdaBridge.hpp opm/simulators/linalg/bda/BdaBridge.hpp
opm/simulators/linalg/bda/BdaResult.hpp opm/simulators/linalg/bda/BdaResult.hpp
opm/simulators/linalg/bda/BdaSolver.hpp opm/simulators/linalg/bda/BdaSolver.hpp
opm/simulators/linalg/bda/BdaSolverStatus.hpp
opm/simulators/linalg/bda/BILU0.hpp opm/simulators/linalg/bda/BILU0.hpp
opm/simulators/linalg/bda/BlockedMatrix.hpp opm/simulators/linalg/bda/BlockedMatrix.hpp
opm/simulators/linalg/bda/cuda_header.hpp opm/simulators/linalg/bda/cuda_header.hpp

View File

@ -35,12 +35,12 @@ namespace bda
using Opm::OpmLog; using Opm::OpmLog;
BILU0::BILU0(bool level_scheduling_, bool graph_coloring_, int verbosity_) :
// define 'second' as 'BdaSolver<>::second', this allows usage of the second() function for timing // define 'second' as 'BdaSolver<>::second', this allows usage of the second() function for timing
// typedefs cannot handle templates // typedefs cannot handle templates
const auto second = BdaSolver<>::second; const auto second = BdaSolver<>::second;
template <unsigned int block_size>
BILU0<block_size>::BILU0(bool level_scheduling_, bool graph_coloring_, int verbosity_) :
level_scheduling(level_scheduling_), graph_coloring(graph_coloring_), verbosity(verbosity_) level_scheduling(level_scheduling_), graph_coloring(graph_coloring_), verbosity(verbosity_)
{ {
if (level_scheduling == graph_coloring) { if (level_scheduling == graph_coloring) {
@ -49,7 +49,8 @@ namespace bda
double t1 = second(); double t1 = second();
} }
BILU0::~BILU0() template <unsigned int block_size>
BILU0<block_size>::~BILU0()
{ {
delete[] invDiagVals; delete[] invDiagVals;
delete[] diagIndex; delete[] diagIndex;
@ -62,7 +63,8 @@ namespace bda
freeBlockedMatrix(&rMat); freeBlockedMatrix(&rMat);
} }
bool BILU0::init(BlockedMatrix *mat, unsigned int block_size) template <unsigned int block_size>
bool BILU0<block_size>::init(BlockedMatrix *mat)
{ {
double t1 = 0.0, t2 = 0.0; double t1 = 0.0, t2 = 0.0;
BlockedMatrix *CSCmat = nullptr; BlockedMatrix *CSCmat = nullptr;
@ -71,7 +73,6 @@ namespace bda
this->Nb = mat->Nb; this->Nb = mat->Nb;
this->nnz = mat->nnzbs * block_size * block_size; this->nnz = mat->nnzbs * block_size * block_size;
this->nnzbs = mat->nnzbs; this->nnzbs = mat->nnzbs;
this->block_size = block_size;
toOrder = new int[N]; toOrder = new int[N];
fromOrder = new int[N]; fromOrder = new int[N];
@ -160,7 +161,8 @@ namespace bda
} // end init() } // end init()
bool BILU0::create_preconditioner(BlockedMatrix *mat) template <unsigned int block_size>
bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
{ {
double t1 = 0.0, t2 = 0.0; double t1 = 0.0, t2 = 0.0;
if (verbosity >= 3){ if (verbosity >= 3){
@ -306,13 +308,13 @@ namespace bda
// kernels are blocking on an NVIDIA GPU, so waiting for events is not needed // kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
// however, if individual kernel calls are timed, waiting for events is needed // however, if individual kernel calls are timed, waiting for events is needed
// behavior on other GPUs is untested // behavior on other GPUs is untested
void BILU0::apply(cl::Buffer& x, cl::Buffer& y) template <unsigned int block_size>
void BILU0<block_size>::apply(cl::Buffer& x, cl::Buffer& y)
{ {
double t1 = 0.0, t2 = 0.0; double t1 = 0.0, t2 = 0.0;
if (verbosity >= 3) { if (verbosity >= 3) {
t1 = second(); t1 = second();
} }
const unsigned int block_size = 3;
cl::Event event; cl::Event event;
for(unsigned int color = 0; color < numColors; ++color){ for(unsigned int color = 0; color < numColors; ++color){
@ -334,18 +336,22 @@ namespace bda
} }
void BILU0::setOpenCLContext(cl::Context *context){ template <unsigned int block_size>
void BILU0<block_size>::setOpenCLContext(cl::Context *context){
this->context = context; this->context = context;
} }
void BILU0::setOpenCLQueue(cl::CommandQueue *queue){ template <unsigned int block_size>
void BILU0<block_size>::setOpenCLQueue(cl::CommandQueue *queue){
this->queue = queue; this->queue = queue;
} }
void BILU0::setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group){ template <unsigned int block_size>
void BILU0<block_size>::setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group){
this->work_group_size = work_group_size; this->work_group_size = work_group_size;
this->total_work_items = total_work_items; this->total_work_items = total_work_items;
this->lmem_per_work_group = lmem_per_work_group; this->lmem_per_work_group = lmem_per_work_group;
} }
void BILU0::setKernels( template <unsigned int block_size>
void BILU0<block_size>::setKernels(
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1_, cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1_,
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2_ cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2_
){ ){
@ -353,6 +359,27 @@ namespace bda
this->ILU_apply2 = ILU_apply2_; this->ILU_apply2 = ILU_apply2_;
} }
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template BILU0<n>::BILU0(bool, bool, int); \
template bool BILU0<n>::init(BlockedMatrix*); \
template bool BILU0<n>::create_preconditioner(BlockedMatrix*); \
template void BILU0<n>::apply(cl::Buffer& x, cl::Buffer& y); \
template void BILU0<n>::setOpenCLContext(cl::Context*); \
template void BILU0<n>::setOpenCLQueue(cl::CommandQueue*); \
template void BILU0<n>::setKernelParameters(unsigned int, unsigned int, unsigned int); \
template void BILU0<n>::setKernels( \
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> * \
);
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
#undef INSTANTIATE_BDA_FUNCTIONS
} // end namespace bda } // end namespace bda

View File

@ -36,6 +36,7 @@ namespace bda
/// This class implementa a Blocked ILU0 preconditioner /// This class implementa a Blocked ILU0 preconditioner
/// The decomposition is done on CPU, and reorders the rows of the matrix /// The decomposition is done on CPU, and reorders the rows of the matrix
template <unsigned int block_size>
class BILU0 class BILU0
{ {
@ -44,7 +45,6 @@ namespace bda
int Nb; // number of blockrows of the matrix int Nb; // number of blockrows of the matrix
int nnz; // number of nonzeroes of the matrix (scalar) int nnz; // number of nonzeroes of the matrix (scalar)
int nnzbs; // number of blocks of the matrix int nnzbs; // number of blocks of the matrix
unsigned int block_size;
BlockedMatrix *LMat, *UMat, *LUMat; BlockedMatrix *LMat, *UMat, *LUMat;
BlockedMatrix *rMat = nullptr; // only used with PAR_SIM BlockedMatrix *rMat = nullptr; // only used with PAR_SIM
Block *invDiagVals; Block *invDiagVals;
@ -79,7 +79,7 @@ namespace bda
~BILU0(); ~BILU0();
// analysis // analysis
bool init(BlockedMatrix *mat, unsigned int block_size); bool init(BlockedMatrix *mat);
// ilu_decomposition // ilu_decomposition
bool create_preconditioner(BlockedMatrix *mat); bool create_preconditioner(BlockedMatrix *mat);

View File

@ -26,6 +26,7 @@
#include <opm/material/common/Unused.hpp> #include <opm/material/common/Unused.hpp>
#include <opm/simulators/linalg/bda/BdaBridge.hpp> #include <opm/simulators/linalg/bda/BdaBridge.hpp>
#include <opm/simulators/linalg/bda/BdaSolverStatus.hpp>
#include <opm/simulators/linalg/bda/BdaResult.hpp> #include <opm/simulators/linalg/bda/BdaResult.hpp>
#define PRINT_TIMERS_BRIDGE 0 #define PRINT_TIMERS_BRIDGE 0
@ -37,6 +38,7 @@ namespace Opm
using bda::BdaResult; using bda::BdaResult;
using bda::BdaSolver; using bda::BdaSolver;
using bda::BdaSolverStatus;
template <class BridgeMatrix, class BridgeVector, int block_size> template <class BridgeMatrix, class BridgeVector, int block_size>
BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string gpu_mode, int linear_solver_verbosity, int maxit, double tolerance) BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string gpu_mode, int linear_solver_verbosity, int maxit, double tolerance)
@ -44,14 +46,14 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string gpu_mod
if (gpu_mode.compare("cusparse") == 0) { if (gpu_mode.compare("cusparse") == 0) {
#if HAVE_CUDA #if HAVE_CUDA
use_gpu = true; use_gpu = true;
backend.reset(new bda::cusparseSolverBackend(linear_solver_verbosity, maxit, tolerance)); backend.reset(new bda::cusparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance));
#else #else
OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake"); OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake");
#endif #endif
} else if (gpu_mode.compare("opencl") == 0) { } else if (gpu_mode.compare("opencl") == 0) {
#if HAVE_OPENCL #if HAVE_OPENCL
use_gpu = true; use_gpu = true;
backend.reset(new bda::openclSolverBackend(linear_solver_verbosity, maxit, tolerance)); backend.reset(new bda::openclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance));
#else #else
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake"); OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
#endif #endif
@ -178,17 +180,17 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
///////////////////////// /////////////////////////
// actually solve // actually solve
typedef BdaSolver::BdaSolverStatus BdaSolverStatus; typedef BdaSolverStatus::Status Status;
// assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, cusparseSolver is expected to perform undefined behaviour // assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, cusparseSolver is expected to perform undefined behaviour
BdaSolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), wellContribs, result); Status status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), wellContribs, result);
switch(status) { switch(status) {
case BdaSolverStatus::BDA_SOLVER_SUCCESS: case Status::BDA_SOLVER_SUCCESS:
//OpmLog::info("BdaSolver converged"); //OpmLog::info("BdaSolver converged");
break; break;
case BdaSolverStatus::BDA_SOLVER_ANALYSIS_FAILED: case Status::BDA_SOLVER_ANALYSIS_FAILED:
OpmLog::warning("BdaSolver could not analyse level information of matrix, perhaps there is still a 0.0 on the diagonal of a block on the diagonal"); OpmLog::warning("BdaSolver could not analyse level information of matrix, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
break; break;
case BdaSolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED: case Status::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED:
OpmLog::warning("BdaSolver could not create preconditioner, perhaps there is still a 0.0 on the diagonal of a block on the diagonal"); OpmLog::warning("BdaSolver could not create preconditioner, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
break; break;
default: default:
@ -238,6 +240,6 @@ INSTANTIATE_BDA_FUNCTIONS(4);
#undef INSTANTIATE_BDA_FUNCTIONS #undef INSTANTIATE_BDA_FUNCTIONS
} } // namespace Opm

View File

@ -47,7 +47,7 @@ template <class BridgeMatrix, class BridgeVector, int block_size>
class BdaBridge class BdaBridge
{ {
private: private:
std::unique_ptr<bda::BdaSolver> backend; std::unique_ptr<bda::BdaSolver<block_size> > backend;
bool use_gpu = false; bool use_gpu = false;
public: public:

View File

@ -26,15 +26,19 @@
#include <sys/time.h> #include <sys/time.h>
#include <opm/simulators/linalg/bda/BdaResult.hpp> #include <opm/simulators/linalg/bda/BdaResult.hpp>
#include <opm/simulators/linalg/bda/BdaSolverStatus.hpp>
#include <opm/simulators/linalg/bda/WellContributions.hpp> #include <opm/simulators/linalg/bda/WellContributions.hpp>
namespace bda namespace bda
{ {
using Opm::WellContributions; using Opm::WellContributions;
typedef BdaSolverStatus::Status Status;
/// This class serves to simplify choosing between different backend solvers, such as cusparseSolver and openclSolver /// This class serves to simplify choosing between different backend solvers, such as cusparseSolver and openclSolver
/// This class is abstract, no instantiations can of it can be made, only of its children /// This class is abstract, no instantiations can of it can be made, only of its children
/// Without a default block_size value, the BILU0 class cannot use BdaSolver::second()
template <unsigned int block_size = 3>
class BdaSolver class BdaSolver
{ {
@ -55,26 +59,18 @@ namespace bda
int Nb; // number of blocked rows (Nb*block_size == N) int Nb; // number of blocked rows (Nb*block_size == N)
int nnz; // number of nonzeroes (scalars) int nnz; // number of nonzeroes (scalars)
int nnzb; // number of nonzero blocks (nnzb*block_size*block_size == nnz) int nnzb; // number of nonzero blocks (nnzb*block_size*block_size == nnz)
int block_size; // size of block
bool initialized = false; bool initialized = false;
public: public:
enum class BdaSolverStatus {
BDA_SOLVER_SUCCESS,
BDA_SOLVER_ANALYSIS_FAILED,
BDA_SOLVER_CREATE_PRECONDITIONER_FAILED,
BDA_SOLVER_UNKNOWN_ERROR
};
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_) {}; BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_) {};
/// Define virtual destructor, so that the derivedclass destructor will be called /// Define virtual destructor, so that the derivedclass destructor will be called
virtual ~BdaSolver() {}; virtual ~BdaSolver() {};
/// Define as pure virtual functions, so derivedclass must implement them /// Define as pure virtual functions, so derivedclass must implement them
virtual BdaSolverStatus solve_system(int N, int nnz, int dim, virtual Status solve_system(int N, int nnz, int dim,
double *vals, int *rows, int *cols, double *vals, int *rows, int *cols,
double *b, WellContributions& wellContribs, BdaResult &res) = 0; double *b, WellContributions& wellContribs, BdaResult &res) = 0;

View File

@ -0,0 +1,42 @@
/*
Copyright 2019 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_BDASOLVERSTATUS_HEADER_INCLUDED
#define OPM_BDASOLVERSTATUS_HEADER_INCLUDED
namespace bda
{
class BdaSolverStatus
{
public:
enum class Status {
BDA_SOLVER_SUCCESS,
BDA_SOLVER_ANALYSIS_FAILED,
BDA_SOLVER_CREATE_PRECONDITIONER_FAILED,
BDA_SOLVER_UNKNOWN_ERROR
};
}; // end class BdaSolverStatus
} // end namespace bda
#endif

View File

@ -51,13 +51,17 @@ const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
const cusparseOperation_t operation = CUSPARSE_OPERATION_NON_TRANSPOSE; const cusparseOperation_t operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW; const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;
cusparseSolverBackend::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_) : BdaSolver(verbosity_, maxit_, tolerance_) {}
cusparseSolverBackend::~cusparseSolverBackend() { template <unsigned int block_size>
cusparseSolverBackend<block_size>::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_) {}
template <unsigned int block_size>
cusparseSolverBackend<block_size>::~cusparseSolverBackend() {
finalize(); finalize();
} }
void cusparseSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) { template <unsigned int block_size>
void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
double t_total1, t_total2; double t_total1, t_total2;
int n = N; int n = N;
double rho = 1.0, rhop; double rho = 1.0, rhop;
@ -188,10 +192,10 @@ void cusparseSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaRe
} }
void cusparseSolverBackend::initialize(int N, int nnz, int dim) { template <unsigned int block_size>
void cusparseSolverBackend<block_size>::initialize(int N, int nnz, int dim) {
this->N = N; this->N = N;
this->nnz = nnz; this->nnz = nnz;
this->block_size = dim;
this->nnzb = nnz / block_size / block_size; this->nnzb = nnz / block_size / block_size;
Nb = (N + dim - 1) / dim; Nb = (N + dim - 1) / dim;
std::ostringstream out; std::ostringstream out;
@ -250,7 +254,8 @@ void cusparseSolverBackend::initialize(int N, int nnz, int dim) {
initialized = true; initialized = true;
} // end initialize() } // end initialize()
void cusparseSolverBackend::finalize() { template <unsigned int block_size>
void cusparseSolverBackend<block_size>::finalize() {
if (initialized) { if (initialized) {
cudaFree(d_x); cudaFree(d_x);
cudaFree(d_b); cudaFree(d_b);
@ -283,7 +288,8 @@ void cusparseSolverBackend::finalize() {
} // end finalize() } // end finalize()
void cusparseSolverBackend::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b) { template <unsigned int block_size>
void cusparseSolverBackend<block_size>::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b) {
double t1, t2; double t1, t2;
if (verbosity > 2) { if (verbosity > 2) {
@ -318,7 +324,8 @@ void cusparseSolverBackend::copy_system_to_gpu(double *vals, int *rows, int *col
// don't copy rowpointers and colindices, they stay the same // don't copy rowpointers and colindices, they stay the same
void cusparseSolverBackend::update_system_on_gpu(double *vals, int *rows, double *b) { template <unsigned int block_size>
void cusparseSolverBackend<block_size>::update_system_on_gpu(double *vals, int *rows, double *b) {
double t1, t2; double t1, t2;
if (verbosity > 2) { if (verbosity > 2) {
@ -350,12 +357,14 @@ void cusparseSolverBackend::update_system_on_gpu(double *vals, int *rows, double
} // end update_system_on_gpu() } // end update_system_on_gpu()
void cusparseSolverBackend::reset_prec_on_gpu() { template <unsigned int block_size>
void cusparseSolverBackend<block_size>::reset_prec_on_gpu() {
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream); cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
} }
bool cusparseSolverBackend::analyse_matrix() { template <unsigned int block_size>
bool cusparseSolverBackend<block_size>::analyse_matrix() {
int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize; int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
double t1, t2; double t1, t2;
@ -436,7 +445,8 @@ bool cusparseSolverBackend::analyse_matrix() {
return true; return true;
} // end analyse_matrix() } // end analyse_matrix()
bool cusparseSolverBackend::create_preconditioner() { template <unsigned int block_size>
bool cusparseSolverBackend<block_size>::create_preconditioner() {
double t1, t2; double t1, t2;
if (verbosity > 2) { if (verbosity > 2) {
@ -468,7 +478,8 @@ bool cusparseSolverBackend::create_preconditioner() {
} // end create_preconditioner() } // end create_preconditioner()
void cusparseSolverBackend::solve_system(WellContributions& wellContribs, BdaResult &res) { template <unsigned int block_size>
void cusparseSolverBackend<block_size>::solve_system(WellContributions& wellContribs, BdaResult &res) {
// actually solve // actually solve
gpu_pbicgstab(wellContribs, res); gpu_pbicgstab(wellContribs, res);
cudaStreamSynchronize(stream); cudaStreamSynchronize(stream);
@ -478,7 +489,8 @@ void cusparseSolverBackend::solve_system(WellContributions& wellContribs, BdaRes
// copy result to host memory // copy result to host memory
// caller must be sure that x is a valid array // caller must be sure that x is a valid array
void cusparseSolverBackend::get_result(double *x) { template <unsigned int block_size>
void cusparseSolverBackend<block_size>::get_result(double *x) {
double t1, t2; double t1, t2;
if (verbosity > 2) { if (verbosity > 2) {
@ -497,9 +509,10 @@ void cusparseSolverBackend::get_result(double *x) {
} // end get_result() } // end get_result()
typedef BdaSolver::BdaSolverStatus BdaSolverStatus; typedef BdaSolverStatus::Status Status;
BdaSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) { template <unsigned int block_size>
Status cusparseSolverBackend<block_size>::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
if (initialized == false) { if (initialized == false) {
initialize(N, nnz, dim); initialize(N, nnz, dim);
copy_system_to_gpu(vals, rows, cols, b); copy_system_to_gpu(vals, rows, cols, b);
@ -508,19 +521,29 @@ BdaSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, dou
} }
if (analysis_done == false) { if (analysis_done == false) {
if (!analyse_matrix()) { if (!analyse_matrix()) {
return BdaSolverStatus::BDA_SOLVER_ANALYSIS_FAILED; return Status::BDA_SOLVER_ANALYSIS_FAILED;
} }
} }
reset_prec_on_gpu(); reset_prec_on_gpu();
if (create_preconditioner()) { if (create_preconditioner()) {
solve_system(wellContribs, res); solve_system(wellContribs, res);
} else { } else {
return BdaSolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; return Status::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
} }
return BdaSolverStatus::BDA_SOLVER_SUCCESS; return Status::BDA_SOLVER_SUCCESS;
} }
} #define INSTANTIATE_BDA_FUNCTIONS(n) \
template cusparseSolverBackend<n>::cusparseSolverBackend(int, int, double); \
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace bda

View File

@ -32,7 +32,21 @@ namespace bda
{ {
/// This class implements a cusparse-based ilu0-bicgstab solver on GPU /// This class implements a cusparse-based ilu0-bicgstab solver on GPU
class cusparseSolverBackend : public BdaSolver { template <unsigned int block_size>
class cusparseSolverBackend : public BdaSolver<block_size> {
typedef BdaSolver<block_size> Base;
using Base::N;
using Base::Nb;
using Base::nnz;
using Base::nnzb;
using Base::verbosity;
using Base::maxit;
using Base::tolerance;
using Base::second;
using Base::initialized;
typedef BdaSolverStatus::Status Status;
private: private:
@ -120,7 +134,7 @@ public:
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result /// \param[inout] res summary of solver result
/// \return status code /// \return status code
BdaSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override; Status solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
/// Get resulting vector x after linear solve, also includes post processing if necessary /// Get resulting vector x after linear solve, also includes post processing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array /// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array

View File

@ -51,31 +51,28 @@ namespace bda
using Opm::OpmLog; using Opm::OpmLog;
openclSolverBackend::openclSolverBackend(int verbosity_, int maxit_, double tolerance_) : BdaSolver(verbosity_, maxit_, tolerance_) { template <unsigned int block_size>
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_) {
prec = new Preconditioner(LEVEL_SCHEDULING, GRAPH_COLORING, verbosity_); prec = new Preconditioner(LEVEL_SCHEDULING, GRAPH_COLORING, verbosity_);
} }
openclSolverBackend::~openclSolverBackend() { template <unsigned int block_size>
openclSolverBackend<block_size>::~openclSolverBackend() {
finalize(); finalize();
} }
// divide A by B, and round up: return (int)ceil(A/B) // divide A by B, and round up: return (int)ceil(A/B)
unsigned int openclSolverBackend::ceilDivision(const unsigned int A, const unsigned int B) template <unsigned int block_size>
unsigned int openclSolverBackend<block_size>::ceilDivision(const unsigned int A, const unsigned int B)
{ {
return A / B + (A % B > 0); return A / B + (A % B > 0);
} }
// just for verifying and debugging
bool equal(float a, float b)
{
const float tol_abs = 1e-2;
const float tol_rel = 1e-2;
return std::abs(a - b) <= std::max(tol_rel * std::max(std::abs(a), std::abs(b)), tol_abs);
}
double openclSolverBackend::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out) template <unsigned int block_size>
double openclSolverBackend<block_size>::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out)
{ {
double t1 = 0.0, t2 = 0.0; double t1 = 0.0, t2 = 0.0;
const unsigned int work_group_size = 1024; const unsigned int work_group_size = 1024;
@ -106,7 +103,8 @@ double openclSolverBackend::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out
return gpu_sum; return gpu_sum;
} }
double openclSolverBackend::norm_w(cl::Buffer in, cl::Buffer out) template <unsigned int block_size>
double openclSolverBackend<block_size>::norm_w(cl::Buffer in, cl::Buffer out)
{ {
double t1 = 0.0, t2 = 0.0; double t1 = 0.0, t2 = 0.0;
const unsigned int work_group_size = 1024; const unsigned int work_group_size = 1024;
@ -138,7 +136,8 @@ double openclSolverBackend::norm_w(cl::Buffer in, cl::Buffer out)
return gpu_norm; return gpu_norm;
} }
void openclSolverBackend::axpy_w(cl::Buffer in, const double a, cl::Buffer out) template <unsigned int block_size>
void openclSolverBackend<block_size>::axpy_w(cl::Buffer in, const double a, cl::Buffer out)
{ {
double t1 = 0.0, t2 = 0.0; double t1 = 0.0, t2 = 0.0;
const unsigned int work_group_size = 32; const unsigned int work_group_size = 32;
@ -159,7 +158,8 @@ void openclSolverBackend::axpy_w(cl::Buffer in, const double a, cl::Buffer out)
} }
} }
void openclSolverBackend::custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, const double omega, const double beta) template <unsigned int block_size>
void openclSolverBackend<block_size>::custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, const double omega, const double beta)
{ {
double t1 = 0.0, t2 = 0.0; double t1 = 0.0, t2 = 0.0;
const unsigned int work_group_size = 32; const unsigned int work_group_size = 32;
@ -180,7 +180,8 @@ void openclSolverBackend::custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, con
} }
} }
void openclSolverBackend::spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b) template <unsigned int block_size>
void openclSolverBackend<block_size>::spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b)
{ {
double t1 = 0.0, t2 = 0.0; double t1 = 0.0, t2 = 0.0;
const unsigned int work_group_size = 32; const unsigned int work_group_size = 32;
@ -203,7 +204,8 @@ void openclSolverBackend::spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::B
} }
void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) { template <unsigned int block_size>
void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
float it; float it;
double rho, rhop, beta, alpha, omega, tmp1, tmp2; double rho, rhop, beta, alpha, omega, tmp1, tmp2;
@ -360,10 +362,10 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResu
} }
void openclSolverBackend::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) { template <unsigned int block_size>
void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) {
this->N = N_; this->N = N_;
this->nnz = nnz_; this->nnz = nnz_;
this->block_size = dim;
this->nnzb = nnz_ / block_size / block_size; this->nnzb = nnz_ / block_size / block_size;
Nb = (N + dim - 1) / dim; Nb = (N + dim - 1) / dim;
@ -542,7 +544,9 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, double *vals, in
initialized = true; initialized = true;
} // end initialize() } // end initialize()
void openclSolverBackend::finalize() {
template <unsigned int block_size>
void openclSolverBackend<block_size>::finalize() {
delete[] rb; delete[] rb;
delete[] tmp; delete[] tmp;
#if COPY_ROW_BY_ROW #if COPY_ROW_BY_ROW
@ -551,7 +555,8 @@ void openclSolverBackend::finalize() {
} // end finalize() } // end finalize()
void openclSolverBackend::copy_system_to_gpu() { template <unsigned int block_size>
void openclSolverBackend<block_size>::copy_system_to_gpu() {
double t1 = 0.0, t2 = 0.0; double t1 = 0.0, t2 = 0.0;
if (verbosity > 2) { if (verbosity > 2) {
@ -588,7 +593,8 @@ void openclSolverBackend::copy_system_to_gpu() {
// don't copy rowpointers and colindices, they stay the same // don't copy rowpointers and colindices, they stay the same
void openclSolverBackend::update_system_on_gpu() { template <unsigned int block_size>
void openclSolverBackend<block_size>::update_system_on_gpu() {
double t1 = 0.0, t2 = 0.0; double t1 = 0.0, t2 = 0.0;
if (verbosity > 2) { if (verbosity > 2) {
@ -622,7 +628,8 @@ void openclSolverBackend::update_system_on_gpu() {
} // end update_system_on_gpu() } // end update_system_on_gpu()
bool openclSolverBackend::analyse_matrix() { template <unsigned int block_size>
bool openclSolverBackend<block_size>::analyse_matrix() {
double t1 = 0.0, t2 = 0.0; double t1 = 0.0, t2 = 0.0;
@ -630,7 +637,7 @@ bool openclSolverBackend::analyse_matrix() {
t1 = second(); t1 = second();
} }
bool success = prec->init(mat, block_size); bool success = prec->init(mat);
int work_group_size = 32; int work_group_size = 32;
int num_work_groups = ceilDivision(N, work_group_size); int num_work_groups = ceilDivision(N, work_group_size);
int total_work_items = num_work_groups * work_group_size; int total_work_items = num_work_groups * work_group_size;
@ -654,7 +661,8 @@ bool openclSolverBackend::analyse_matrix() {
} // end analyse_matrix() } // end analyse_matrix()
void openclSolverBackend::update_system(double *vals, double *b) { template <unsigned int block_size>
void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
double t1 = 0.0, t2 = 0.0; double t1 = 0.0, t2 = 0.0;
if (verbosity > 2) { if (verbosity > 2) {
t1 = second(); t1 = second();
@ -673,7 +681,8 @@ void openclSolverBackend::update_system(double *vals, double *b) {
} // end update_system() } // end update_system()
bool openclSolverBackend::create_preconditioner() { template <unsigned int block_size>
bool openclSolverBackend<block_size>::create_preconditioner() {
double t1 = 0.0, t2 = 0.0; double t1 = 0.0, t2 = 0.0;
if (verbosity > 2) { if (verbosity > 2) {
@ -692,7 +701,8 @@ bool openclSolverBackend::create_preconditioner() {
} // end create_preconditioner() } // end create_preconditioner()
void openclSolverBackend::solve_system(WellContributions& wellContribs, BdaResult &res) { template <unsigned int block_size>
void openclSolverBackend<block_size>::solve_system(WellContributions& wellContribs, BdaResult &res) {
// actually solve // actually solve
double t1 = 0.0, t2 = 0.0; double t1 = 0.0, t2 = 0.0;
if (verbosity > 2) { if (verbosity > 2) {
@ -713,7 +723,8 @@ void openclSolverBackend::solve_system(WellContributions& wellContribs, BdaResul
// copy result to host memory // copy result to host memory
// caller must be sure that x is a valid array // caller must be sure that x is a valid array
void openclSolverBackend::get_result(double *x) { template <unsigned int block_size>
void openclSolverBackend<block_size>::get_result(double *x) {
double t1 = 0.0, t2 = 0.0; double t1 = 0.0, t2 = 0.0;
if (verbosity > 2) { if (verbosity > 2) {
@ -732,32 +743,43 @@ void openclSolverBackend::get_result(double *x) {
} // end get_result() } // end get_result()
typedef BdaSolverStatus::Status Status;
typedef BdaSolver::BdaSolverStatus BdaSolverStatus; template <unsigned int block_size>
Status openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
BdaSolverStatus openclSolverBackend::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
if (initialized == false) { if (initialized == false) {
initialize(N_, nnz_, dim, vals, rows, cols); initialize(N_, nnz_, dim, vals, rows, cols);
if (analysis_done == false) { if (analysis_done == false) {
if (!analyse_matrix()) { if (!analyse_matrix()) {
return BdaSolverStatus::BDA_SOLVER_ANALYSIS_FAILED; return Status::BDA_SOLVER_ANALYSIS_FAILED;
} }
} }
update_system(vals, b); update_system(vals, b);
if (!create_preconditioner()) { if (!create_preconditioner()) {
return BdaSolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; return Status::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
} }
copy_system_to_gpu(); copy_system_to_gpu();
} else { } else {
update_system(vals, b); update_system(vals, b);
if (!create_preconditioner()) { if (!create_preconditioner()) {
return BdaSolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; return Status::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
} }
update_system_on_gpu(); update_system_on_gpu();
} }
solve_system(wellContribs, res); solve_system(wellContribs, res);
return BdaSolverStatus::BDA_SOLVER_SUCCESS; return Status::BDA_SOLVER_SUCCESS;
} }
}
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template openclSolverBackend<n>::openclSolverBackend(int, int, double); \
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace bda

View File

@ -31,15 +31,29 @@
#include <opm/simulators/linalg/bda/WellContributions.hpp> #include <opm/simulators/linalg/bda/WellContributions.hpp>
#include <opm/simulators/linalg/bda/BILU0.hpp> #include <opm/simulators/linalg/bda/BILU0.hpp>
typedef bda::BILU0 Preconditioner;
namespace bda namespace bda
{ {
/// This class implements a opencl-based ilu0-bicgstab solver on GPU /// This class implements a opencl-based ilu0-bicgstab solver on GPU
class openclSolverBackend : public BdaSolver template <unsigned int block_size>
class openclSolverBackend : public BdaSolver<block_size>
{ {
typedef BdaSolver<block_size> Base;
typedef BILU0<block_size> Preconditioner;
using Base::N;
using Base::Nb;
using Base::nnz;
using Base::nnzb;
using Base::verbosity;
using Base::maxit;
using Base::tolerance;
using Base::second;
using Base::initialized;
typedef BdaSolverStatus::Status Status;
private: private:
double *rb; // reordered b vector, the matrix is reordered, so b must also be double *rb; // reordered b vector, the matrix is reordered, so b must also be
@ -182,7 +196,7 @@ public:
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A /// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result /// \param[inout] res summary of solver result
/// \return status code /// \return status code
BdaSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override; Status solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
/// Get result after linear solve, and peform postprocessing if necessary /// Get result after linear solve, and peform postprocessing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array /// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array