Added block_size template to BdaSolvers and BILU0

This commit is contained in:
T.D. (Tongdong) Qiu 2020-06-24 20:09:14 +02:00
parent 2a48f5f63f
commit 98ddf47b44
11 changed files with 232 additions and 91 deletions

View File

@ -150,6 +150,7 @@ list (APPEND PUBLIC_HEADER_FILES
opm/simulators/linalg/bda/BdaBridge.hpp
opm/simulators/linalg/bda/BdaResult.hpp
opm/simulators/linalg/bda/BdaSolver.hpp
opm/simulators/linalg/bda/BdaSolverStatus.hpp
opm/simulators/linalg/bda/BILU0.hpp
opm/simulators/linalg/bda/BlockedMatrix.hpp
opm/simulators/linalg/bda/cuda_header.hpp

View File

@ -35,12 +35,12 @@ namespace bda
using Opm::OpmLog;
BILU0::BILU0(bool level_scheduling_, bool graph_coloring_, int verbosity_) :
// define 'second' as 'BdaSolver<>::second', this allows usage of the second() function for timing
// typedefs cannot handle templates
const auto second = BdaSolver<>::second;
template <unsigned int block_size>
BILU0<block_size>::BILU0(bool level_scheduling_, bool graph_coloring_, int verbosity_) :
level_scheduling(level_scheduling_), graph_coloring(graph_coloring_), verbosity(verbosity_)
{
if (level_scheduling == graph_coloring) {
@ -49,7 +49,8 @@ namespace bda
double t1 = second();
}
BILU0::~BILU0()
template <unsigned int block_size>
BILU0<block_size>::~BILU0()
{
delete[] invDiagVals;
delete[] diagIndex;
@ -62,7 +63,8 @@ namespace bda
freeBlockedMatrix(&rMat);
}
bool BILU0::init(BlockedMatrix *mat, unsigned int block_size)
template <unsigned int block_size>
bool BILU0<block_size>::init(BlockedMatrix *mat)
{
double t1 = 0.0, t2 = 0.0;
BlockedMatrix *CSCmat = nullptr;
@ -71,7 +73,6 @@ namespace bda
this->Nb = mat->Nb;
this->nnz = mat->nnzbs * block_size * block_size;
this->nnzbs = mat->nnzbs;
this->block_size = block_size;
toOrder = new int[N];
fromOrder = new int[N];
@ -160,7 +161,8 @@ namespace bda
} // end init()
bool BILU0::create_preconditioner(BlockedMatrix *mat)
template <unsigned int block_size>
bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
{
double t1 = 0.0, t2 = 0.0;
if (verbosity >= 3){
@ -306,13 +308,13 @@ namespace bda
// kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
// however, if individual kernel calls are timed, waiting for events is needed
// behavior on other GPUs is untested
void BILU0::apply(cl::Buffer& x, cl::Buffer& y)
template <unsigned int block_size>
void BILU0<block_size>::apply(cl::Buffer& x, cl::Buffer& y)
{
double t1 = 0.0, t2 = 0.0;
if (verbosity >= 3) {
t1 = second();
}
const unsigned int block_size = 3;
cl::Event event;
for(unsigned int color = 0; color < numColors; ++color){
@ -334,18 +336,22 @@ namespace bda
}
void BILU0::setOpenCLContext(cl::Context *context){
/// Store the given OpenCL context pointer in this object
/// \param[in] context    pointer that is saved as-is in this->context
template <unsigned int block_size>
void BILU0<block_size>::setOpenCLContext(cl::Context *context){
this->context = context;
}
void BILU0::setOpenCLQueue(cl::CommandQueue *queue){
/// Store the given OpenCL command queue pointer in this object
/// \param[in] queue    pointer that is saved as-is in this->queue
template <unsigned int block_size>
void BILU0<block_size>::setOpenCLQueue(cl::CommandQueue *queue){
this->queue = queue;
}
void BILU0::setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group){
/// Store the kernel launch parameters in the members of the same name
/// \param[in] work_group_size        saved in this->work_group_size
/// \param[in] total_work_items       saved in this->total_work_items
/// \param[in] lmem_per_work_group    saved in this->lmem_per_work_group
template <unsigned int block_size>
void BILU0<block_size>::setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group){
this->work_group_size = work_group_size;
this->total_work_items = total_work_items;
this->lmem_per_work_group = lmem_per_work_group;
}
void BILU0::setKernels(
template <unsigned int block_size>
void BILU0<block_size>::setKernels(
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1_,
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2_
){
@ -353,6 +359,27 @@ namespace bda
this->ILU_apply2 = ILU_apply2_;
}
// Explicit instantiation of the BILU0 member functions listed below, for block sizes 1 to 4.
// The member definitions live in this translation unit, so each one named in the macro is
// instantiated here for every supported block size.
// NOTE(review): the destructor BILU0<n>::~BILU0() is not part of this list - confirm whether
// it needs an explicit instantiation as well.
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template BILU0<n>::BILU0(bool, bool, int); \
template bool BILU0<n>::init(BlockedMatrix*); \
template bool BILU0<n>::create_preconditioner(BlockedMatrix*); \
template void BILU0<n>::apply(cl::Buffer& x, cl::Buffer& y); \
template void BILU0<n>::setOpenCLContext(cl::Context*); \
template void BILU0<n>::setOpenCLQueue(cl::CommandQueue*); \
template void BILU0<n>::setKernelParameters(unsigned int, unsigned int, unsigned int); \
template void BILU0<n>::setKernels( \
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> * \
);
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
// the macro is only needed for the four instantiations above
#undef INSTANTIATE_BDA_FUNCTIONS
} // end namespace bda

View File

@ -36,6 +36,7 @@ namespace bda
/// This class implements a Blocked ILU0 preconditioner
/// The decomposition is done on CPU, and reorders the rows of the matrix
template <unsigned int block_size>
class BILU0
{
@ -44,7 +45,6 @@ namespace bda
int Nb; // number of blockrows of the matrix
int nnz; // number of nonzeroes of the matrix (scalar)
int nnzbs; // number of blocks of the matrix
unsigned int block_size;
BlockedMatrix *LMat, *UMat, *LUMat;
BlockedMatrix *rMat = nullptr; // only used with PAR_SIM
Block *invDiagVals;
@ -79,7 +79,7 @@ namespace bda
~BILU0();
// analysis
bool init(BlockedMatrix *mat, unsigned int block_size);
bool init(BlockedMatrix *mat);
// ilu_decomposition
bool create_preconditioner(BlockedMatrix *mat);

View File

@ -26,6 +26,7 @@
#include <opm/material/common/Unused.hpp>
#include <opm/simulators/linalg/bda/BdaBridge.hpp>
#include <opm/simulators/linalg/bda/BdaSolverStatus.hpp>
#include <opm/simulators/linalg/bda/BdaResult.hpp>
#define PRINT_TIMERS_BRIDGE 0
@ -37,6 +38,7 @@ namespace Opm
using bda::BdaResult;
using bda::BdaSolver;
using bda::BdaSolverStatus;
template <class BridgeMatrix, class BridgeVector, int block_size>
BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string gpu_mode, int linear_solver_verbosity, int maxit, double tolerance)
@ -44,14 +46,14 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string gpu_mod
if (gpu_mode.compare("cusparse") == 0) {
#if HAVE_CUDA
use_gpu = true;
backend.reset(new bda::cusparseSolverBackend(linear_solver_verbosity, maxit, tolerance));
backend.reset(new bda::cusparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance));
#else
OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake");
#endif
} else if (gpu_mode.compare("opencl") == 0) {
#if HAVE_OPENCL
use_gpu = true;
backend.reset(new bda::openclSolverBackend(linear_solver_verbosity, maxit, tolerance));
backend.reset(new bda::openclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance));
#else
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
#endif
@ -178,17 +180,17 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
/////////////////////////
// actually solve
typedef BdaSolver::BdaSolverStatus BdaSolverStatus;
typedef BdaSolverStatus::Status Status;
// assume that the underlying data (nonzeroes) of mat (Dune::BCRSMatrix) are contiguous; if they are not, cusparseSolver will exhibit undefined behaviour
BdaSolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), wellContribs, result);
Status status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), wellContribs, result);
switch(status) {
case BdaSolverStatus::BDA_SOLVER_SUCCESS:
case Status::BDA_SOLVER_SUCCESS:
//OpmLog::info("BdaSolver converged");
break;
case BdaSolverStatus::BDA_SOLVER_ANALYSIS_FAILED:
case Status::BDA_SOLVER_ANALYSIS_FAILED:
OpmLog::warning("BdaSolver could not analyse level information of matrix, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
break;
case BdaSolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED:
case Status::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED:
OpmLog::warning("BdaSolver could not create preconditioner, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
break;
default:
@ -238,6 +240,6 @@ INSTANTIATE_BDA_FUNCTIONS(4);
#undef INSTANTIATE_BDA_FUNCTIONS
}
} // namespace Opm

View File

@ -47,7 +47,7 @@ template <class BridgeMatrix, class BridgeVector, int block_size>
class BdaBridge
{
private:
std::unique_ptr<bda::BdaSolver> backend;
std::unique_ptr<bda::BdaSolver<block_size> > backend;
bool use_gpu = false;
public:

View File

@ -26,15 +26,19 @@
#include <sys/time.h>
#include <opm/simulators/linalg/bda/BdaResult.hpp>
#include <opm/simulators/linalg/bda/BdaSolverStatus.hpp>
#include <opm/simulators/linalg/bda/WellContributions.hpp>
namespace bda
{
using Opm::WellContributions;
typedef BdaSolverStatus::Status Status;
/// This class serves to simplify choosing between different backend solvers, such as cusparseSolver and openclSolver
/// This class is abstract: it cannot be instantiated itself, only its derived classes can
/// Without a default block_size value, the BILU0 class cannot use BdaSolver::second()
template <unsigned int block_size = 3>
class BdaSolver
{
@ -55,26 +59,18 @@ namespace bda
int Nb; // number of blocked rows (Nb*block_size == N)
int nnz; // number of nonzeroes (scalars)
int nnzb; // number of nonzero blocks (nnzb*block_size*block_size == nnz)
int block_size; // size of block
bool initialized = false;
public:
enum class BdaSolverStatus {
BDA_SOLVER_SUCCESS,
BDA_SOLVER_ANALYSIS_FAILED,
BDA_SOLVER_CREATE_PRECONDITIONER_FAILED,
BDA_SOLVER_UNKNOWN_ERROR
};
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_) {};
/// Define virtual destructor, so that the derived class destructor will be called
virtual ~BdaSolver() {};
/// Define as pure virtual functions, so derived classes must implement them
virtual BdaSolverStatus solve_system(int N, int nnz, int dim,
virtual Status solve_system(int N, int nnz, int dim,
double *vals, int *rows, int *cols,
double *b, WellContributions& wellContribs, BdaResult &res) = 0;

View File

@ -0,0 +1,42 @@
/*
Copyright 2019 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_BDASOLVERSTATUS_HEADER_INCLUDED
#define OPM_BDASOLVERSTATUS_HEADER_INCLUDED
namespace bda
{
/// Holder class for the status codes used as return type of BdaSolver::solve_system()
/// It has no members besides the nested enum
class BdaSolverStatus
{
public:
/// Outcome of a call to solve_system()
enum class Status {
BDA_SOLVER_SUCCESS,                       // the system was solved
BDA_SOLVER_ANALYSIS_FAILED,               // analysis of the matrix failed
BDA_SOLVER_CREATE_PRECONDITIONER_FAILED,  // the preconditioner could not be created
BDA_SOLVER_UNKNOWN_ERROR                  // NOTE(review): presumably a catch-all for other failures - confirm against the solvers
};
}; // end class BdaSolverStatus
} // end namespace bda
#endif

View File

@ -51,13 +51,17 @@ const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
const cusparseOperation_t operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;
cusparseSolverBackend::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_) : BdaSolver(verbosity_, maxit_, tolerance_) {}
cusparseSolverBackend::~cusparseSolverBackend() {
/// Construct a cusparseSolverBackend, all arguments are forwarded unchanged to BdaSolver<block_size>
/// \param[in] verbosity_    passed to the BdaSolver constructor as verbosity
/// \param[in] maxit_        passed to the BdaSolver constructor as maximum number of iterations
/// \param[in] tolerance_    passed to the BdaSolver constructor as tolerance
template <unsigned int block_size>
cusparseSolverBackend<block_size>::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_) {}
/// Destructor, all cleanup is delegated to finalize()
template <unsigned int block_size>
cusparseSolverBackend<block_size>::~cusparseSolverBackend() {
finalize();
}
void cusparseSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
double t_total1, t_total2;
int n = N;
double rho = 1.0, rhop;
@ -188,10 +192,10 @@ void cusparseSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaRe
}
void cusparseSolverBackend::initialize(int N, int nnz, int dim) {
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::initialize(int N, int nnz, int dim) {
this->N = N;
this->nnz = nnz;
this->block_size = dim;
this->nnzb = nnz / block_size / block_size;
Nb = (N + dim - 1) / dim;
std::ostringstream out;
@ -250,7 +254,8 @@ void cusparseSolverBackend::initialize(int N, int nnz, int dim) {
initialized = true;
} // end initialize()
void cusparseSolverBackend::finalize() {
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::finalize() {
if (initialized) {
cudaFree(d_x);
cudaFree(d_b);
@ -283,7 +288,8 @@ void cusparseSolverBackend::finalize() {
} // end finalize()
void cusparseSolverBackend::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b) {
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b) {
double t1, t2;
if (verbosity > 2) {
@ -318,7 +324,8 @@ void cusparseSolverBackend::copy_system_to_gpu(double *vals, int *rows, int *col
// don't copy rowpointers and colindices, they stay the same
void cusparseSolverBackend::update_system_on_gpu(double *vals, int *rows, double *b) {
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::update_system_on_gpu(double *vals, int *rows, double *b) {
double t1, t2;
if (verbosity > 2) {
@ -350,12 +357,14 @@ void cusparseSolverBackend::update_system_on_gpu(double *vals, int *rows, double
} // end update_system_on_gpu()
void cusparseSolverBackend::reset_prec_on_gpu() {
/// Overwrite d_mVals with the contents of d_bVals
/// Issues a device-to-device cudaMemcpyAsync of nnz doubles on 'stream'
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::reset_prec_on_gpu() {
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
}
bool cusparseSolverBackend::analyse_matrix() {
template <unsigned int block_size>
bool cusparseSolverBackend<block_size>::analyse_matrix() {
int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
double t1, t2;
@ -436,7 +445,8 @@ bool cusparseSolverBackend::analyse_matrix() {
return true;
} // end analyse_matrix()
bool cusparseSolverBackend::create_preconditioner() {
template <unsigned int block_size>
bool cusparseSolverBackend<block_size>::create_preconditioner() {
double t1, t2;
if (verbosity > 2) {
@ -468,7 +478,8 @@ bool cusparseSolverBackend::create_preconditioner() {
} // end create_preconditioner()
void cusparseSolverBackend::solve_system(WellContributions& wellContribs, BdaResult &res) {
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::solve_system(WellContributions& wellContribs, BdaResult &res) {
// actually solve
gpu_pbicgstab(wellContribs, res);
cudaStreamSynchronize(stream);
@ -478,7 +489,8 @@ void cusparseSolverBackend::solve_system(WellContributions& wellContribs, BdaRes
// copy result to host memory
// caller must be sure that x is a valid array
void cusparseSolverBackend::get_result(double *x) {
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::get_result(double *x) {
double t1, t2;
if (verbosity > 2) {
@ -497,9 +509,10 @@ void cusparseSolverBackend::get_result(double *x) {
} // end get_result()
typedef BdaSolver::BdaSolverStatus BdaSolverStatus;
typedef BdaSolverStatus::Status Status;
BdaSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
template <unsigned int block_size>
Status cusparseSolverBackend<block_size>::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
if (initialized == false) {
initialize(N, nnz, dim);
copy_system_to_gpu(vals, rows, cols, b);
@ -508,19 +521,29 @@ BdaSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, dou
}
if (analysis_done == false) {
if (!analyse_matrix()) {
return BdaSolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
return Status::BDA_SOLVER_ANALYSIS_FAILED;
}
}
reset_prec_on_gpu();
if (create_preconditioner()) {
solve_system(wellContribs, res);
} else {
return BdaSolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
return Status::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
}
return BdaSolverStatus::BDA_SOLVER_SUCCESS;
return Status::BDA_SOLVER_SUCCESS;
}
}
// Explicitly instantiate the cusparseSolverBackend constructor for block sizes 1 to 4.
// The last line of the macro body must not end in a backslash: a trailing backslash
// splices whatever line follows (here the first invocation) into the macro body.
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template cusparseSolverBackend<n>::cusparseSolverBackend(int, int, double);
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
// the macro is only needed for the four instantiations above
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace bda

View File

@ -32,7 +32,21 @@ namespace bda
{
/// This class implements a cusparse-based ilu0-bicgstab solver on GPU
class cusparseSolverBackend : public BdaSolver {
template <unsigned int block_size>
class cusparseSolverBackend : public BdaSolver<block_size> {
typedef BdaSolver<block_size> Base;
using Base::N;
using Base::Nb;
using Base::nnz;
using Base::nnzb;
using Base::verbosity;
using Base::maxit;
using Base::tolerance;
using Base::second;
using Base::initialized;
typedef BdaSolverStatus::Status Status;
private:
@ -120,7 +134,7 @@ public:
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
/// \return status code
BdaSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
Status solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
/// Get resulting vector x after linear solve, also includes post processing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array

View File

@ -51,31 +51,28 @@ namespace bda
using Opm::OpmLog;
openclSolverBackend::openclSolverBackend(int verbosity_, int maxit_, double tolerance_) : BdaSolver(verbosity_, maxit_, tolerance_) {
/// Construct an openclSolverBackend
/// The arguments are forwarded to BdaSolver<block_size>, and a Preconditioner is allocated with new,
/// using LEVEL_SCHEDULING, GRAPH_COLORING and verbosity_ as its constructor arguments
/// NOTE(review): the matching delete of prec is not visible here - confirm that it is released
/// \param[in] verbosity_    passed to the BdaSolver and Preconditioner constructors
/// \param[in] maxit_        passed to the BdaSolver constructor as maximum number of iterations
/// \param[in] tolerance_    passed to the BdaSolver constructor as tolerance
template <unsigned int block_size>
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_) {
prec = new Preconditioner(LEVEL_SCHEDULING, GRAPH_COLORING, verbosity_);
}
openclSolverBackend::~openclSolverBackend() {
/// Destructor, all cleanup is delegated to finalize()
template <unsigned int block_size>
openclSolverBackend<block_size>::~openclSolverBackend() {
finalize();
}
// divide A by B, and round up: return (int)ceil(A/B)
unsigned int openclSolverBackend::ceilDivision(const unsigned int A, const unsigned int B)
template <unsigned int block_size>
unsigned int openclSolverBackend<block_size>::ceilDivision(const unsigned int A, const unsigned int B)
{
    // B must be non-zero, the division is not checked
    // quotient plus a remainder flag: no sum of A and B is formed, so nothing can wrap around
    const unsigned int quotient = A / B;
    const unsigned int has_remainder = (A % B != 0) ? 1u : 0u;
    return quotient + has_remainder;
}
// Approximate float comparison, only meant for verifying and debugging.
// Two values count as equal when they differ by at most the larger of
// an absolute tolerance (1e-2) and a relative tolerance (1e-2 times the larger magnitude).
bool equal(float a, float b)
{
    const float abs_tolerance = 1e-2;
    const float rel_tolerance = 1e-2;
    const float larger_magnitude = std::max(std::abs(a), std::abs(b));
    const float allowed = std::max(rel_tolerance * larger_magnitude, abs_tolerance);
    const float difference = std::abs(a - b);
    return difference <= allowed;
}
double openclSolverBackend::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out)
template <unsigned int block_size>
double openclSolverBackend<block_size>::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out)
{
double t1 = 0.0, t2 = 0.0;
const unsigned int work_group_size = 1024;
@ -106,7 +103,8 @@ double openclSolverBackend::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out
return gpu_sum;
}
double openclSolverBackend::norm_w(cl::Buffer in, cl::Buffer out)
template <unsigned int block_size>
double openclSolverBackend<block_size>::norm_w(cl::Buffer in, cl::Buffer out)
{
double t1 = 0.0, t2 = 0.0;
const unsigned int work_group_size = 1024;
@ -138,7 +136,8 @@ double openclSolverBackend::norm_w(cl::Buffer in, cl::Buffer out)
return gpu_norm;
}
void openclSolverBackend::axpy_w(cl::Buffer in, const double a, cl::Buffer out)
template <unsigned int block_size>
void openclSolverBackend<block_size>::axpy_w(cl::Buffer in, const double a, cl::Buffer out)
{
double t1 = 0.0, t2 = 0.0;
const unsigned int work_group_size = 32;
@ -159,7 +158,8 @@ void openclSolverBackend::axpy_w(cl::Buffer in, const double a, cl::Buffer out)
}
}
void openclSolverBackend::custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, const double omega, const double beta)
template <unsigned int block_size>
void openclSolverBackend<block_size>::custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, const double omega, const double beta)
{
double t1 = 0.0, t2 = 0.0;
const unsigned int work_group_size = 32;
@ -180,7 +180,8 @@ void openclSolverBackend::custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, con
}
}
void openclSolverBackend::spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b)
template <unsigned int block_size>
void openclSolverBackend<block_size>::spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b)
{
double t1 = 0.0, t2 = 0.0;
const unsigned int work_group_size = 32;
@ -203,7 +204,8 @@ void openclSolverBackend::spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::B
}
void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
template <unsigned int block_size>
void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
float it;
double rho, rhop, beta, alpha, omega, tmp1, tmp2;
@ -360,10 +362,10 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResu
}
void openclSolverBackend::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) {
template <unsigned int block_size>
void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) {
this->N = N_;
this->nnz = nnz_;
this->block_size = dim;
this->nnzb = nnz_ / block_size / block_size;
Nb = (N + dim - 1) / dim;
@ -542,7 +544,9 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, double *vals, in
initialized = true;
} // end initialize()
void openclSolverBackend::finalize() {
template <unsigned int block_size>
void openclSolverBackend<block_size>::finalize() {
delete[] rb;
delete[] tmp;
#if COPY_ROW_BY_ROW
@ -551,7 +555,8 @@ void openclSolverBackend::finalize() {
} // end finalize()
void openclSolverBackend::copy_system_to_gpu() {
template <unsigned int block_size>
void openclSolverBackend<block_size>::copy_system_to_gpu() {
double t1 = 0.0, t2 = 0.0;
if (verbosity > 2) {
@ -588,7 +593,8 @@ void openclSolverBackend::copy_system_to_gpu() {
// don't copy rowpointers and colindices, they stay the same
void openclSolverBackend::update_system_on_gpu() {
template <unsigned int block_size>
void openclSolverBackend<block_size>::update_system_on_gpu() {
double t1 = 0.0, t2 = 0.0;
if (verbosity > 2) {
@ -622,7 +628,8 @@ void openclSolverBackend::update_system_on_gpu() {
} // end update_system_on_gpu()
bool openclSolverBackend::analyse_matrix() {
template <unsigned int block_size>
bool openclSolverBackend<block_size>::analyse_matrix() {
double t1 = 0.0, t2 = 0.0;
@ -630,7 +637,7 @@ bool openclSolverBackend::analyse_matrix() {
t1 = second();
}
bool success = prec->init(mat, block_size);
bool success = prec->init(mat);
int work_group_size = 32;
int num_work_groups = ceilDivision(N, work_group_size);
int total_work_items = num_work_groups * work_group_size;
@ -654,7 +661,8 @@ bool openclSolverBackend::analyse_matrix() {
} // end analyse_matrix()
void openclSolverBackend::update_system(double *vals, double *b) {
template <unsigned int block_size>
void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
double t1 = 0.0, t2 = 0.0;
if (verbosity > 2) {
t1 = second();
@ -673,7 +681,8 @@ void openclSolverBackend::update_system(double *vals, double *b) {
} // end update_system()
bool openclSolverBackend::create_preconditioner() {
template <unsigned int block_size>
bool openclSolverBackend<block_size>::create_preconditioner() {
double t1 = 0.0, t2 = 0.0;
if (verbosity > 2) {
@ -692,7 +701,8 @@ bool openclSolverBackend::create_preconditioner() {
} // end create_preconditioner()
void openclSolverBackend::solve_system(WellContributions& wellContribs, BdaResult &res) {
template <unsigned int block_size>
void openclSolverBackend<block_size>::solve_system(WellContributions& wellContribs, BdaResult &res) {
// actually solve
double t1 = 0.0, t2 = 0.0;
if (verbosity > 2) {
@ -713,7 +723,8 @@ void openclSolverBackend::solve_system(WellContributions& wellContribs, BdaResul
// copy result to host memory
// caller must be sure that x is a valid array
void openclSolverBackend::get_result(double *x) {
template <unsigned int block_size>
void openclSolverBackend<block_size>::get_result(double *x) {
double t1 = 0.0, t2 = 0.0;
if (verbosity > 2) {
@ -732,32 +743,43 @@ void openclSolverBackend::get_result(double *x) {
} // end get_result()
typedef BdaSolverStatus::Status Status;
typedef BdaSolver::BdaSolverStatus BdaSolverStatus;
BdaSolverStatus openclSolverBackend::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
template <unsigned int block_size>
Status openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
if (initialized == false) {
initialize(N_, nnz_, dim, vals, rows, cols);
if (analysis_done == false) {
if (!analyse_matrix()) {
return BdaSolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
return Status::BDA_SOLVER_ANALYSIS_FAILED;
}
}
update_system(vals, b);
if (!create_preconditioner()) {
return BdaSolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
return Status::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
}
copy_system_to_gpu();
} else {
update_system(vals, b);
if (!create_preconditioner()) {
return BdaSolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
return Status::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
}
update_system_on_gpu();
}
solve_system(wellContribs, res);
return BdaSolverStatus::BDA_SOLVER_SUCCESS;
return Status::BDA_SOLVER_SUCCESS;
}
}
// Explicitly instantiate the openclSolverBackend constructor for block sizes 1 to 4.
// The last line of the macro body must not end in a backslash: a trailing backslash
// splices whatever line follows (here the first invocation) into the macro body.
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template openclSolverBackend<n>::openclSolverBackend(int, int, double);
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
// the macro is only needed for the four instantiations above
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace bda

View File

@ -31,15 +31,29 @@
#include <opm/simulators/linalg/bda/WellContributions.hpp>
#include <opm/simulators/linalg/bda/BILU0.hpp>
typedef bda::BILU0 Preconditioner;
namespace bda
{
/// This class implements an OpenCL-based ilu0-bicgstab solver on GPU
class openclSolverBackend : public BdaSolver
template <unsigned int block_size>
class openclSolverBackend : public BdaSolver<block_size>
{
typedef BdaSolver<block_size> Base;
typedef BILU0<block_size> Preconditioner;
using Base::N;
using Base::Nb;
using Base::nnz;
using Base::nnzb;
using Base::verbosity;
using Base::maxit;
using Base::tolerance;
using Base::second;
using Base::initialized;
typedef BdaSolverStatus::Status Status;
private:
double *rb; // reordered b vector, the matrix is reordered, so b must also be
@ -182,7 +196,7 @@ public:
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
/// \return status code
BdaSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
Status solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
/// Get result after linear solve, and perform postprocessing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array