openclSolverBackend: template Scalar type

This commit is contained in:
Arne Morten Kvarving 2024-04-16 10:29:33 +02:00
parent d0773ef4f7
commit 18f42b51b2
4 changed files with 182 additions and 165 deletions

View File

@ -80,7 +80,14 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
} else if (accelerator_mode.compare("opencl") == 0) {
#if HAVE_OPENCL
use_gpu = true;
backend.reset(new Opm::Accelerator::openclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_parallel, linsolver));
using OCL = Accelerator::openclSolverBackend<double,block_size>;
backend = std::make_unique<OCL>(linear_solver_verbosity,
maxit,
tolerance,
platformID,
deviceID,
opencl_ilu_parallel,
linsolver);
#else
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
#endif
@ -307,11 +314,13 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::
initWellContributions([[maybe_unused]] WellContributions<double>& wellContribs,
[[maybe_unused]] unsigned N)
{
if(accelerator_mode.compare("opencl") == 0){
if (accelerator_mode.compare("opencl") == 0) {
#if HAVE_OPENCL
const auto openclBackend = static_cast<const Opm::Accelerator::openclSolverBackend<block_size>*>(backend.get());
static_cast<WellContributionsOCL<double>&>(wellContribs).setOpenCLEnv(openclBackend->context.get(),
openclBackend->queue.get());
using OCL = Accelerator::openclSolverBackend<double,block_size>;
const auto openclBackend = static_cast<const OCL*>(backend.get());
using WCOCL = WellContributionsOCL<double>;
static_cast<WCOCL&>(wellContribs).setOpenCLEnv(openclBackend->context.get(),
openclBackend->queue.get());
#else
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
#endif

View File

@ -88,7 +88,7 @@ private:
unsigned num_pre_smooth_steps; // number of Jacobi smooth steps before restriction
unsigned num_post_smooth_steps; // number of Jacobi smooth steps after prolongation
std::unique_ptr<openclSolverBackend<1> > coarse_solver; // coarse solver is scalar
std::unique_ptr<openclSolverBackend<Scalar,1>> coarse_solver; // coarse solver is scalar
bool opencl_ilu_parallel; // whether ILU0 operation should be parallelized
// Analyze the AMG hierarchy build by Dune

View File

@ -37,19 +37,15 @@
// otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
#define COPY_ROW_BY_ROW 0
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
using Opm::OpmLog;
using Dune::Timer;
template <unsigned int block_size>
openclSolverBackend<block_size>::
template<class Scalar, unsigned int block_size>
openclSolverBackend<Scalar,block_size>::
openclSolverBackend(int verbosity_,
int maxit_,
double tolerance_,
Scalar tolerance_,
unsigned int platformID_,
unsigned int deviceID_,
bool opencl_ilu_parallel_,
@ -59,22 +55,23 @@ openclSolverBackend(int verbosity_,
{
bool use_cpr, use_isai;
if (linsolver.compare("ilu0") == 0) {
if (linsolver == "ilu0") {
use_cpr = false;
use_isai = false;
} else if (linsolver.compare("cpr_quasiimpes") == 0) {
} else if (linsolver == "cpr_quasiimpes") {
use_cpr = true;
use_isai = false;
} else if (linsolver.compare("isai") == 0) {
} else if (linsolver == "isai") {
use_cpr = false;
use_isai = true;
} else if (linsolver.compare("cpr_trueimpes") == 0) {
OPM_THROW(std::logic_error, "Error openclSolver does not support --linerar-solver=cpr_trueimpes");
} else if (linsolver == "cpr_trueimpes") {
OPM_THROW(std::logic_error, "Error openclSolver does not support "
"--linear-solver=cpr_trueimpes");
} else {
OPM_THROW(std::logic_error, "Error unknown value for argument --linear-solver, " + linsolver);
}
using PreconditionerType = Preconditioner<double,block_size>;
using PreconditionerType = Preconditioner<Scalar,block_size>;
if (use_cpr) {
prec = PreconditionerType::create(PreconditionerType::Type::CPR,
opencl_ilu_parallel, verbosity);
@ -115,7 +112,7 @@ openclSolverBackend(int verbosity_,
out.clear();
if (platforms.size() <= platformID) {
OPM_THROW(std::logic_error, "Error chosen too high OpenCL platform ID");
OPM_THROW(std::logic_error, "Error: Invalid OpenCL platform ID selected");
} else {
std::string platform_info;
out << "Chosen:\n";
@ -131,7 +128,8 @@ openclSolverBackend(int verbosity_,
platforms[platformID].getDevices(CL_DEVICE_TYPE_ALL, &devices);
if (devices.empty()) {
OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL devices are found");
OPM_THROW(std::logic_error, "Error openclSolver is selected but "
"no OpenCL devices are found");
}
out << "Found " << devices.size() << " OpenCL devices" << "\n";
@ -215,8 +213,7 @@ openclSolverBackend(int verbosity_,
context = std::make_shared<cl::Context>(devices[0]);
queue.reset(new cl::CommandQueue(*context, devices[0], 0, &err));
OpenclKernels<double>::init(context.get(), queue.get(), devices, verbosity);
OpenclKernels<Scalar>::init(context.get(), queue.get(), devices, verbosity);
} catch (const cl::Error& error) {
std::ostringstream oss;
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
@ -229,10 +226,10 @@ openclSolverBackend(int verbosity_,
}
}
template <unsigned int block_size>
openclSolverBackend<block_size>::
template<class Scalar, unsigned int block_size>
openclSolverBackend<Scalar,block_size>::
openclSolverBackend(int verbosity_, int maxit_,
double tolerance_, bool opencl_ilu_parallel_)
Scalar tolerance_, bool opencl_ilu_parallel_)
: Base(verbosity_, maxit_, tolerance_)
, opencl_ilu_parallel(opencl_ilu_parallel_)
{
@ -240,20 +237,22 @@ openclSolverBackend(int verbosity_, int maxit_,
// cpr = std::make_unique<CPR<block_size> >(verbosity_, opencl_ilu_parallel, /*use_amg=*/false);
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
template<class Scalar, unsigned int block_size>
void openclSolverBackend<Scalar,block_size>::
setOpencl(std::shared_ptr<cl::Context>& context_,
std::shared_ptr<cl::CommandQueue>& queue_)
{
context = context_;
queue = queue_;
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::
gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
template<class Scalar, unsigned int block_size>
void openclSolverBackend<Scalar,block_size>::
gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res)
{
float it;
double rho, rhop, beta, alpha, omega, tmp1, tmp2;
double norm, norm_0;
Scalar rho, rhop, beta, alpha, omega, tmp1, tmp2;
Scalar norm, norm_0;
Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
@ -263,15 +262,15 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
// set initial values
events.resize(5);
queue->enqueueFillBuffer(d_p, 0, 0, sizeof(double) * N, nullptr, &events[0]);
queue->enqueueFillBuffer(d_v, 0, 0, sizeof(double) * N, nullptr, &events[1]);
queue->enqueueFillBuffer(d_p, 0, 0, sizeof(Scalar) * N, nullptr, &events[0]);
queue->enqueueFillBuffer(d_v, 0, 0, sizeof(Scalar) * N, nullptr, &events[1]);
rho = 1.0;
alpha = 1.0;
omega = 1.0;
queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(double) * N, nullptr, &events[2]);
queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(double) * N, nullptr, &events[3]);
queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &events[4]);
queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(Scalar) * N, nullptr, &events[2]);
queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(Scalar) * N, nullptr, &events[3]);
queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(Scalar) * N, nullptr, &events[4]);
cl::WaitForEvents(events);
events.clear();
@ -280,7 +279,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueue[Fill|Copy]Buffer error");
}
norm = OpenclKernels<double>::norm(d_r, d_tmp, N);
norm = OpenclKernels<Scalar>::norm(d_r, d_tmp, N);
norm_0 = norm;
if (verbosity > 1) {
@ -294,11 +293,11 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
}
for (it = 0.5; it < maxit; it += 0.5) {
rhop = rho;
rho = OpenclKernels<double>::dot(d_rw, d_r, d_tmp, N);
rho = OpenclKernels<Scalar>::dot(d_rw, d_r, d_tmp, N);
if (it > 1) {
beta = (rho / rhop) * (alpha / omega);
OpenclKernels<double>::custom(d_p, d_v, d_r, omega, beta, N);
OpenclKernels<Scalar>::custom(d_p, d_v, d_r, omega, beta, N);
}
if (verbosity >= 3) {
queue->finish();
@ -315,7 +314,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
}
// v = A * pw
OpenclKernels<double>::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
OpenclKernels<Scalar>::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
if (verbosity >= 3) {
queue->finish();
t_spmv.stop();
@ -324,7 +323,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
// apply wellContributions
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsOCL<double>&>(wellContribs).apply(d_pw, d_v);
static_cast<WellContributionsOCL<Scalar>&>(wellContribs).apply(d_pw, d_v);
}
if (verbosity >= 3) {
queue->finish();
@ -332,11 +331,11 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
t_rest.start();
}
tmp1 = OpenclKernels<double>::dot(d_rw, d_v, d_tmp, N);
tmp1 = OpenclKernels<Scalar>::dot(d_rw, d_v, d_tmp, N);
alpha = rho / tmp1;
OpenclKernels<double>::axpy(d_v, -alpha, d_r, N); // r = r - alpha * v
OpenclKernels<double>::axpy(d_pw, alpha, d_x, N); // x = x + alpha * pw
norm = OpenclKernels<double>::norm(d_r, d_tmp, N);
OpenclKernels<Scalar>::axpy(d_v, -alpha, d_r, N); // r = r - alpha * v
OpenclKernels<Scalar>::axpy(d_pw, alpha, d_x, N); // x = x + alpha * pw
norm = OpenclKernels<Scalar>::norm(d_r, d_tmp, N);
if (verbosity >= 3) {
queue->finish();
t_rest.stop();
@ -360,8 +359,8 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
}
// t = A * s
OpenclKernels<double>::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
if(verbosity >= 3){
OpenclKernels<Scalar>::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
if (verbosity >= 3) {
queue->finish();
t_spmv.stop();
t_well.start();
@ -369,7 +368,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
// apply wellContributions
if(wellContribs.getNumWells() > 0){
static_cast<WellContributionsOCL<double>&>(wellContribs).apply(d_s, d_t);
static_cast<WellContributionsOCL<Scalar>&>(wellContribs).apply(d_s, d_t);
}
if (verbosity >= 3) {
queue->finish();
@ -377,12 +376,12 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
t_rest.start();
}
tmp1 = OpenclKernels<double>::dot(d_t, d_r, d_tmp, N);
tmp2 = OpenclKernels<double>::dot(d_t, d_t, d_tmp, N);
tmp1 = OpenclKernels<Scalar>::dot(d_t, d_r, d_tmp, N);
tmp2 = OpenclKernels<Scalar>::dot(d_t, d_t, d_tmp, N);
omega = tmp1 / tmp2;
OpenclKernels<double>::axpy(d_s, omega, d_x, N); // x = x + omega * s
OpenclKernels<double>::axpy(d_t, -omega, d_r, N); // r = r - omega * t
norm = OpenclKernels<double>::norm(d_r, d_tmp, N);
OpenclKernels<Scalar>::axpy(d_s, omega, d_x, N); // x = x + omega * s
OpenclKernels<Scalar>::axpy(d_t, -omega, d_r, N); // r = r - omega * t
norm = OpenclKernels<Scalar>::norm(d_r, d_tmp, N);
if (verbosity >= 3) {
queue->finish();
t_rest.stop();
@ -399,7 +398,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
}
}
res.iterations = std::min(it, (float)maxit);
res.iterations = std::min(it, static_cast<float>(maxit));
res.reduction = norm / norm_0;
res.conv_rate = static_cast<double>(pow(res.reduction, 1.0 / it));
res.elapsed = t_total.stop();
@ -407,7 +406,8 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
if (verbosity > 0) {
std::ostringstream out;
out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
out << "=== converged: " << res.converged << ", conv_rate: "
<< res.conv_rate << ", time: " << res.elapsed <<
", time per iteration: " << res.elapsed / it << ", iterations: " << it;
OpmLog::info(out.str());
}
@ -422,10 +422,10 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
}
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::
initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
std::shared_ptr<BlockedMatrix<double>> jacMatrix)
template<class Scalar, unsigned int block_size>
void openclSolverBackend<Scalar,block_size>::
initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
{
this->Nb = matrix->Nb;
this->N = Nb * block_size;
@ -456,22 +456,21 @@ initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
mat = matrix;
jacMat = jacMatrix;
d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_rb = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_r = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_rw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_p = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_pw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_s = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_t = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_v = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_tmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_rb = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_r = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_rw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_p = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_pw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_s = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_t = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_v = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_tmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_Avals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * nnz);
d_Avals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * nnz);
d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
} catch (const cl::Error& error) {
std::ostringstream oss;
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
@ -486,8 +485,10 @@ initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
initialized = true;
} // end initialize()
template <unsigned int block_size>
void openclSolverBackend<block_size>::copy_system_to_gpu() {
template<class Scalar, unsigned int block_size>
void openclSolverBackend<Scalar,block_size>::
copy_system_to_gpu()
{
Timer t;
events.resize(5);
@ -495,18 +496,25 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
int sum = 0;
for (int i = 0; i < Nb; ++i) {
int size_row = mat->rowPointers[i + 1] - mat->rowPointers[i];
memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum,
size_row * sizeof(Scalar) * block_size * block_size);
sum += size_row * block_size * block_size;
}
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous.data(), nullptr, &events[0]);
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0,
sizeof(Scalar) * nnz, vals_contiguous.data(),
nullptr, &events[0]);
#else
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, mat->nnzValues, nullptr, &events[0]);
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0,
sizeof(Scalar) * nnz, mat->nnzValues, nullptr, &events[0]);
#endif
err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, mat->colIndices, nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), mat->rowPointers, nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, h_b, nullptr, &events[3]);
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[4]);
err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0,
sizeof(int) * nnzb, mat->colIndices, nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0,
sizeof(int) * (Nb + 1), mat->rowPointers, nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0,
sizeof(Scalar) * N, h_b, nullptr, &events[3]);
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(Scalar) * N, nullptr, &events[4]);
cl::WaitForEvents(events);
events.clear();
@ -523,8 +531,10 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
} // end copy_system_to_gpu()
// don't copy rowpointers and colindices, they stay the same
template <unsigned int block_size>
void openclSolverBackend<block_size>::update_system_on_gpu() {
template<class Scalar, unsigned int block_size>
void openclSolverBackend<Scalar,block_size>::
update_system_on_gpu()
{
Timer t;
events.resize(3);
@ -532,16 +542,21 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
int sum = 0;
for (int i = 0; i < Nb; ++i) {
int size_row = mat->rowPointers[i + 1] - mat->rowPointers[i];
memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum,
size_row * sizeof(Scalar) * block_size * block_size);
sum += size_row * block_size * block_size;
}
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous.data(), nullptr, &events[0]);
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0,
sizeof(Scalar) * nnz, vals_contiguous.data(),
nullptr, &events[0]);
#else
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, mat->nnzValues, nullptr, &events[0]);
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0,
sizeof(Scalar) * nnz, mat->nnzValues, nullptr, &events[0]);
#endif
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, h_b, nullptr, &events[1]);
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0,
sizeof(Scalar) * N, h_b, nullptr, &events[1]);
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(Scalar) * N, nullptr, &events[2]);
cl::WaitForEvents(events);
events.clear();
@ -557,9 +572,10 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
}
} // end update_system_on_gpu()
template <unsigned int block_size>
bool openclSolverBackend<block_size>::analyze_matrix() {
template<class Scalar, unsigned int block_size>
bool openclSolverBackend<Scalar,block_size>::
analyze_matrix()
{
Timer t;
bool success;
@ -579,9 +595,10 @@ bool openclSolverBackend<block_size>::analyze_matrix() {
return success;
} // end analyze_matrix()
template <unsigned int block_size>
void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
template<class Scalar, unsigned int block_size>
void openclSolverBackend<Scalar,block_size>::
update_system(Scalar* vals, Scalar* b)
{
Timer t;
mat->nnzValues = vals;
@ -594,9 +611,10 @@ void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
}
} // end update_system()
template <unsigned int block_size>
bool openclSolverBackend<block_size>::create_preconditioner() {
template<class Scalar, unsigned int block_size>
bool openclSolverBackend<Scalar,block_size>::
create_preconditioner()
{
Timer t;
bool result;
@ -613,10 +631,9 @@ bool openclSolverBackend<block_size>::create_preconditioner() {
return result;
} // end create_preconditioner()
template <unsigned int block_size>
void openclSolverBackend<block_size>::
solve_system(WellContributions<double>& wellContribs, BdaResult& res)
template<class Scalar, unsigned int block_size>
void openclSolverBackend<Scalar,block_size>::
solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res)
{
Timer t;
@ -625,7 +642,8 @@ solve_system(WellContributions<double>& wellContribs, BdaResult& res)
gpu_pbicgstab(wellContribs, res);
} catch (const cl::Error& error) {
std::ostringstream oss;
oss << "openclSolverBackend::solve_system error: " << error.what() << "(" << error.err() << ")\n";
oss << "openclSolverBackend::solve_system error: " << error.what()
<< "(" << error.err() << ")\n";
oss << getErrorString(error.err());
// rethrow exception
OPM_THROW(std::logic_error, oss.str());
@ -641,14 +659,15 @@ solve_system(WellContributions<double>& wellContribs, BdaResult& res)
}
} // end solve_system()
// copy result to host memory
// caller must be sure that x is a valid array
template <unsigned int block_size>
void openclSolverBackend<block_size>::get_result(double *x) {
template<class Scalar, unsigned int block_size>
void openclSolverBackend<Scalar,block_size>::
get_result(Scalar* x)
{
Timer t;
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, x);
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(Scalar) * N, x);
if (verbosity > 2) {
std::ostringstream out;
@ -657,12 +676,12 @@ void openclSolverBackend<block_size>::get_result(double *x) {
}
} // end get_result()
template <unsigned int block_size>
SolverStatus openclSolverBackend<block_size>::
solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
double *b,
std::shared_ptr<BlockedMatrix<double>> jacMatrix,
WellContributions<double>& wellContribs,
template<class Scalar, unsigned int block_size>
SolverStatus openclSolverBackend<Scalar,block_size>::
solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res)
{
if (initialized == false) {
@ -688,21 +707,14 @@ solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
return SolverStatus::BDA_SOLVER_SUCCESS;
}
#define INSTANTIATE_TYPE(T) \
template class openclSolverBackend<T,1>; \
template class openclSolverBackend<T,2>; \
template class openclSolverBackend<T,3>; \
template class openclSolverBackend<T,4>; \
template class openclSolverBackend<T,5>; \
template class openclSolverBackend<T,6>;
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template openclSolverBackend<n>::openclSolverBackend( \
int, int, double, unsigned int, unsigned int, bool, std::string); \
template openclSolverBackend<n>::openclSolverBackend(int, int, double, bool); \
template void openclSolverBackend<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&);
INSTANTIATE_TYPE(double)
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator

View File

@ -27,16 +27,13 @@
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
/// This class implements a opencl-based ilu0-bicgstab solver on GPU
template <unsigned int block_size>
class openclSolverBackend : public BdaSolver<double,block_size>
template<class Scalar, unsigned int block_size>
class openclSolverBackend : public BdaSolver<Scalar,block_size>
{
using Base = BdaSolver<double,block_size>;
using Base = BdaSolver<Scalar,block_size>;
using Base::N;
using Base::Nb;
@ -50,8 +47,8 @@ class openclSolverBackend : public BdaSolver<double,block_size>
using Base::initialized;
private:
double *h_b = nullptr; // b vector, on host
std::vector<double> vals_contiguous; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
Scalar* h_b = nullptr; // b vector, on host
std::vector<Scalar> vals_contiguous; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
// OpenCL variables must be reusable, they are initialized in initialize()
cl::Buffer d_Avals, d_Acols, d_Arows; // matrix in BSR format on GPU
@ -63,12 +60,12 @@ private:
bool useJacMatrix = false;
std::unique_ptr<Preconditioner<double,block_size> > prec;
std::unique_ptr<Preconditioner<Scalar,block_size>> prec;
// can perform blocked ILU0 and AMG on pressure component
bool is_root; // allow for nested solvers, the root solver is called by BdaBridge
bool analysis_done = false;
std::shared_ptr<BlockedMatrix<double>> mat{}; // original matrix
std::shared_ptr<BlockedMatrix<double>> jacMat{}; // matrix for preconditioner
std::shared_ptr<BlockedMatrix<Scalar>> mat{}; // original matrix
std::shared_ptr<BlockedMatrix<Scalar>> jacMat{}; // matrix for preconditioner
bool opencl_ilu_parallel; // parallelize ILU operations (with level_scheduling)
std::vector<cl::Event> events;
cl_int err;
@ -76,13 +73,13 @@ private:
/// Solve linear system using ilu0-bicgstab
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
void gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res);
void gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res);
/// Initialize GPU and allocate memory
/// \param[in] matrix matrix A
/// \param[in] jacMatrix matrix for preconditioner
void initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
std::shared_ptr<BlockedMatrix<double>> jacMatrix);
void initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);
/// Copy linear system to GPU
void copy_system_to_gpu();
@ -90,7 +87,7 @@ private:
/// Reassign pointers, in case the addresses of the Dune variables have changed
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] b input vector b, contains N values
void update_system(double *vals, double *b);
void update_system(Scalar* vals, Scalar* b);
/// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
void update_system_on_gpu();
@ -107,11 +104,11 @@ private:
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// could be empty
/// \param[inout] res summary of solver result
void solve_system(WellContributions<double>& wellContribs, BdaResult& res);
void solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res);
public:
std::shared_ptr<cl::Context> context;
std::shared_ptr<cl::CommandQueue> queue;
std::shared_ptr<cl::Context> context{};
std::shared_ptr<cl::CommandQueue> queue{};
/// Construct a openclSolver
/// \param[in] linear_solver_verbosity verbosity of openclSolver
@ -122,11 +119,13 @@ public:
/// \param[in] opencl_ilu_parallel whether to parallelize the ILU decomposition and application in OpenCL with level_scheduling
/// \param[in] linsolver indicating the preconditioner, equal to the --linear-solver cmdline argument
/// only ilu0, cpr_quasiimpes and isai are supported
openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID,
bool opencl_ilu_parallel, std::string linsolver);
openclSolverBackend(int linear_solver_verbosity, int maxit, Scalar tolerance,
unsigned int platformID, unsigned int deviceID,
bool opencl_ilu_parallel, std::string linsolver);
/// For the CPR coarse solver
openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, bool opencl_ilu_parallel);
openclSolverBackend(int linear_solver_verbosity, int maxit,
Scalar tolerance, bool opencl_ilu_parallel);
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
/// \param[in] matrix matrix A
@ -135,10 +134,10 @@ public:
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
/// \return status code
SolverStatus solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
double *b,
std::shared_ptr<BlockedMatrix<double>> jacMatrix,
WellContributions<double>& wellContribs,
SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res) override;
/// Solve scalar linear system, for example a coarse system of an AMG preconditioner
@ -147,19 +146,16 @@ public:
/// Get result after linear solve, and peform postprocessing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void get_result(double *x) override;
void get_result(Scalar* x) override;
/// Set OpenCL objects
/// This class either creates them based on platformID and deviceID or receives them through this function
/// \param[in] context the opencl context to be used
/// \param[in] queue the opencl queue to be used
void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue);
void setOpencl(std::shared_ptr<cl::Context>& context,
std::shared_ptr<cl::CommandQueue>& queue);
}; // end class openclSolverBackend
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
#endif