mirror of
https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
openclSolverBackend: template Scalar type
This commit is contained in:
parent
d0773ef4f7
commit
18f42b51b2
@ -80,7 +80,14 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
|
||||
} else if (accelerator_mode.compare("opencl") == 0) {
|
||||
#if HAVE_OPENCL
|
||||
use_gpu = true;
|
||||
backend.reset(new Opm::Accelerator::openclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_parallel, linsolver));
|
||||
using OCL = Accelerator::openclSolverBackend<double,block_size>;
|
||||
backend = std::make_unique<OCL>(linear_solver_verbosity,
|
||||
maxit,
|
||||
tolerance,
|
||||
platformID,
|
||||
deviceID,
|
||||
opencl_ilu_parallel,
|
||||
linsolver);
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
|
||||
#endif
|
||||
@ -307,11 +314,13 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::
|
||||
initWellContributions([[maybe_unused]] WellContributions<double>& wellContribs,
|
||||
[[maybe_unused]] unsigned N)
|
||||
{
|
||||
if(accelerator_mode.compare("opencl") == 0){
|
||||
if (accelerator_mode.compare("opencl") == 0) {
|
||||
#if HAVE_OPENCL
|
||||
const auto openclBackend = static_cast<const Opm::Accelerator::openclSolverBackend<block_size>*>(backend.get());
|
||||
static_cast<WellContributionsOCL<double>&>(wellContribs).setOpenCLEnv(openclBackend->context.get(),
|
||||
openclBackend->queue.get());
|
||||
using OCL = Accelerator::openclSolverBackend<double,block_size>;
|
||||
const auto openclBackend = static_cast<const OCL*>(backend.get());
|
||||
using WCOCL = WellContributionsOCL<double>;
|
||||
static_cast<WCOCL&>(wellContribs).setOpenCLEnv(openclBackend->context.get(),
|
||||
openclBackend->queue.get());
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
|
||||
#endif
|
||||
|
@ -88,7 +88,7 @@ private:
|
||||
unsigned num_pre_smooth_steps; // number of Jacobi smooth steps before restriction
|
||||
unsigned num_post_smooth_steps; // number of Jacobi smooth steps after prolongation
|
||||
|
||||
std::unique_ptr<openclSolverBackend<1> > coarse_solver; // coarse solver is scalar
|
||||
std::unique_ptr<openclSolverBackend<Scalar,1>> coarse_solver; // coarse solver is scalar
|
||||
bool opencl_ilu_parallel; // whether ILU0 operation should be parallelized
|
||||
|
||||
// Analyze the AMG hierarchy build by Dune
|
||||
|
@ -37,19 +37,15 @@
|
||||
// otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
|
||||
#define COPY_ROW_BY_ROW 0
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
using Opm::OpmLog;
|
||||
using Dune::Timer;
|
||||
|
||||
template <unsigned int block_size>
|
||||
openclSolverBackend<block_size>::
|
||||
template<class Scalar, unsigned int block_size>
|
||||
openclSolverBackend<Scalar,block_size>::
|
||||
openclSolverBackend(int verbosity_,
|
||||
int maxit_,
|
||||
double tolerance_,
|
||||
Scalar tolerance_,
|
||||
unsigned int platformID_,
|
||||
unsigned int deviceID_,
|
||||
bool opencl_ilu_parallel_,
|
||||
@ -59,22 +55,23 @@ openclSolverBackend(int verbosity_,
|
||||
{
|
||||
bool use_cpr, use_isai;
|
||||
|
||||
if (linsolver.compare("ilu0") == 0) {
|
||||
if (linsolver == "ilu0") {
|
||||
use_cpr = false;
|
||||
use_isai = false;
|
||||
} else if (linsolver.compare("cpr_quasiimpes") == 0) {
|
||||
} else if (linsolver == "cpr_quasiimpes") {
|
||||
use_cpr = true;
|
||||
use_isai = false;
|
||||
} else if (linsolver.compare("isai") == 0) {
|
||||
} else if (linsolver == "isai") {
|
||||
use_cpr = false;
|
||||
use_isai = true;
|
||||
} else if (linsolver.compare("cpr_trueimpes") == 0) {
|
||||
OPM_THROW(std::logic_error, "Error openclSolver does not support --linerar-solver=cpr_trueimpes");
|
||||
} else if (linsolver == "cpr_trueimpes") {
|
||||
OPM_THROW(std::logic_error, "Error openclSolver does not support "
|
||||
"--linear-solver=cpr_trueimpes");
|
||||
} else {
|
||||
OPM_THROW(std::logic_error, "Error unknown value for argument --linear-solver, " + linsolver);
|
||||
}
|
||||
|
||||
using PreconditionerType = Preconditioner<double,block_size>;
|
||||
using PreconditionerType = Preconditioner<Scalar,block_size>;
|
||||
if (use_cpr) {
|
||||
prec = PreconditionerType::create(PreconditionerType::Type::CPR,
|
||||
opencl_ilu_parallel, verbosity);
|
||||
@ -115,7 +112,7 @@ openclSolverBackend(int verbosity_,
|
||||
out.clear();
|
||||
|
||||
if (platforms.size() <= platformID) {
|
||||
OPM_THROW(std::logic_error, "Error chosen too high OpenCL platform ID");
|
||||
OPM_THROW(std::logic_error, "Error: Invalid OpenCL platform ID selected");
|
||||
} else {
|
||||
std::string platform_info;
|
||||
out << "Chosen:\n";
|
||||
@ -131,7 +128,8 @@ openclSolverBackend(int verbosity_,
|
||||
platforms[platformID].getDevices(CL_DEVICE_TYPE_ALL, &devices);
|
||||
|
||||
if (devices.empty()) {
|
||||
OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL devices are found");
|
||||
OPM_THROW(std::logic_error, "Error openclSolver is selected but "
|
||||
"no OpenCL devices are found");
|
||||
}
|
||||
out << "Found " << devices.size() << " OpenCL devices" << "\n";
|
||||
|
||||
@ -215,8 +213,7 @@ openclSolverBackend(int verbosity_,
|
||||
context = std::make_shared<cl::Context>(devices[0]);
|
||||
queue.reset(new cl::CommandQueue(*context, devices[0], 0, &err));
|
||||
|
||||
OpenclKernels<double>::init(context.get(), queue.get(), devices, verbosity);
|
||||
|
||||
OpenclKernels<Scalar>::init(context.get(), queue.get(), devices, verbosity);
|
||||
} catch (const cl::Error& error) {
|
||||
std::ostringstream oss;
|
||||
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
|
||||
@ -229,10 +226,10 @@ openclSolverBackend(int verbosity_,
|
||||
}
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
openclSolverBackend<block_size>::
|
||||
template<class Scalar, unsigned int block_size>
|
||||
openclSolverBackend<Scalar,block_size>::
|
||||
openclSolverBackend(int verbosity_, int maxit_,
|
||||
double tolerance_, bool opencl_ilu_parallel_)
|
||||
Scalar tolerance_, bool opencl_ilu_parallel_)
|
||||
: Base(verbosity_, maxit_, tolerance_)
|
||||
, opencl_ilu_parallel(opencl_ilu_parallel_)
|
||||
{
|
||||
@ -240,20 +237,22 @@ openclSolverBackend(int verbosity_, int maxit_,
|
||||
// cpr = std::make_unique<CPR<block_size> >(verbosity_, opencl_ilu_parallel, /*use_amg=*/false);
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
void openclSolverBackend<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void openclSolverBackend<Scalar,block_size>::
|
||||
setOpencl(std::shared_ptr<cl::Context>& context_,
|
||||
std::shared_ptr<cl::CommandQueue>& queue_)
|
||||
{
|
||||
context = context_;
|
||||
queue = queue_;
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void openclSolverBackend<block_size>::
|
||||
gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void openclSolverBackend<Scalar,block_size>::
|
||||
gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res)
|
||||
{
|
||||
float it;
|
||||
double rho, rhop, beta, alpha, omega, tmp1, tmp2;
|
||||
double norm, norm_0;
|
||||
Scalar rho, rhop, beta, alpha, omega, tmp1, tmp2;
|
||||
Scalar norm, norm_0;
|
||||
|
||||
Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
|
||||
|
||||
@ -263,15 +262,15 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
|
||||
|
||||
// set initial values
|
||||
events.resize(5);
|
||||
queue->enqueueFillBuffer(d_p, 0, 0, sizeof(double) * N, nullptr, &events[0]);
|
||||
queue->enqueueFillBuffer(d_v, 0, 0, sizeof(double) * N, nullptr, &events[1]);
|
||||
queue->enqueueFillBuffer(d_p, 0, 0, sizeof(Scalar) * N, nullptr, &events[0]);
|
||||
queue->enqueueFillBuffer(d_v, 0, 0, sizeof(Scalar) * N, nullptr, &events[1]);
|
||||
rho = 1.0;
|
||||
alpha = 1.0;
|
||||
omega = 1.0;
|
||||
|
||||
queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(double) * N, nullptr, &events[2]);
|
||||
queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(double) * N, nullptr, &events[3]);
|
||||
queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &events[4]);
|
||||
queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(Scalar) * N, nullptr, &events[2]);
|
||||
queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(Scalar) * N, nullptr, &events[3]);
|
||||
queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(Scalar) * N, nullptr, &events[4]);
|
||||
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
@ -280,7 +279,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
|
||||
OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueue[Fill|Copy]Buffer error");
|
||||
}
|
||||
|
||||
norm = OpenclKernels<double>::norm(d_r, d_tmp, N);
|
||||
norm = OpenclKernels<Scalar>::norm(d_r, d_tmp, N);
|
||||
norm_0 = norm;
|
||||
|
||||
if (verbosity > 1) {
|
||||
@ -294,11 +293,11 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
|
||||
}
|
||||
for (it = 0.5; it < maxit; it += 0.5) {
|
||||
rhop = rho;
|
||||
rho = OpenclKernels<double>::dot(d_rw, d_r, d_tmp, N);
|
||||
rho = OpenclKernels<Scalar>::dot(d_rw, d_r, d_tmp, N);
|
||||
|
||||
if (it > 1) {
|
||||
beta = (rho / rhop) * (alpha / omega);
|
||||
OpenclKernels<double>::custom(d_p, d_v, d_r, omega, beta, N);
|
||||
OpenclKernels<Scalar>::custom(d_p, d_v, d_r, omega, beta, N);
|
||||
}
|
||||
if (verbosity >= 3) {
|
||||
queue->finish();
|
||||
@ -315,7 +314,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
|
||||
}
|
||||
|
||||
// v = A * pw
|
||||
OpenclKernels<double>::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
|
||||
OpenclKernels<Scalar>::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
|
||||
if (verbosity >= 3) {
|
||||
queue->finish();
|
||||
t_spmv.stop();
|
||||
@ -324,7 +323,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
|
||||
|
||||
// apply wellContributions
|
||||
if (wellContribs.getNumWells() > 0) {
|
||||
static_cast<WellContributionsOCL<double>&>(wellContribs).apply(d_pw, d_v);
|
||||
static_cast<WellContributionsOCL<Scalar>&>(wellContribs).apply(d_pw, d_v);
|
||||
}
|
||||
if (verbosity >= 3) {
|
||||
queue->finish();
|
||||
@ -332,11 +331,11 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
|
||||
t_rest.start();
|
||||
}
|
||||
|
||||
tmp1 = OpenclKernels<double>::dot(d_rw, d_v, d_tmp, N);
|
||||
tmp1 = OpenclKernels<Scalar>::dot(d_rw, d_v, d_tmp, N);
|
||||
alpha = rho / tmp1;
|
||||
OpenclKernels<double>::axpy(d_v, -alpha, d_r, N); // r = r - alpha * v
|
||||
OpenclKernels<double>::axpy(d_pw, alpha, d_x, N); // x = x + alpha * pw
|
||||
norm = OpenclKernels<double>::norm(d_r, d_tmp, N);
|
||||
OpenclKernels<Scalar>::axpy(d_v, -alpha, d_r, N); // r = r - alpha * v
|
||||
OpenclKernels<Scalar>::axpy(d_pw, alpha, d_x, N); // x = x + alpha * pw
|
||||
norm = OpenclKernels<Scalar>::norm(d_r, d_tmp, N);
|
||||
if (verbosity >= 3) {
|
||||
queue->finish();
|
||||
t_rest.stop();
|
||||
@ -360,8 +359,8 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
|
||||
}
|
||||
|
||||
// t = A * s
|
||||
OpenclKernels<double>::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
|
||||
if(verbosity >= 3){
|
||||
OpenclKernels<Scalar>::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
|
||||
if (verbosity >= 3) {
|
||||
queue->finish();
|
||||
t_spmv.stop();
|
||||
t_well.start();
|
||||
@ -369,7 +368,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
|
||||
|
||||
// apply wellContributions
|
||||
if(wellContribs.getNumWells() > 0){
|
||||
static_cast<WellContributionsOCL<double>&>(wellContribs).apply(d_s, d_t);
|
||||
static_cast<WellContributionsOCL<Scalar>&>(wellContribs).apply(d_s, d_t);
|
||||
}
|
||||
if (verbosity >= 3) {
|
||||
queue->finish();
|
||||
@ -377,12 +376,12 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
|
||||
t_rest.start();
|
||||
}
|
||||
|
||||
tmp1 = OpenclKernels<double>::dot(d_t, d_r, d_tmp, N);
|
||||
tmp2 = OpenclKernels<double>::dot(d_t, d_t, d_tmp, N);
|
||||
tmp1 = OpenclKernels<Scalar>::dot(d_t, d_r, d_tmp, N);
|
||||
tmp2 = OpenclKernels<Scalar>::dot(d_t, d_t, d_tmp, N);
|
||||
omega = tmp1 / tmp2;
|
||||
OpenclKernels<double>::axpy(d_s, omega, d_x, N); // x = x + omega * s
|
||||
OpenclKernels<double>::axpy(d_t, -omega, d_r, N); // r = r - omega * t
|
||||
norm = OpenclKernels<double>::norm(d_r, d_tmp, N);
|
||||
OpenclKernels<Scalar>::axpy(d_s, omega, d_x, N); // x = x + omega * s
|
||||
OpenclKernels<Scalar>::axpy(d_t, -omega, d_r, N); // r = r - omega * t
|
||||
norm = OpenclKernels<Scalar>::norm(d_r, d_tmp, N);
|
||||
if (verbosity >= 3) {
|
||||
queue->finish();
|
||||
t_rest.stop();
|
||||
@ -399,7 +398,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
|
||||
}
|
||||
}
|
||||
|
||||
res.iterations = std::min(it, (float)maxit);
|
||||
res.iterations = std::min(it, static_cast<float>(maxit));
|
||||
res.reduction = norm / norm_0;
|
||||
res.conv_rate = static_cast<double>(pow(res.reduction, 1.0 / it));
|
||||
res.elapsed = t_total.stop();
|
||||
@ -407,7 +406,8 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
|
||||
|
||||
if (verbosity > 0) {
|
||||
std::ostringstream out;
|
||||
out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
|
||||
out << "=== converged: " << res.converged << ", conv_rate: "
|
||||
<< res.conv_rate << ", time: " << res.elapsed <<
|
||||
", time per iteration: " << res.elapsed / it << ", iterations: " << it;
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
@ -422,10 +422,10 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
|
||||
}
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
void openclSolverBackend<block_size>::
|
||||
initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
|
||||
std::shared_ptr<BlockedMatrix<double>> jacMatrix)
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void openclSolverBackend<Scalar,block_size>::
|
||||
initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
|
||||
{
|
||||
this->Nb = matrix->Nb;
|
||||
this->N = Nb * block_size;
|
||||
@ -456,22 +456,21 @@ initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
|
||||
mat = matrix;
|
||||
jacMat = jacMatrix;
|
||||
|
||||
d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_rb = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_r = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_rw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_p = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_pw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_s = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_t = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_v = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_tmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
d_rb = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
d_r = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
d_rw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
d_p = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
d_pw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
d_s = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
d_t = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
d_v = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
d_tmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
|
||||
d_Avals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * nnz);
|
||||
d_Avals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * nnz);
|
||||
d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
|
||||
d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
|
||||
|
||||
} catch (const cl::Error& error) {
|
||||
std::ostringstream oss;
|
||||
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
|
||||
@ -486,8 +485,10 @@ initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
|
||||
initialized = true;
|
||||
} // end initialize()
|
||||
|
||||
template <unsigned int block_size>
|
||||
void openclSolverBackend<block_size>::copy_system_to_gpu() {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void openclSolverBackend<Scalar,block_size>::
|
||||
copy_system_to_gpu()
|
||||
{
|
||||
Timer t;
|
||||
events.resize(5);
|
||||
|
||||
@ -495,18 +496,25 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
|
||||
int sum = 0;
|
||||
for (int i = 0; i < Nb; ++i) {
|
||||
int size_row = mat->rowPointers[i + 1] - mat->rowPointers[i];
|
||||
memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
|
||||
memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum,
|
||||
size_row * sizeof(Scalar) * block_size * block_size);
|
||||
sum += size_row * block_size * block_size;
|
||||
}
|
||||
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous.data(), nullptr, &events[0]);
|
||||
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0,
|
||||
sizeof(Scalar) * nnz, vals_contiguous.data(),
|
||||
nullptr, &events[0]);
|
||||
#else
|
||||
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, mat->nnzValues, nullptr, &events[0]);
|
||||
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0,
|
||||
sizeof(Scalar) * nnz, mat->nnzValues, nullptr, &events[0]);
|
||||
#endif
|
||||
|
||||
err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, mat->colIndices, nullptr, &events[1]);
|
||||
err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), mat->rowPointers, nullptr, &events[2]);
|
||||
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, h_b, nullptr, &events[3]);
|
||||
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[4]);
|
||||
err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0,
|
||||
sizeof(int) * nnzb, mat->colIndices, nullptr, &events[1]);
|
||||
err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0,
|
||||
sizeof(int) * (Nb + 1), mat->rowPointers, nullptr, &events[2]);
|
||||
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0,
|
||||
sizeof(Scalar) * N, h_b, nullptr, &events[3]);
|
||||
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(Scalar) * N, nullptr, &events[4]);
|
||||
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
@ -523,8 +531,10 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
|
||||
} // end copy_system_to_gpu()
|
||||
|
||||
// don't copy rowpointers and colindices, they stay the same
|
||||
template <unsigned int block_size>
|
||||
void openclSolverBackend<block_size>::update_system_on_gpu() {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void openclSolverBackend<Scalar,block_size>::
|
||||
update_system_on_gpu()
|
||||
{
|
||||
Timer t;
|
||||
events.resize(3);
|
||||
|
||||
@ -532,16 +542,21 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
|
||||
int sum = 0;
|
||||
for (int i = 0; i < Nb; ++i) {
|
||||
int size_row = mat->rowPointers[i + 1] - mat->rowPointers[i];
|
||||
memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
|
||||
memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum,
|
||||
size_row * sizeof(Scalar) * block_size * block_size);
|
||||
sum += size_row * block_size * block_size;
|
||||
}
|
||||
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous.data(), nullptr, &events[0]);
|
||||
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0,
|
||||
sizeof(Scalar) * nnz, vals_contiguous.data(),
|
||||
nullptr, &events[0]);
|
||||
#else
|
||||
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, mat->nnzValues, nullptr, &events[0]);
|
||||
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0,
|
||||
sizeof(Scalar) * nnz, mat->nnzValues, nullptr, &events[0]);
|
||||
#endif
|
||||
|
||||
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, h_b, nullptr, &events[1]);
|
||||
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[2]);
|
||||
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0,
|
||||
sizeof(Scalar) * N, h_b, nullptr, &events[1]);
|
||||
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(Scalar) * N, nullptr, &events[2]);
|
||||
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
@ -557,9 +572,10 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
|
||||
}
|
||||
} // end update_system_on_gpu()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool openclSolverBackend<block_size>::analyze_matrix() {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool openclSolverBackend<Scalar,block_size>::
|
||||
analyze_matrix()
|
||||
{
|
||||
Timer t;
|
||||
|
||||
bool success;
|
||||
@ -579,9 +595,10 @@ bool openclSolverBackend<block_size>::analyze_matrix() {
|
||||
return success;
|
||||
} // end analyze_matrix()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void openclSolverBackend<Scalar,block_size>::
|
||||
update_system(Scalar* vals, Scalar* b)
|
||||
{
|
||||
Timer t;
|
||||
|
||||
mat->nnzValues = vals;
|
||||
@ -594,9 +611,10 @@ void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
|
||||
}
|
||||
} // end update_system()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool openclSolverBackend<block_size>::create_preconditioner() {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool openclSolverBackend<Scalar,block_size>::
|
||||
create_preconditioner()
|
||||
{
|
||||
Timer t;
|
||||
|
||||
bool result;
|
||||
@ -613,10 +631,9 @@ bool openclSolverBackend<block_size>::create_preconditioner() {
|
||||
return result;
|
||||
} // end create_preconditioner()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void openclSolverBackend<block_size>::
|
||||
solve_system(WellContributions<double>& wellContribs, BdaResult& res)
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void openclSolverBackend<Scalar,block_size>::
|
||||
solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res)
|
||||
{
|
||||
Timer t;
|
||||
|
||||
@ -625,7 +642,8 @@ solve_system(WellContributions<double>& wellContribs, BdaResult& res)
|
||||
gpu_pbicgstab(wellContribs, res);
|
||||
} catch (const cl::Error& error) {
|
||||
std::ostringstream oss;
|
||||
oss << "openclSolverBackend::solve_system error: " << error.what() << "(" << error.err() << ")\n";
|
||||
oss << "openclSolverBackend::solve_system error: " << error.what()
|
||||
<< "(" << error.err() << ")\n";
|
||||
oss << getErrorString(error.err());
|
||||
// rethrow exception
|
||||
OPM_THROW(std::logic_error, oss.str());
|
||||
@ -641,14 +659,15 @@ solve_system(WellContributions<double>& wellContribs, BdaResult& res)
|
||||
}
|
||||
} // end solve_system()
|
||||
|
||||
|
||||
// copy result to host memory
|
||||
// caller must be sure that x is a valid array
|
||||
template <unsigned int block_size>
|
||||
void openclSolverBackend<block_size>::get_result(double *x) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void openclSolverBackend<Scalar,block_size>::
|
||||
get_result(Scalar* x)
|
||||
{
|
||||
Timer t;
|
||||
|
||||
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, x);
|
||||
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(Scalar) * N, x);
|
||||
|
||||
if (verbosity > 2) {
|
||||
std::ostringstream out;
|
||||
@ -657,12 +676,12 @@ void openclSolverBackend<block_size>::get_result(double *x) {
|
||||
}
|
||||
} // end get_result()
|
||||
|
||||
template <unsigned int block_size>
|
||||
SolverStatus openclSolverBackend<block_size>::
|
||||
solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
|
||||
double *b,
|
||||
std::shared_ptr<BlockedMatrix<double>> jacMatrix,
|
||||
WellContributions<double>& wellContribs,
|
||||
template<class Scalar, unsigned int block_size>
|
||||
SolverStatus openclSolverBackend<Scalar,block_size>::
|
||||
solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||
Scalar* b,
|
||||
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
|
||||
WellContributions<Scalar>& wellContribs,
|
||||
BdaResult& res)
|
||||
{
|
||||
if (initialized == false) {
|
||||
@ -688,21 +707,14 @@ solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
|
||||
return SolverStatus::BDA_SOLVER_SUCCESS;
|
||||
}
|
||||
|
||||
#define INSTANTIATE_TYPE(T) \
|
||||
template class openclSolverBackend<T,1>; \
|
||||
template class openclSolverBackend<T,2>; \
|
||||
template class openclSolverBackend<T,3>; \
|
||||
template class openclSolverBackend<T,4>; \
|
||||
template class openclSolverBackend<T,5>; \
|
||||
template class openclSolverBackend<T,6>;
|
||||
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
template openclSolverBackend<n>::openclSolverBackend( \
|
||||
int, int, double, unsigned int, unsigned int, bool, std::string); \
|
||||
template openclSolverBackend<n>::openclSolverBackend(int, int, double, bool); \
|
||||
template void openclSolverBackend<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&);
|
||||
INSTANTIATE_TYPE(double)
|
||||
|
||||
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||
INSTANTIATE_BDA_FUNCTIONS(2);
|
||||
INSTANTIATE_BDA_FUNCTIONS(3);
|
||||
INSTANTIATE_BDA_FUNCTIONS(4);
|
||||
INSTANTIATE_BDA_FUNCTIONS(5);
|
||||
INSTANTIATE_BDA_FUNCTIONS(6);
|
||||
|
||||
#undef INSTANTIATE_BDA_FUNCTIONS
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
} // namespace Opm::Accelerator
|
||||
|
@ -27,16 +27,13 @@
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
/// This class implements a opencl-based ilu0-bicgstab solver on GPU
|
||||
template <unsigned int block_size>
|
||||
class openclSolverBackend : public BdaSolver<double,block_size>
|
||||
template<class Scalar, unsigned int block_size>
|
||||
class openclSolverBackend : public BdaSolver<Scalar,block_size>
|
||||
{
|
||||
using Base = BdaSolver<double,block_size>;
|
||||
using Base = BdaSolver<Scalar,block_size>;
|
||||
|
||||
using Base::N;
|
||||
using Base::Nb;
|
||||
@ -50,8 +47,8 @@ class openclSolverBackend : public BdaSolver<double,block_size>
|
||||
using Base::initialized;
|
||||
|
||||
private:
|
||||
double *h_b = nullptr; // b vector, on host
|
||||
std::vector<double> vals_contiguous; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
|
||||
Scalar* h_b = nullptr; // b vector, on host
|
||||
std::vector<Scalar> vals_contiguous; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
|
||||
|
||||
// OpenCL variables must be reusable, they are initialized in initialize()
|
||||
cl::Buffer d_Avals, d_Acols, d_Arows; // matrix in BSR format on GPU
|
||||
@ -63,12 +60,12 @@ private:
|
||||
|
||||
bool useJacMatrix = false;
|
||||
|
||||
std::unique_ptr<Preconditioner<double,block_size> > prec;
|
||||
std::unique_ptr<Preconditioner<Scalar,block_size>> prec;
|
||||
// can perform blocked ILU0 and AMG on pressure component
|
||||
bool is_root; // allow for nested solvers, the root solver is called by BdaBridge
|
||||
bool analysis_done = false;
|
||||
std::shared_ptr<BlockedMatrix<double>> mat{}; // original matrix
|
||||
std::shared_ptr<BlockedMatrix<double>> jacMat{}; // matrix for preconditioner
|
||||
std::shared_ptr<BlockedMatrix<Scalar>> mat{}; // original matrix
|
||||
std::shared_ptr<BlockedMatrix<Scalar>> jacMat{}; // matrix for preconditioner
|
||||
bool opencl_ilu_parallel; // parallelize ILU operations (with level_scheduling)
|
||||
std::vector<cl::Event> events;
|
||||
cl_int err;
|
||||
@ -76,13 +73,13 @@ private:
|
||||
/// Solve linear system using ilu0-bicgstab
|
||||
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
|
||||
/// \param[inout] res summary of solver result
|
||||
void gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res);
|
||||
void gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res);
|
||||
|
||||
/// Initialize GPU and allocate memory
|
||||
/// \param[in] matrix matrix A
|
||||
/// \param[in] jacMatrix matrix for preconditioner
|
||||
void initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
|
||||
std::shared_ptr<BlockedMatrix<double>> jacMatrix);
|
||||
void initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);
|
||||
|
||||
/// Copy linear system to GPU
|
||||
void copy_system_to_gpu();
|
||||
@ -90,7 +87,7 @@ private:
|
||||
/// Reassign pointers, in case the addresses of the Dune variables have changed
|
||||
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||
/// \param[in] b input vector b, contains N values
|
||||
void update_system(double *vals, double *b);
|
||||
void update_system(Scalar* vals, Scalar* b);
|
||||
|
||||
/// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
|
||||
void update_system_on_gpu();
|
||||
@ -107,11 +104,11 @@ private:
|
||||
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
|
||||
/// could be empty
|
||||
/// \param[inout] res summary of solver result
|
||||
void solve_system(WellContributions<double>& wellContribs, BdaResult& res);
|
||||
void solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res);
|
||||
|
||||
public:
|
||||
std::shared_ptr<cl::Context> context;
|
||||
std::shared_ptr<cl::CommandQueue> queue;
|
||||
std::shared_ptr<cl::Context> context{};
|
||||
std::shared_ptr<cl::CommandQueue> queue{};
|
||||
|
||||
/// Construct a openclSolver
|
||||
/// \param[in] linear_solver_verbosity verbosity of openclSolver
|
||||
@ -122,11 +119,13 @@ public:
|
||||
/// \param[in] opencl_ilu_parallel whether to parallelize the ILU decomposition and application in OpenCL with level_scheduling
|
||||
/// \param[in] linsolver indicating the preconditioner, equal to the --linear-solver cmdline argument
|
||||
/// only ilu0, cpr_quasiimpes and isai are supported
|
||||
openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID,
|
||||
bool opencl_ilu_parallel, std::string linsolver);
|
||||
openclSolverBackend(int linear_solver_verbosity, int maxit, Scalar tolerance,
|
||||
unsigned int platformID, unsigned int deviceID,
|
||||
bool opencl_ilu_parallel, std::string linsolver);
|
||||
|
||||
/// For the CPR coarse solver
|
||||
openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, bool opencl_ilu_parallel);
|
||||
openclSolverBackend(int linear_solver_verbosity, int maxit,
|
||||
Scalar tolerance, bool opencl_ilu_parallel);
|
||||
|
||||
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
|
||||
/// \param[in] matrix matrix A
|
||||
@ -135,10 +134,10 @@ public:
|
||||
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
|
||||
/// \param[inout] res summary of solver result
|
||||
/// \return status code
|
||||
SolverStatus solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
|
||||
double *b,
|
||||
std::shared_ptr<BlockedMatrix<double>> jacMatrix,
|
||||
WellContributions<double>& wellContribs,
|
||||
SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||
Scalar* b,
|
||||
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
|
||||
WellContributions<Scalar>& wellContribs,
|
||||
BdaResult& res) override;
|
||||
|
||||
/// Solve scalar linear system, for example a coarse system of an AMG preconditioner
|
||||
@ -147,19 +146,16 @@ public:
|
||||
|
||||
/// Get result after linear solve, and peform postprocessing if necessary
|
||||
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
|
||||
void get_result(double *x) override;
|
||||
void get_result(Scalar* x) override;
|
||||
|
||||
/// Set OpenCL objects
|
||||
/// This class either creates them based on platformID and deviceID or receives them through this function
|
||||
/// \param[in] context the opencl context to be used
|
||||
/// \param[in] queue the opencl queue to be used
|
||||
void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue);
|
||||
|
||||
void setOpencl(std::shared_ptr<cl::Context>& context,
|
||||
std::shared_ptr<cl::CommandQueue>& queue);
|
||||
}; // end class openclSolverBackend
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
} // namespace Opm::Accelerator
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user