cusparseSolverBackend: template Scalar type

Arne Morten Kvarving 2024-04-16 10:29:33 +02:00
parent 18f42b51b2
commit 23250b87e3
3 changed files with 154 additions and 147 deletions
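The change is mechanical but wide: every hard-coded double in the backend becomes a Scalar template parameter, and the .cpp gains explicit instantiations for double. A minimal, self-contained sketch of the pattern (hypothetical SolverSketch name; the real class carries the CUDA handles and device buffers shown in the diff below):

#include <cstddef>
#include <iostream>

// Sketch only: the scalar type is a template parameter next to the block
// size, so a future float build needs new instantiations, not code edits.
template<class Scalar, unsigned int block_size>
class SolverSketch {
    Scalar tolerance_;
public:
    explicit SolverSketch(Scalar tolerance) : tolerance_(tolerance) {}

    // Buffers are sized with sizeof(Scalar) instead of sizeof(double).
    std::size_t vector_bytes(std::size_t Nb) const {
        return Nb * block_size * sizeof(Scalar);
    }
};

// Explicit instantiation in the .cpp keeps template bodies out of the header;
// the commit does this for double with block sizes 1 through 6.
template class SolverSketch<double, 3>;

int main() {
    SolverSketch<double, 3> s(1e-2);
    std::cout << s.vector_bytes(10) << " bytes\n"; // 10 block rows -> 240 bytes
}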

View File: BdaBridge.cpp

@@ -73,7 +73,8 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
     if (accelerator_mode.compare("cusparse") == 0) {
 #if HAVE_CUDA
         use_gpu = true;
-        backend.reset(new Opm::Accelerator::cusparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, deviceID));
+        using CU = Accelerator::cusparseSolverBackend<double,block_size>;
+        backend = std::make_unique<CU>(linear_solver_verbosity, maxit, tolerance, deviceID);
 #else
         OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake");
 #endif
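On the bridge side the construction is unchanged in effect: Scalar is still pinned to double here, and the commit only routes it through a type alias while swapping reset(new ...) for std::make_unique. A compilable illustration of the same idiom, with hypothetical stand-in types:

#include <memory>

// Hypothetical stand-ins for the bridge's backend hierarchy.
struct BackendBase { virtual ~BackendBase() = default; };

template<class Scalar, unsigned int block_size>
struct Backend : BackendBase {
    Backend(int /*verbosity*/, int /*maxit*/, Scalar /*tol*/, unsigned int /*device*/) {}
};

int main() {
    std::unique_ptr<BackendBase> backend;
    // The alias keeps the long template-id readable; make_unique avoids a
    // bare new and reads the same as the reset(new ...) it replaces.
    using CU = Backend<double, 3>;
    backend = std::make_unique<CU>(0, 100, 1e-2, 0);
}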

View File: cusparseSolverBackend.cpp

@@ -44,23 +44,18 @@
 extern std::shared_ptr<std::thread> copyThread;
 #endif // HAVE_OPENMP

-namespace Opm
-{
-namespace Accelerator
-{
+namespace Opm::Accelerator {

-using Opm::OpmLog;
-
 using Dune::Timer;

 const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
 const cusparseOperation_t operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
 const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;

-template <unsigned int block_size>
-cusparseSolverBackend<block_size>::
-cusparseSolverBackend(int verbosity_, int maxit_,
-                      double tolerance_, unsigned int deviceID_)
+template<class Scalar, unsigned int block_size>
+cusparseSolverBackend<Scalar, block_size>::
+cusparseSolverBackend(int verbosity_, int maxit_,
+                      Scalar tolerance_, unsigned int deviceID_)
     : Base(verbosity_, maxit_, tolerance_, deviceID_)
 {
     // initialize CUDA device, stream and libraries
@@ -70,7 +65,8 @@ cusparseSolverBackend(int verbosity_, int maxit_,
     cudaGetDeviceProperties(&props, deviceID);
     cudaCheckLastError("Could not get device properties");
     std::ostringstream out;
-    out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor;
+    out << "Name GPU: " << props.name << ", Compute Capability: "
+        << props.major << "." << props.minor;
     OpmLog::info(out.str());

     cudaStreamCreate(&stream);
@@ -87,28 +83,29 @@ cusparseSolverBackend(int verbosity_, int maxit_,
     cudaCheckLastError("Could not set stream to cusparse");
 }

-template <unsigned int block_size>
-cusparseSolverBackend<block_size>::~cusparseSolverBackend() {
+template<class Scalar, unsigned int block_size>
+cusparseSolverBackend<Scalar,block_size>::~cusparseSolverBackend()
+{
     finalize();
 }

-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::
-gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::
+gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res)
 {
     Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
     int n = N;
-    double rho = 1.0, rhop;
-    double alpha, nalpha, beta;
-    double omega, nomega, tmp1, tmp2;
-    double norm, norm_0;
-    double zero = 0.0;
-    double one = 1.0;
-    double mone = -1.0;
+    Scalar rho = 1.0, rhop;
+    Scalar alpha, nalpha, beta;
+    Scalar omega, nomega, tmp1, tmp2;
+    Scalar norm, norm_0;
+    Scalar zero = 0.0;
+    Scalar one = 1.0;
+    Scalar mone = -1.0;
     float it;

     if (wellContribs.getNumWells() > 0) {
-        static_cast<WellContributionsCuda<double>&>(wellContribs).setCudaStream(stream);
+        static_cast<WellContributionsCuda<Scalar>&>(wellContribs).setCudaStream(stream);
     }

     cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
@@ -152,7 +149,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
         // apply wellContributions
         if (wellContribs.getNumWells() > 0) {
-            static_cast<WellContributionsCuda<double>&>(wellContribs).apply(d_pw, d_v);
+            static_cast<WellContributionsCuda<Scalar>&>(wellContribs).apply(d_pw, d_v);
         }

         cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
@@ -183,7 +180,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
         // apply wellContributions
         if (wellContribs.getNumWells() > 0) {
-            static_cast<WellContributionsCuda<double>&>(wellContribs).apply(d_s, d_t);
+            static_cast<WellContributionsCuda<Scalar>&>(wellContribs).apply(d_s, d_t);
         }

         cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
@@ -195,7 +192,6 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
         cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
-
         if (norm < tolerance * norm_0) {
             break;
         }
@@ -215,16 +211,17 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
     if (verbosity > 0) {
         std::ostringstream out;
-        out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
-            ", time per iteration: " << res.elapsed / it << ", iterations: " << it;
+        out << "=== converged: " << res.converged << ", conv_rate: "
+            << res.conv_rate << ", time: " << res.elapsed
+            << ", time per iteration: " << res.elapsed / it << ", iterations: " << it;
         OpmLog::info(out.str());
     }
 }

-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::
-initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
-           std::shared_ptr<BlockedMatrix<double>> jacMatrix)
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::
+initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+           std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
 {
     this->Nb = matrix->Nb;
     this->N = Nb * block_size;
@@ -239,46 +236,49 @@ initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
     }

     std::ostringstream out;
-    out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnz: " << nnzb << " blocks\n";
+    out << "Initializing GPU, matrix size: " << Nb
+        << " blockrows, nnz: " << nnzb << " blocks\n";
     if (useJacMatrix) {
         out << "Blocks in ILU matrix: " << nnzbs_prec << "\n";
     }
-    out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
+    out << "Maxit: " << maxit << std::scientific
+        << ", tolerance: " << tolerance << "\n";
     OpmLog::info(out.str());

-    cudaMalloc((void**)&d_x, sizeof(double) * N);
-    cudaMalloc((void**)&d_b, sizeof(double) * N);
-    cudaMalloc((void**)&d_r, sizeof(double) * N);
-    cudaMalloc((void**)&d_rw, sizeof(double) * N);
-    cudaMalloc((void**)&d_p, sizeof(double) * N);
-    cudaMalloc((void**)&d_pw, sizeof(double) * N);
-    cudaMalloc((void**)&d_s, sizeof(double) * N);
-    cudaMalloc((void**)&d_t, sizeof(double) * N);
-    cudaMalloc((void**)&d_v, sizeof(double) * N);
-    cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
+    cudaMalloc((void**)&d_x, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_b, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_r, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_rw, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_p, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_pw, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_s, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_t, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_v, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_bVals, sizeof(Scalar) * nnz);
     cudaMalloc((void**)&d_bCols, sizeof(int) * nnzb);
     cudaMalloc((void**)&d_bRows, sizeof(int) * (Nb + 1));
     if (useJacMatrix) {
-        cudaMalloc((void**)&d_mVals, sizeof(double) * nnzbs_prec * block_size * block_size);
+        cudaMalloc((void**)&d_mVals, sizeof(Scalar) * nnzbs_prec * block_size * block_size);
         cudaMalloc((void**)&d_mCols, sizeof(int) * nnzbs_prec);
         cudaMalloc((void**)&d_mRows, sizeof(int) * (Nb + 1));
     } else {
-        cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
+        cudaMalloc((void**)&d_mVals, sizeof(Scalar) * nnz);
         d_mCols = d_bCols;
        d_mRows = d_bRows;
     }
     cudaCheckLastError("Could not allocate enough memory on GPU");

 #if COPY_ROW_BY_ROW
-    cudaMallocHost((void**)&vals_contiguous, sizeof(double) * nnz);
+    cudaMallocHost((void**)&vals_contiguous, sizeof(Scalar) * nnz);
     cudaCheckLastError("Could not allocate pinned memory");
 #endif

     initialized = true;
 } // end initialize()

-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::finalize() {
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::finalize()
+{
     if (initialized) {
         cudaFree(d_x);
         cudaFree(d_b);
@@ -314,44 +314,54 @@ void cusparseSolverBackend<block_size>::finalize() {
     }
 } // end finalize()

-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::
-copy_system_to_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
-                   double *b,
-                   std::shared_ptr<BlockedMatrix<double>> jacMatrix)
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::
+copy_system_to_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                   Scalar* b,
+                   std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
 {
     Timer t;

-    cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
-    cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
+    cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int),
+                    cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int),
+                    cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_b, b, N * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
+    cudaMemsetAsync(d_x, 0, N * sizeof(Scalar), stream);

 #if COPY_ROW_BY_ROW
     int sum = 0;
     for (int i = 0; i < Nb; ++i) {
         int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
-        memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
+        memcpy(vals_contiguous + sum, matrix->nnzValues + sum,
+               size_row * sizeof(Scalar) * block_size * block_size);
         sum += size_row * block_size * block_size;
     }

-    cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bVals, vals_contiguous,
+                    nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
 #else
-    cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bVals, matrix->nnzValues,
+                    nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);

     if (useJacMatrix) {
 #if HAVE_OPENMP
         if(omp_get_max_threads() > 1)
             copyThread->join();
 #endif
-        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
+        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
+                        nnzbs_prec * block_size * block_size * sizeof(Scalar),
+                        cudaMemcpyHostToDevice, stream);
     } else {
-        cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
+        cudaMemcpyAsync(d_mVals, d_bVals,
+                        nnz * sizeof(Scalar),
+                        cudaMemcpyDeviceToDevice, stream);
     }
 #endif

     if (useJacMatrix) {
-        cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int), cudaMemcpyHostToDevice, stream);
-        cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
+        cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int),
+                        cudaMemcpyHostToDevice, stream);
+        cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int),
+                        cudaMemcpyHostToDevice, stream);
     }

     if (verbosity >= 3) {
@@ -364,37 +374,43 @@ copy_system_to_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
     }
 } // end copy_system_to_gpu()

 // don't copy rowpointers and colindices, they stay the same
-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::
-update_system_on_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
-                     double *b,
-                     std::shared_ptr<BlockedMatrix<double>> jacMatrix)
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::
+update_system_on_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                     Scalar* b,
+                     std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
 {
     Timer t;

-    cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
-    cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
+    cudaMemcpyAsync(d_b, b, N * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
+    cudaMemsetAsync(d_x, 0, sizeof(Scalar) * N, stream);

 #if COPY_ROW_BY_ROW
     int sum = 0;
     for (int i = 0; i < Nb; ++i) {
         int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
-        memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
+        memcpy(vals_contiguous + sum, matrix->nnzValues + sum,
+               size_row * sizeof(Scalar) * block_size * block_size);
         sum += size_row * block_size * block_size;
     }

-    cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bVals, vals_contiguous,
+                    nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
 #else
-    cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bVals, matrix->nnzValues,
+                    nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);

     if (useJacMatrix) {
 #if HAVE_OPENMP
-        if(omp_get_max_threads() > 1)
+        if (omp_get_max_threads() > 1) {
             copyThread->join();
+        }
 #endif
-        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
+        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
+                        nnzbs_prec * block_size * block_size * sizeof(Scalar),
+                        cudaMemcpyHostToDevice, stream);
     } else {
-        cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
+        cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(Scalar),
+                        cudaMemcpyDeviceToDevice, stream);
     }
 #endif
@@ -409,10 +425,9 @@ update_system_on_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
     }
 } // end update_system_on_gpu()

-
-template <unsigned int block_size>
-bool cusparseSolverBackend<block_size>::analyse_matrix() {
+template<class Scalar, unsigned int block_size>
+bool cusparseSolverBackend<Scalar,block_size>::analyse_matrix()
+{
     int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
     Timer t;
@@ -487,8 +502,9 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
     return true;
 } // end analyse_matrix()

-template <unsigned int block_size>
-bool cusparseSolverBackend<block_size>::create_preconditioner() {
+template<class Scalar, unsigned int block_size>
+bool cusparseSolverBackend<Scalar,block_size>::create_preconditioner()
+{
     Timer t;

     cusparseDbsrilu02(cusparseHandle, order, \
@@ -512,10 +528,9 @@ bool cusparseSolverBackend<block_size>::create_preconditioner() {
     return true;
 } // end create_preconditioner()

-
-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::
-solve_system(WellContributions<double>& wellContribs, BdaResult& res)
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::
+solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res)
 {
     // actually solve
     gpu_pbicgstab(wellContribs, res);
@@ -523,14 +538,14 @@ solve_system(WellContributions<double>& wellContribs, BdaResult& res)
     cudaCheckLastError("Something went wrong during the GPU solve");
 } // end solve_system()

 // copy result to host memory
 // caller must be sure that x is a valid array
-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::get_result(double *x) {
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::get_result(Scalar* x)
+{
     Timer t;

-    cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream);
+    cudaMemcpyAsync(x, d_x, N * sizeof(Scalar), cudaMemcpyDeviceToHost, stream);
     cudaStreamSynchronize(stream);

     if (verbosity > 2) {
@@ -540,12 +555,12 @@ void cusparseSolverBackend<block_size>::get_result(double *x) {
     }
 } // end get_result()

-template <unsigned int block_size>
-SolverStatus cusparseSolverBackend<block_size>::
-solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
-             double *b,
-             std::shared_ptr<BlockedMatrix<double>> jacMatrix,
-             WellContributions<double>& wellContribs,
+template<class Scalar, unsigned int block_size>
+SolverStatus cusparseSolverBackend<Scalar,block_size>::
+solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+             Scalar* b,
+             std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
+             WellContributions<Scalar>& wellContribs,
              BdaResult& res)
 {
     if (initialized == false) {
@@ -567,18 +582,14 @@ solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
     return SolverStatus::BDA_SOLVER_SUCCESS;
 }

-#define INSTANTIATE_BDA_FUNCTIONS(n) \
-template cusparseSolverBackend<n>::cusparseSolverBackend(int, int, double, unsigned int); \
-
-INSTANTIATE_BDA_FUNCTIONS(1);
-INSTANTIATE_BDA_FUNCTIONS(2);
-INSTANTIATE_BDA_FUNCTIONS(3);
-INSTANTIATE_BDA_FUNCTIONS(4);
-INSTANTIATE_BDA_FUNCTIONS(5);
-INSTANTIATE_BDA_FUNCTIONS(6);
-
-#undef INSTANTIATE_BDA_FUNCTIONS
-
-} // namespace Accelerator
-} // namespace Opm
+#define INSTANTIATE_TYPE(T)                    \
+    template class cusparseSolverBackend<T,1>; \
+    template class cusparseSolverBackend<T,2>; \
+    template class cusparseSolverBackend<T,3>; \
+    template class cusparseSolverBackend<T,4>; \
+    template class cusparseSolverBackend<T,5>; \
+    template class cusparseSolverBackend<T,6>;
+
+INSTANTIATE_TYPE(double)
+
+} // namespace Opm::Accelerator
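Note that the kernel calls inside the now-templated bodies are still the double-precision entry points (cusparseDbsrmv, cublasDdot, cublasDnrm2, cusparseDbsrilu02). That is consistent because only INSTANTIATE_TYPE(double) is emitted; adding INSTANTIATE_TYPE(float) later would also require per-scalar dispatch to the S-suffixed variants. A sketch of one way to do that, not part of this commit (hypothetical scalarDot helper):

#include <cublas_v2.h>
#include <type_traits>

// Hypothetical helper: pick the cuBLAS dot product matching Scalar
// (double or float), so templated solver code compiles for both.
template<class Scalar>
cublasStatus_t scalarDot(cublasHandle_t handle, int n,
                         const Scalar* x, const Scalar* y, Scalar* result)
{
    if constexpr (std::is_same_v<Scalar, double>) {
        return cublasDdot(handle, n, x, 1, y, 1, result);
    } else {
        return cublasSdot(handle, n, x, 1, y, 1, result);
    }
}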

View File: cusparseSolverBackend.hpp

@@ -28,16 +28,13 @@
 #include <opm/simulators/linalg/bda/BdaSolver.hpp>
 #include <opm/simulators/linalg/bda/WellContributions.hpp>

-namespace Opm
-{
-namespace Accelerator
-{
+namespace Opm::Accelerator {

 /// This class implements a cusparse-based ilu0-bicgstab solver on GPU
-template <unsigned int block_size>
-class cusparseSolverBackend : public BdaSolver<double,block_size> {
-    using Base = BdaSolver<double,block_size>;
+template<class Scalar, unsigned int block_size>
+class cusparseSolverBackend : public BdaSolver<Scalar,block_size>
+{
+    using Base = BdaSolver<Scalar,block_size>;

     using Base::N;
     using Base::Nb;
@@ -57,13 +54,13 @@ private:
     bsrilu02Info_t info_M;
     bsrsv2Info_t info_L, info_U;
     // b: bsr matrix, m: preconditioner
-    double *d_bVals, *d_mVals;
+    Scalar *d_bVals, *d_mVals;
     int *d_bCols, *d_mCols;
     int *d_bRows, *d_mRows;
-    double *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
-    double *d_pw, *d_s, *d_t, *d_v;
+    Scalar *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
+    Scalar *d_pw, *d_s, *d_t, *d_v;
     void *d_buffer;
-    double *vals_contiguous; // only used if COPY_ROW_BY_ROW is true in cusparseSolverBackend.cpp
+    Scalar *vals_contiguous; // only used if COPY_ROW_BY_ROW is true in cusparseSolverBackend.cpp

     bool analysis_done = false;
@@ -76,13 +73,13 @@ private:
     /// Solve linear system using ilu0-bicgstab
     /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res summary of solver result
-    void gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res);
+    void gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res);

     /// Initialize GPU and allocate memory
     /// \param[in] matrix matrix for spmv
     /// \param[in] jacMatrix matrix for preconditioner
-    void initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
-                    std::shared_ptr<BlockedMatrix<double>> jacMatrix);
+    void initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                    std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);

     /// Clean memory
     void finalize();
@@ -92,18 +89,18 @@ private:
     /// \param[in] matrix matrix for spmv
     /// \param[in] b input vector, contains N values
     /// \param[in] jacMatrix matrix for preconditioner
-    void copy_system_to_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
-                            double *b,
-                            std::shared_ptr<BlockedMatrix<double>> jacMatrix);
+    void copy_system_to_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                            Scalar* b,
+                            std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);

     /// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
     /// also copy matrix for preconditioner if needed
     /// \param[in] matrix matrix for spmv
     /// \param[in] b input vector, contains N values
     /// \param[in] jacMatrix matrix for preconditioner
-    void update_system_on_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
-                              double *b,
-                              std::shared_ptr<BlockedMatrix<double>> jacMatrix);
+    void update_system_on_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                              Scalar* b,
+                              std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);

     /// Analyse sparsity pattern to extract parallelism
     /// \return true iff analysis was successful
@@ -116,17 +113,16 @@ private:
     /// Solve linear system
     /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res summary of solver result
-    void solve_system(WellContributions<double>& wellContribs, BdaResult &res);
+    void solve_system(WellContributions<Scalar>& wellContribs, BdaResult &res);

 public:
     /// Construct a cusparseSolver
     /// \param[in] linear_solver_verbosity verbosity of cusparseSolver
     /// \param[in] maxit maximum number of iterations for cusparseSolver
     /// \param[in] tolerance required relative tolerance for cusparseSolver
     /// \param[in] deviceID the device to be used
-    cusparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int deviceID);
+    cusparseSolverBackend(int linear_solver_verbosity, int maxit,
+                          Scalar tolerance, unsigned int deviceID);

     /// Destroy a cusparseSolver, and free memory
     ~cusparseSolverBackend();
@@ -138,20 +134,19 @@ public:
     /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res summary of solver result
     /// \return status code
-    SolverStatus solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
-                              double *b,
-                              std::shared_ptr<BlockedMatrix<double>> jacMatrix,
-                              WellContributions<double>& wellContribs,
+    SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                              Scalar* b,
+                              std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
+                              WellContributions<Scalar>& wellContribs,
                               BdaResult& res) override;

     /// Get resulting vector x after linear solve, also includes post processing if necessary
     /// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
-    void get_result(double *x) override;
+    void get_result(Scalar* x) override;

 }; // end class cusparseSolverBackend

-} // namespace Accelerator
-} // namespace Opm
+} // namespace Opm::Accelerator

 #endif