cusparseSolverBackend: template Scalar type

Arne Morten Kvarving 2024-04-16 10:29:33 +02:00
parent 18f42b51b2
commit 23250b87e3
3 changed files with 154 additions and 147 deletions

View File

@@ -73,7 +73,8 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string accelerator_mode
if (accelerator_mode.compare("cusparse") == 0) {
#if HAVE_CUDA
use_gpu = true;
backend.reset(new Opm::Accelerator::cusparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, deviceID));
using CU = Accelerator::cusparseSolverBackend<double,block_size>;
backend = std::make_unique<CU>(linear_solver_verbosity, maxit, tolerance, deviceID);
#else
OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake");
#endif
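Note: the bridge still pins the Scalar parameter to double at this call site. A minimal sketch of how the selection could forward a Scalar parameter once BdaBridge itself is templated on it; the createCusparseBackend helper below is hypothetical, not part of this commit:

#include <memory>

// Hypothetical helper: forwards a Scalar template parameter to the backend.
// BdaSolver<Scalar,block_size> is the base class declared in BdaSolver.hpp.
template<class Scalar, unsigned int block_size>
std::unique_ptr<Opm::Accelerator::BdaSolver<Scalar,block_size>>
createCusparseBackend(int verbosity, int maxit, Scalar tolerance, unsigned int deviceID)
{
    using CU = Opm::Accelerator::cusparseSolverBackend<Scalar,block_size>;
    return std::make_unique<CU>(verbosity, maxit, tolerance, deviceID);
}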

View File

@@ -44,23 +44,18 @@
extern std::shared_ptr<std::thread> copyThread;
#endif // HAVE_OPENMP
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
using Opm::OpmLog;
using Dune::Timer;
const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
const cusparseOperation_t operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;
template <unsigned int block_size>
cusparseSolverBackend<block_size>::
template<class Scalar, unsigned int block_size>
cusparseSolverBackend<Scalar, block_size>::
cusparseSolverBackend(int verbosity_, int maxit_,
double tolerance_, unsigned int deviceID_)
Scalar tolerance_, unsigned int deviceID_)
: Base(verbosity_, maxit_, tolerance_, deviceID_)
{
// initialize CUDA device, stream and libraries
@@ -70,7 +65,8 @@ cusparseSolverBackend(int verbosity_, int maxit_,
cudaGetDeviceProperties(&props, deviceID);
cudaCheckLastError("Could not get device properties");
std::ostringstream out;
out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor;
out << "Name GPU: " << props.name << ", Compute Capability: "
<< props.major << "." << props.minor;
OpmLog::info(out.str());
cudaStreamCreate(&stream);
@@ -87,28 +83,29 @@ cusparseSolverBackend(int verbosity_, int maxit_,
cudaCheckLastError("Could not set stream to cusparse");
}
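The constructor wires a single CUDA stream into both library handles, so every later kernel launch and copy in this backend is ordered on that stream. A condensed sketch of the setup pattern (call order assumed, error checks elided):

// Condensed sketch of the device/stream/handle setup done by the constructor.
cudaSetDevice(deviceID);                   // select the requested GPU
cudaStreamCreate(&stream);                 // one stream orders all backend work
cublasCreate(&cublasHandle);
cusparseCreate(&cusparseHandle);
cublasSetStream(cublasHandle, stream);     // cuBLAS calls run on 'stream'
cusparseSetStream(cusparseHandle, stream); // cuSPARSE calls run on 'stream'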
template <unsigned int block_size>
cusparseSolverBackend<block_size>::~cusparseSolverBackend() {
template<class Scalar, unsigned int block_size>
cusparseSolverBackend<Scalar,block_size>::~cusparseSolverBackend()
{
finalize();
}
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::
gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::
gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res)
{
Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
int n = N;
double rho = 1.0, rhop;
double alpha, nalpha, beta;
double omega, nomega, tmp1, tmp2;
double norm, norm_0;
double zero = 0.0;
double one = 1.0;
double mone = -1.0;
Scalar rho = 1.0, rhop;
Scalar alpha, nalpha, beta;
Scalar omega, nomega, tmp1, tmp2;
Scalar norm, norm_0;
Scalar zero = 0.0;
Scalar one = 1.0;
Scalar mone = -1.0;
float it;
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsCuda<double>&>(wellContribs).setCudaStream(stream);
static_cast<WellContributionsCuda<Scalar>&>(wellContribs).setCudaStream(stream);
}
cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
@@ -152,7 +149,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
// apply wellContributions
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsCuda<double>&>(wellContribs).apply(d_pw, d_v);
static_cast<WellContributionsCuda<Scalar>&>(wellContribs).apply(d_pw, d_v);
}
cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
@@ -183,7 +180,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
// apply wellContributions
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsCuda<double>&>(wellContribs).apply(d_s, d_t);
static_cast<WellContributionsCuda<Scalar>&>(wellContribs).apply(d_s, d_t);
}
cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
@@ -195,7 +192,6 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
if (norm < tolerance * norm_0) {
break;
}
@@ -215,16 +211,17 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
if (verbosity > 0) {
std::ostringstream out;
out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
", time per iteration: " << res.elapsed / it << ", iterations: " << it;
out << "=== converged: " << res.converged << ", conv_rate: "
<< res.conv_rate << ", time: " << res.elapsed
<< ", time per iteration: " << res.elapsed / it << ", iterations: " << it;
OpmLog::info(out.str());
}
}
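gpu_pbicgstab still calls the double-precision library entry points (cusparseDbsrmv, cublasDdot, cublasDnrm2), which stays correct because only Scalar = double is instantiated at the bottom of this file. If float support were added later, each call would need to dispatch on Scalar; a sketch for the SpMV, under that assumption (needs <type_traits>):

// Sketch only: select the cuSPARSE BSR SpMV entry point from Scalar.
// cusparseSbsrmv is the existing single-precision counterpart of cusparseDbsrmv.
if constexpr (std::is_same_v<Scalar, float>) {
    cusparseSbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one,
                   descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
} else {
    cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one,
                   descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
}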
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::
initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
std::shared_ptr<BlockedMatrix<double>> jacMatrix)
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::
initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
{
this->Nb = matrix->Nb;
this->N = Nb * block_size;
@@ -239,46 +236,49 @@ initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
}
std::ostringstream out;
out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnz: " << nnzb << " blocks\n";
out << "Initializing GPU, matrix size: " << Nb
<< " blockrows, nnz: " << nnzb << " blocks\n";
if (useJacMatrix) {
out << "Blocks in ILU matrix: " << nnzbs_prec << "\n";
}
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
out << "Maxit: " << maxit << std::scientific
<< ", tolerance: " << tolerance << "\n";
OpmLog::info(out.str());
cudaMalloc((void**)&d_x, sizeof(double) * N);
cudaMalloc((void**)&d_b, sizeof(double) * N);
cudaMalloc((void**)&d_r, sizeof(double) * N);
cudaMalloc((void**)&d_rw, sizeof(double) * N);
cudaMalloc((void**)&d_p, sizeof(double) * N);
cudaMalloc((void**)&d_pw, sizeof(double) * N);
cudaMalloc((void**)&d_s, sizeof(double) * N);
cudaMalloc((void**)&d_t, sizeof(double) * N);
cudaMalloc((void**)&d_v, sizeof(double) * N);
cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
cudaMalloc((void**)&d_x, sizeof(Scalar) * N);
cudaMalloc((void**)&d_b, sizeof(Scalar) * N);
cudaMalloc((void**)&d_r, sizeof(Scalar) * N);
cudaMalloc((void**)&d_rw, sizeof(Scalar) * N);
cudaMalloc((void**)&d_p, sizeof(Scalar) * N);
cudaMalloc((void**)&d_pw, sizeof(Scalar) * N);
cudaMalloc((void**)&d_s, sizeof(Scalar) * N);
cudaMalloc((void**)&d_t, sizeof(Scalar) * N);
cudaMalloc((void**)&d_v, sizeof(Scalar) * N);
cudaMalloc((void**)&d_bVals, sizeof(Scalar) * nnz);
cudaMalloc((void**)&d_bCols, sizeof(int) * nnzb);
cudaMalloc((void**)&d_bRows, sizeof(int) * (Nb + 1));
if (useJacMatrix) {
cudaMalloc((void**)&d_mVals, sizeof(double) * nnzbs_prec * block_size * block_size);
cudaMalloc((void**)&d_mVals, sizeof(Scalar) * nnzbs_prec * block_size * block_size);
cudaMalloc((void**)&d_mCols, sizeof(int) * nnzbs_prec);
cudaMalloc((void**)&d_mRows, sizeof(int) * (Nb + 1));
} else {
cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
cudaMalloc((void**)&d_mVals, sizeof(Scalar) * nnz);
d_mCols = d_bCols;
d_mRows = d_bRows;
}
cudaCheckLastError("Could not allocate enough memory on GPU");
#if COPY_ROW_BY_ROW
cudaMallocHost((void**)&vals_contiguous, sizeof(double) * nnz);
cudaMallocHost((void**)&vals_contiguous, sizeof(Scalar) * nnz);
cudaCheckLastError("Could not allocate pinned memory");
#endif
initialized = true;
} // end initialize()
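initialize() issues a long run of raw cudaMalloc calls and only checks the accumulated error state once at the end. A small hypothetical helper that allocates a typed device array and verifies each allocation immediately:

// Hypothetical helper, not in the source: typed device allocation with an
// immediate per-call error check (needs <cstddef>).
template<class T>
T* deviceAlloc(std::size_t count)
{
    T* ptr = nullptr;
    if (cudaMalloc(reinterpret_cast<void**>(&ptr), sizeof(T) * count) != cudaSuccess) {
        OPM_THROW(std::runtime_error, "Could not allocate enough memory on GPU");
    }
    return ptr;
}
// usage sketch: d_x = deviceAlloc<Scalar>(N);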
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::finalize() {
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::finalize()
{
if (initialized) {
cudaFree(d_x);
cudaFree(d_b);
@@ -314,44 +314,54 @@ void cusparseSolverBackend<block_size>::finalize() {
}
} // end finalize()
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::
copy_system_to_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
double *b,
std::shared_ptr<BlockedMatrix<double>> jacMatrix)
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::
copy_system_to_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
{
Timer t;
cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int),
cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int),
cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_b, b, N * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, N * sizeof(Scalar), stream);
#if COPY_ROW_BY_ROW
int sum = 0;
for (int i = 0; i < Nb; ++i) {
int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
memcpy(vals_contiguous + sum, matrix->nnzValues + sum,
size_row * sizeof(Scalar) * block_size * block_size);
sum += size_row * block_size * block_size;
}
cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bVals, vals_contiguous,
nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
#else
cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bVals, matrix->nnzValues,
nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
if (useJacMatrix) {
#if HAVE_OPENMP
if(omp_get_max_threads() > 1)
copyThread->join();
#endif
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
nnzbs_prec * block_size * block_size * sizeof(Scalar),
cudaMemcpyHostToDevice, stream);
} else {
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
cudaMemcpyAsync(d_mVals, d_bVals,
nnz * sizeof(Scalar),
cudaMemcpyDeviceToDevice, stream);
}
#endif
if (useJacMatrix) {
cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int),
cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int),
cudaMemcpyHostToDevice, stream);
}
if (verbosity >= 3) {
@@ -364,37 +374,43 @@ copy_system_to_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
}
} // end copy_system_to_gpu()
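The COPY_ROW_BY_ROW branch stages the values in vals_contiguous, which initialize() allocated with cudaMallocHost: a cudaMemcpyAsync from page-locked memory can overlap with other work on the stream, whereas a transfer from pageable memory degrades to a synchronous copy. The pattern in isolation (hostVals stands in for the assumed source array; needs <cstring>):

// Sketch: a pinned staging buffer enables a truly asynchronous H2D transfer.
Scalar* staging = nullptr;
cudaMallocHost(reinterpret_cast<void**>(&staging), sizeof(Scalar) * nnz);
std::memcpy(staging, hostVals, sizeof(Scalar) * nnz); // hostVals: assumed host data
cudaMemcpyAsync(d_bVals, staging, sizeof(Scalar) * nnz,
                cudaMemcpyHostToDevice, stream);
// ... later: cudaFreeHost(staging);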
// don't copy rowpointers and colindices, they stay the same
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::
update_system_on_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
double *b,
std::shared_ptr<BlockedMatrix<double>> jacMatrix)
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::
update_system_on_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
{
Timer t;
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
cudaMemcpyAsync(d_b, b, N * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, sizeof(Scalar) * N, stream);
#if COPY_ROW_BY_ROW
int sum = 0;
for (int i = 0; i < Nb; ++i) {
int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
memcpy(vals_contiguous + sum, matrix->nnzValues + sum,
size_row * sizeof(Scalar) * block_size * block_size);
sum += size_row * block_size * block_size;
}
cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bVals, vals_contiguous,
nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
#else
cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bVals, matrix->nnzValues,
nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
if (useJacMatrix) {
#if HAVE_OPENMP
if(omp_get_max_threads() > 1)
copyThread->join();
if (omp_get_max_threads() > 1) {
copyThread->join();
}
#endif
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
nnzbs_prec * block_size * block_size * sizeof(Scalar),
cudaMemcpyHostToDevice, stream);
} else {
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(Scalar),
cudaMemcpyDeviceToDevice, stream);
}
#endif
@@ -409,10 +425,9 @@ update_system_on_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
}
} // end update_system_on_gpu()
template <unsigned int block_size>
bool cusparseSolverBackend<block_size>::analyse_matrix() {
template<class Scalar, unsigned int block_size>
bool cusparseSolverBackend<Scalar,block_size>::analyse_matrix()
{
int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
Timer t;
@@ -487,8 +502,9 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
return true;
} // end analyse_matrix()
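analyse_matrix() queries separate workspace sizes for the ILU factorization (M) and the two triangular solves (L, U), then allocates one buffer that serves all three. A sketch of that pattern, consistent with the d_bufferSize_* variables above; the bsrilu02/bsrsv2 buffer-size queries themselves are elided (needs <algorithm>):

// Sketch: one shared workspace sized for ilu02 and both bsrsv2 solves.
d_bufferSize = std::max({d_bufferSize_M, d_bufferSize_L, d_bufferSize_U});
cudaMalloc(&d_buffer, d_bufferSize);
cudaCheckLastError("Could not allocate workspace buffer"); // message illustrative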
template <unsigned int block_size>
bool cusparseSolverBackend<block_size>::create_preconditioner() {
template<class Scalar, unsigned int block_size>
bool cusparseSolverBackend<Scalar,block_size>::create_preconditioner()
{
Timer t;
cusparseDbsrilu02(cusparseHandle, order, \
@@ -512,10 +528,9 @@ bool cusparseSolverBackend<block_size>::create_preconditioner() {
return true;
} // end create_preconditioner()
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::
solve_system(WellContributions<double>& wellContribs, BdaResult& res)
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::
solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res)
{
// actually solve
gpu_pbicgstab(wellContribs, res);
@@ -523,14 +538,14 @@ solve_system(WellContributions<double>& wellContribs, BdaResult& res)
cudaCheckLastError("Something went wrong during the GPU solve");
} // end solve_system()
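cudaCheckLastError turns CUDA's sticky error state into an exception. The helper is defined elsewhere in the bda utilities; a plausible minimal form, assumed rather than quoted (needs <string>):

// Assumed sketch of the error-check helper used throughout this file.
inline void cudaCheckLastError(const char* msg)
{
    const cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        OPM_THROW(std::logic_error,
                  std::string(msg) + ": " + cudaGetErrorString(err));
    }
}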
// copy result to host memory
// caller must be sure that x is a valid array
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::get_result(double *x) {
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::get_result(Scalar* x)
{
Timer t;
cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream);
cudaMemcpyAsync(x, d_x, N * sizeof(Scalar), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
if (verbosity > 2) {
@@ -540,12 +555,12 @@ void cusparseSolverBackend<block_size>::get_result(double *x) {
}
} // end get_result()
template <unsigned int block_size>
SolverStatus cusparseSolverBackend<block_size>::
solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
double *b,
std::shared_ptr<BlockedMatrix<double>> jacMatrix,
WellContributions<double>& wellContribs,
template<class Scalar, unsigned int block_size>
SolverStatus cusparseSolverBackend<Scalar,block_size>::
solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res)
{
if (initialized == false) {
@@ -567,18 +582,14 @@ solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
return SolverStatus::BDA_SOLVER_SUCCESS;
}
#define INSTANTIATE_TYPE(T) \
template class cusparseSolverBackend<T,1>; \
template class cusparseSolverBackend<T,2>; \
template class cusparseSolverBackend<T,3>; \
template class cusparseSolverBackend<T,4>; \
template class cusparseSolverBackend<T,5>; \
template class cusparseSolverBackend<T,6>;
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template cusparseSolverBackend<n>::cusparseSolverBackend(int, int, double, unsigned int); \
INSTANTIATE_TYPE(double)
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
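Only double is instantiated, so the 'D'-suffixed cuSPARSE/cuBLAS calls above always match the Scalar type. If single precision is wanted later, the same macro extends naturally, but only once those library calls dispatch on Scalar; the guard below is a hypothetical build flag, not part of this commit:

#if FLOW_INSTANTIATE_FLOAT // hypothetical flag; requires Scalar-dispatched library calls
INSTANTIATE_TYPE(float)
#endif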
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator

View File

@@ -28,16 +28,13 @@
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
#include <opm/simulators/linalg/bda/WellContributions.hpp>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
/// This class implements a cusparse-based ilu0-bicgstab solver on GPU
template <unsigned int block_size>
class cusparseSolverBackend : public BdaSolver<double,block_size> {
using Base = BdaSolver<double,block_size>;
template<class Scalar, unsigned int block_size>
class cusparseSolverBackend : public BdaSolver<Scalar,block_size>
{
using Base = BdaSolver<Scalar,block_size>;
using Base::N;
using Base::Nb;
@@ -57,13 +54,13 @@ private:
bsrilu02Info_t info_M;
bsrsv2Info_t info_L, info_U;
// b: bsr matrix, m: preconditioner
double *d_bVals, *d_mVals;
Scalar *d_bVals, *d_mVals;
int *d_bCols, *d_mCols;
int *d_bRows, *d_mRows;
double *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
double *d_pw, *d_s, *d_t, *d_v;
Scalar *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
Scalar *d_pw, *d_s, *d_t, *d_v;
void *d_buffer;
double *vals_contiguous; // only used if COPY_ROW_BY_ROW is true in cusparseSolverBackend.cpp
Scalar *vals_contiguous; // only used if COPY_ROW_BY_ROW is true in cusparseSolverBackend.cpp
bool analysis_done = false;
@@ -76,13 +73,13 @@ private:
/// Solve linear system using ilu0-bicgstab
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
void gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res);
void gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res);
/// Initialize GPU and allocate memory
/// \param[in] matrix matrix for spmv
/// \param[in] jacMatrix matrix for preconditioner
void initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
std::shared_ptr<BlockedMatrix<double>> jacMatrix);
void initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);
/// Clean memory
void finalize();
@@ -92,18 +89,18 @@ private:
/// \param[in] matrix matrix for spmv
/// \param[in] b input vector, contains N values
/// \param[in] jacMatrix matrix for preconditioner
void copy_system_to_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
double *b,
std::shared_ptr<BlockedMatrix<double>> jacMatrix);
void copy_system_to_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);
/// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
/// also copy matrix for preconditioner if needed
/// \param[in] matrix matrix for spmv
/// \param[in] b input vector, contains N values
/// \param[in] jacMatrix matrix for preconditioner
void update_system_on_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
double *b,
std::shared_ptr<BlockedMatrix<double>> jacMatrix);
void update_system_on_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);
/// Analyse sparsity pattern to extract parallelism
/// \return true iff analysis was successful
@@ -116,17 +113,16 @@ private:
/// Solve linear system
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
void solve_system(WellContributions<double>& wellContribs, BdaResult &res);
void solve_system(WellContributions<Scalar>& wellContribs, BdaResult &res);
public:
/// Construct a cusparseSolver
/// \param[in] linear_solver_verbosity verbosity of cusparseSolver
/// \param[in] maxit maximum number of iterations for cusparseSolver
/// \param[in] tolerance required relative tolerance for cusparseSolver
/// \param[in] deviceID the device to be used
cusparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int deviceID);
cusparseSolverBackend(int linear_solver_verbosity, int maxit,
Scalar tolerance, unsigned int deviceID);
/// Destroy a cusparseSolver, and free memory
~cusparseSolverBackend();
@@ -138,20 +134,19 @@ public:
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
/// \return status code
SolverStatus solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
double *b,
std::shared_ptr<BlockedMatrix<double>> jacMatrix,
WellContributions<double>& wellContribs,
SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res) override;
/// Get resulting vector x after linear solve, also includes post processing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void get_result(double *x) override;
void get_result(Scalar* x) override;
}; // end class cusparseSolverBackend
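A usage sketch of the public interface declared above; the system data is assumed to be prepared by the caller (as BdaBridge does), and the namespace qualifications are assumptions based on the includes:

// Illustrative only: exercise the templated public interface (block_size = 3).
void solveOnGpu(std::shared_ptr<Opm::Accelerator::BlockedMatrix<double>> matrix,
                double* b,
                std::shared_ptr<Opm::Accelerator::BlockedMatrix<double>> jacMatrix,
                Opm::WellContributions<double>& wellContribs,
                double* x)
{
    using Backend = Opm::Accelerator::cusparseSolverBackend<double,3>;
    Backend solver(/*verbosity=*/1, /*maxit=*/200, /*tolerance=*/1e-3, /*deviceID=*/0);
    Opm::Accelerator::BdaResult res;
    auto status = solver.solve_system(matrix, b, jacMatrix, wellContribs, res);
    if (status == Opm::Accelerator::SolverStatus::BDA_SOLVER_SUCCESS) {
        solver.get_result(x); // x must point to N = Nb * block_size doubles
    }
}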
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
#endif