cusparseSolverBackend: template Scalar type
parent 18f42b51b2
commit 23250b87e3
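This commit makes the scalar type of cusparseSolverBackend a template parameter: the class goes from being templated on block_size alone to being templated on (Scalar, block_size), with double replaced by Scalar in signatures, device buffers, and size computations. double remains the only instantiated type, via the new INSTANTIATE_TYPE(double). In outline (a before/after sketch of the declaration, not part of the diff itself):

    // before: scalar type hard-coded to double
    template <unsigned int block_size>
    class cusparseSolverBackend : public BdaSolver<double,block_size> { /* ... */ };

    // after: scalar type is a template parameter
    template<class Scalar, unsigned int block_size>
    class cusparseSolverBackend : public BdaSolver<Scalar,block_size> { /* ... */ };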
@@ -73,7 +73,8 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
     if (accelerator_mode.compare("cusparse") == 0) {
 #if HAVE_CUDA
         use_gpu = true;
-        backend.reset(new Opm::Accelerator::cusparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, deviceID));
+        using CU = Accelerator::cusparseSolverBackend<double,block_size>;
+        backend = std::make_unique<CU>(linear_solver_verbosity, maxit, tolerance, deviceID);
 #else
         OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake");
 #endif
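Besides threading through the new Scalar parameter, the bridge swaps backend.reset(new ...) for std::make_unique, dropping the naked new; the CU alias keeps the long constructor call readable. A generic before/after with a hypothetical type:

    #include <memory>

    struct Widget { Widget(int, double) {} };

    void demo()
    {
        std::unique_ptr<Widget> w;
        w.reset(new Widget(1, 2.0));          // old style: naked new handed to reset()
        w = std::make_unique<Widget>(1, 2.0); // preferred: construction and ownership in one step
    }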
@@ -44,23 +44,18 @@
 extern std::shared_ptr<std::thread> copyThread;
 #endif // HAVE_OPENMP

-namespace Opm
-{
-namespace Accelerator
-{
+namespace Opm::Accelerator {

 using Opm::OpmLog;
 using Dune::Timer;

 const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
 const cusparseOperation_t operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
 const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;


-template <unsigned int block_size>
-cusparseSolverBackend<block_size>::
+template<class Scalar, unsigned int block_size>
+cusparseSolverBackend<Scalar, block_size>::
 cusparseSolverBackend(int verbosity_, int maxit_,
-                      double tolerance_, unsigned int deviceID_)
+                      Scalar tolerance_, unsigned int deviceID_)
     : Base(verbosity_, maxit_, tolerance_, deviceID_)
 {
     // initialize CUDA device, stream and libraries
@@ -70,7 +65,8 @@ cusparseSolverBackend(int verbosity_, int maxit_,
     cudaGetDeviceProperties(&props, deviceID);
     cudaCheckLastError("Could not get device properties");
     std::ostringstream out;
-    out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor;
+    out << "Name GPU: " << props.name << ", Compute Capability: "
+        << props.major << "." << props.minor;
     OpmLog::info(out.str());

     cudaStreamCreate(&stream);
@@ -87,28 +83,29 @@ cusparseSolverBackend(int verbosity_, int maxit_,
     cudaCheckLastError("Could not set stream to cusparse");
 }

-template <unsigned int block_size>
-cusparseSolverBackend<block_size>::~cusparseSolverBackend() {
+template<class Scalar, unsigned int block_size>
+cusparseSolverBackend<Scalar,block_size>::~cusparseSolverBackend()
+{
     finalize();
 }

-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::
-gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::
+gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res)
 {
     Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
     int n = N;
-    double rho = 1.0, rhop;
-    double alpha, nalpha, beta;
-    double omega, nomega, tmp1, tmp2;
-    double norm, norm_0;
-    double zero = 0.0;
-    double one = 1.0;
-    double mone = -1.0;
+    Scalar rho = 1.0, rhop;
+    Scalar alpha, nalpha, beta;
+    Scalar omega, nomega, tmp1, tmp2;
+    Scalar norm, norm_0;
+    Scalar zero = 0.0;
+    Scalar one = 1.0;
+    Scalar mone = -1.0;
     float it;

     if (wellContribs.getNumWells() > 0) {
-        static_cast<WellContributionsCuda<double>&>(wellContribs).setCudaStream(stream);
+        static_cast<WellContributionsCuda<Scalar>&>(wellContribs).setCudaStream(stream);
     }

     cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
@@ -152,7 +149,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)

         // apply wellContributions
         if (wellContribs.getNumWells() > 0) {
-            static_cast<WellContributionsCuda<double>&>(wellContribs).apply(d_pw, d_v);
+            static_cast<WellContributionsCuda<Scalar>&>(wellContribs).apply(d_pw, d_v);
         }

         cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
@@ -183,7 +180,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)

         // apply wellContributions
         if (wellContribs.getNumWells() > 0) {
-            static_cast<WellContributionsCuda<double>&>(wellContribs).apply(d_s, d_t);
+            static_cast<WellContributionsCuda<Scalar>&>(wellContribs).apply(d_s, d_t);
         }

         cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
@@ -195,7 +192,6 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)

         cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
-

         if (norm < tolerance * norm_0) {
             break;
         }
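Note that the Scalar-typed paths above still call the double-precision entry points cublasDdot, cublasDnrm2, and cusparseDbsrmv (the 'D' in the name is the value type), so only Scalar = double can actually run until those calls are dispatched on the scalar type. A minimal sketch of such a dispatch layer; the wrapper name dot() is hypothetical, while the cuBLAS functions and their signatures are real:

    #include <cublas_v2.h>

    // Overloads pick the right cuBLAS entry point from the pointer type,
    // so templated code can simply call dot(handle, n, x, y, &result).
    inline cublasStatus_t dot(cublasHandle_t h, int n,
                              const double* x, const double* y, double* result)
    {
        return cublasDdot(h, n, x, 1, y, 1, result);
    }

    inline cublasStatus_t dot(cublasHandle_t h, int n,
                              const float* x, const float* y, float* result)
    {
        return cublasSdot(h, n, x, 1, y, 1, result);
    }

With wrappers like these, the call sites in gpu_pbicgstab would stay identical for float and double.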
@@ -215,16 +211,17 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)

     if (verbosity > 0) {
         std::ostringstream out;
-        out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
-            ", time per iteration: " << res.elapsed / it << ", iterations: " << it;
+        out << "=== converged: " << res.converged << ", conv_rate: "
+            << res.conv_rate << ", time: " << res.elapsed
+            << ", time per iteration: " << res.elapsed / it << ", iterations: " << it;
         OpmLog::info(out.str());
     }
 }

-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::
-initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
-           std::shared_ptr<BlockedMatrix<double>> jacMatrix)
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::
+initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+           std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
 {
     this->Nb = matrix->Nb;
     this->N = Nb * block_size;
@@ -239,46 +236,49 @@ initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
     }

     std::ostringstream out;
-    out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnz: " << nnzb << " blocks\n";
+    out << "Initializing GPU, matrix size: " << Nb
+        << " blockrows, nnz: " << nnzb << " blocks\n";
     if (useJacMatrix) {
         out << "Blocks in ILU matrix: " << nnzbs_prec << "\n";
     }
-    out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
+    out << "Maxit: " << maxit << std::scientific
+        << ", tolerance: " << tolerance << "\n";
     OpmLog::info(out.str());

-    cudaMalloc((void**)&d_x, sizeof(double) * N);
-    cudaMalloc((void**)&d_b, sizeof(double) * N);
-    cudaMalloc((void**)&d_r, sizeof(double) * N);
-    cudaMalloc((void**)&d_rw, sizeof(double) * N);
-    cudaMalloc((void**)&d_p, sizeof(double) * N);
-    cudaMalloc((void**)&d_pw, sizeof(double) * N);
-    cudaMalloc((void**)&d_s, sizeof(double) * N);
-    cudaMalloc((void**)&d_t, sizeof(double) * N);
-    cudaMalloc((void**)&d_v, sizeof(double) * N);
-    cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
+    cudaMalloc((void**)&d_x, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_b, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_r, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_rw, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_p, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_pw, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_s, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_t, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_v, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_bVals, sizeof(Scalar) * nnz);
     cudaMalloc((void**)&d_bCols, sizeof(int) * nnzb);
     cudaMalloc((void**)&d_bRows, sizeof(int) * (Nb + 1));
     if (useJacMatrix) {
-        cudaMalloc((void**)&d_mVals, sizeof(double) * nnzbs_prec * block_size * block_size);
+        cudaMalloc((void**)&d_mVals, sizeof(Scalar) * nnzbs_prec * block_size * block_size);
         cudaMalloc((void**)&d_mCols, sizeof(int) * nnzbs_prec);
         cudaMalloc((void**)&d_mRows, sizeof(int) * (Nb + 1));
     } else {
-        cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
+        cudaMalloc((void**)&d_mVals, sizeof(Scalar) * nnz);
         d_mCols = d_bCols;
         d_mRows = d_bRows;
     }
     cudaCheckLastError("Could not allocate enough memory on GPU");

 #if COPY_ROW_BY_ROW
-    cudaMallocHost((void**)&vals_contiguous, sizeof(double) * nnz);
+    cudaMallocHost((void**)&vals_contiguous, sizeof(Scalar) * nnz);
     cudaCheckLastError("Could not allocate pinned memory");
 #endif

     initialized = true;
 } // end initialize()

-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::finalize() {
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::finalize()
+{
     if (initialized) {
         cudaFree(d_x);
         cudaFree(d_b);
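The sizes in these allocations all derive from the block layout set up at the top of initialize(): N = Nb * block_size scalar rows per vector, and nnz = nnzb * block_size * block_size scalar values in the block-sparse matrix. A worked example with invented numbers:

    // Illustrative numbers only: a block_size = 3 system with 1000 block rows
    // and 4800 nonzero blocks.
    constexpr unsigned int block_size = 3;
    const int Nb   = 1000;                            // block rows
    const int nnzb = 4800;                            // nonzero blocks
    const int N    = Nb * block_size;                 // 3000 scalar rows, e.g. for d_x
    const int nnz  = nnzb * block_size * block_size;  // 43200 scalar values, e.g. for d_bVals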
@@ -314,44 +314,54 @@ void cusparseSolverBackend<block_size>::finalize() {
     }
 } // end finalize()


-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::
-copy_system_to_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
-                   double *b,
-                   std::shared_ptr<BlockedMatrix<double>> jacMatrix)
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::
+copy_system_to_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                   Scalar* b,
+                   std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
 {
     Timer t;

-    cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
-    cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
+    cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int),
+                    cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int),
+                    cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_b, b, N * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
+    cudaMemsetAsync(d_x, 0, N * sizeof(Scalar), stream);

 #if COPY_ROW_BY_ROW
     int sum = 0;
     for (int i = 0; i < Nb; ++i) {
         int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
-        memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
+        memcpy(vals_contiguous + sum, matrix->nnzValues + sum,
+               size_row * sizeof(Scalar) * block_size * block_size);
         sum += size_row * block_size * block_size;
     }
-    cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bVals, vals_contiguous,
+                    nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
 #else
-    cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bVals, matrix->nnzValues,
+                    nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
     if (useJacMatrix) {
 #if HAVE_OPENMP
         if(omp_get_max_threads() > 1)
             copyThread->join();
 #endif
-        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
+        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
+                        nnzbs_prec * block_size * block_size * sizeof(Scalar),
+                        cudaMemcpyHostToDevice, stream);
     } else {
-        cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
+        cudaMemcpyAsync(d_mVals, d_bVals,
+                        nnz * sizeof(Scalar),
+                        cudaMemcpyDeviceToDevice, stream);
     }
 #endif

     if (useJacMatrix) {
-        cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int), cudaMemcpyHostToDevice, stream);
-        cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
+        cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int),
+                        cudaMemcpyHostToDevice, stream);
+        cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int),
+                        cudaMemcpyHostToDevice, stream);
     }

     if (verbosity >= 3) {
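Both copy paths rely on cudaMemcpyAsync over the solver's stream; the COPY_ROW_BY_ROW variant first compacts the rows into vals_contiguous, which initialize() allocated with cudaMallocHost, since the copy can only overlap with other work when the host buffer is page-locked. A self-contained sketch of that pattern, with hypothetical function and variable names:

    #include <algorithm>
    #include <cuda_runtime.h>

    // Pinned (page-locked) host staging buffer plus async copy on a stream.
    // With ordinary pageable memory, cudaMemcpyAsync degrades to a synchronous copy.
    void stage_and_upload(const double* src, double* d_dst, size_t n, cudaStream_t s)
    {
        double* pinned = nullptr;
        cudaMallocHost((void**)&pinned, n * sizeof(double)); // page-locked allocation
        std::copy(src, src + n, pinned);                     // CPU-side staging
        cudaMemcpyAsync(d_dst, pinned, n * sizeof(double),
                        cudaMemcpyHostToDevice, s);          // truly asynchronous
        cudaStreamSynchronize(s);                            // wait before reusing the buffer
        cudaFreeHost(pinned);
    }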
@@ -364,37 +374,43 @@ copy_system_to_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
     }
 } // end copy_system_to_gpu()


 // don't copy rowpointers and colindices, they stay the same
-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::
-update_system_on_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
-                     double *b,
-                     std::shared_ptr<BlockedMatrix<double>> jacMatrix)
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::
+update_system_on_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                     Scalar* b,
+                     std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
 {
     Timer t;

-    cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
-    cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
+    cudaMemcpyAsync(d_b, b, N * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
+    cudaMemsetAsync(d_x, 0, sizeof(Scalar) * N, stream);

 #if COPY_ROW_BY_ROW
     int sum = 0;
     for (int i = 0; i < Nb; ++i) {
         int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
-        memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
+        memcpy(vals_contiguous + sum, matrix->nnzValues + sum,
+               size_row * sizeof(Scalar) * block_size * block_size);
         sum += size_row * block_size * block_size;
     }
-    cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bVals, vals_contiguous,
+                    nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
 #else
-    cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bVals, matrix->nnzValues,
+                    nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
     if (useJacMatrix) {
 #if HAVE_OPENMP
-        if(omp_get_max_threads() > 1)
-            copyThread->join();
+        if (omp_get_max_threads() > 1) {
+            copyThread->join();
+        }
 #endif
-        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
+        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
+                        nnzbs_prec * block_size * block_size * sizeof(Scalar),
+                        cudaMemcpyHostToDevice, stream);
     } else {
-        cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
+        cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(Scalar),
+                        cudaMemcpyDeviceToDevice, stream);
     }
 #endif

@@ -409,10 +425,9 @@ update_system_on_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
     }
 } // end update_system_on_gpu()


-template <unsigned int block_size>
-bool cusparseSolverBackend<block_size>::analyse_matrix() {
-
+template<class Scalar, unsigned int block_size>
+bool cusparseSolverBackend<Scalar,block_size>::analyse_matrix()
+{
     int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
     Timer t;

@@ -487,8 +502,9 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
     return true;
 } // end analyse_matrix()

-template <unsigned int block_size>
-bool cusparseSolverBackend<block_size>::create_preconditioner() {
+template<class Scalar, unsigned int block_size>
+bool cusparseSolverBackend<Scalar,block_size>::create_preconditioner()
+{
     Timer t;

     cusparseDbsrilu02(cusparseHandle, order, \
@@ -512,10 +528,9 @@ bool cusparseSolverBackend<block_size>::create_preconditioner() {
     return true;
 } // end create_preconditioner()


-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::
-solve_system(WellContributions<double>& wellContribs, BdaResult& res)
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::
+solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res)
 {
     // actually solve
     gpu_pbicgstab(wellContribs, res);
@@ -523,14 +538,14 @@ solve_system(WellContributions<double>& wellContribs, BdaResult& res)
     cudaCheckLastError("Something went wrong during the GPU solve");
 } // end solve_system()


 // copy result to host memory
 // caller must be sure that x is a valid array
-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::get_result(double *x) {
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::get_result(Scalar* x)
+{
     Timer t;

-    cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream);
+    cudaMemcpyAsync(x, d_x, N * sizeof(Scalar), cudaMemcpyDeviceToHost, stream);
     cudaStreamSynchronize(stream);

     if (verbosity > 2) {
@@ -540,12 +555,12 @@ void cusparseSolverBackend<block_size>::get_result(double *x) {
     }
 } // end get_result()

-template <unsigned int block_size>
-SolverStatus cusparseSolverBackend<block_size>::
-solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
-             double *b,
-             std::shared_ptr<BlockedMatrix<double>> jacMatrix,
-             WellContributions<double>& wellContribs,
+template<class Scalar, unsigned int block_size>
+SolverStatus cusparseSolverBackend<Scalar,block_size>::
+solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+             Scalar* b,
+             std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
+             WellContributions<Scalar>& wellContribs,
              BdaResult& res)
 {
     if (initialized == false) {
@@ -567,18 +582,14 @@ solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
     return SolverStatus::BDA_SOLVER_SUCCESS;
 }

+#define INSTANTIATE_TYPE(T)                    \
+    template class cusparseSolverBackend<T,1>; \
+    template class cusparseSolverBackend<T,2>; \
+    template class cusparseSolverBackend<T,3>; \
+    template class cusparseSolverBackend<T,4>; \
+    template class cusparseSolverBackend<T,5>; \
+    template class cusparseSolverBackend<T,6>;

-#define INSTANTIATE_BDA_FUNCTIONS(n) \
-template cusparseSolverBackend<n>::cusparseSolverBackend(int, int, double, unsigned int); \
+INSTANTIATE_TYPE(double)

-INSTANTIATE_BDA_FUNCTIONS(1);
-INSTANTIATE_BDA_FUNCTIONS(2);
-INSTANTIATE_BDA_FUNCTIONS(3);
-INSTANTIATE_BDA_FUNCTIONS(4);
-INSTANTIATE_BDA_FUNCTIONS(5);
-INSTANTIATE_BDA_FUNCTIONS(6);

-#undef INSTANTIATE_BDA_FUNCTIONS

-} // namespace Accelerator
-} // namespace Opm
+} // namespace Opm::Accelerator
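Since the template definitions live in this translation unit, the explicit instantiations are what make the class available to the linker; the new INSTANTIATE_TYPE macro stamps out all supported block sizes for one scalar type. What INSTANTIATE_TYPE(double) expands to, written out (illustrative, it only compiles next to the template definitions):

    // One explicit instantiation of the whole class template per supported block size.
    template class cusparseSolverBackend<double,1>;
    template class cusparseSolverBackend<double,2>;
    template class cusparseSolverBackend<double,3>;
    template class cusparseSolverBackend<double,4>;
    template class cusparseSolverBackend<double,5>;
    template class cusparseSolverBackend<double,6>;
    // Adding float would just be INSTANTIATE_TYPE(float), though the 'D'-suffixed
    // cuSPARSE/cuBLAS calls above would first need float dispatch.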
@@ -28,16 +28,13 @@
 #include <opm/simulators/linalg/bda/BdaSolver.hpp>
 #include <opm/simulators/linalg/bda/WellContributions.hpp>

-namespace Opm
-{
-namespace Accelerator
-{
+namespace Opm::Accelerator {

 /// This class implements a cusparse-based ilu0-bicgstab solver on GPU
-template <unsigned int block_size>
-class cusparseSolverBackend : public BdaSolver<double,block_size> {
-
-    using Base = BdaSolver<double,block_size>;
+template<class Scalar, unsigned int block_size>
+class cusparseSolverBackend : public BdaSolver<Scalar,block_size>
+{
+    using Base = BdaSolver<Scalar,block_size>;

     using Base::N;
     using Base::Nb;
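The using Base::N; lines that follow the alias are needed because the base class now depends on template parameters: members inherited from BdaSolver<Scalar,block_size> are dependent names and are not found by unqualified lookup inside the derived template. A stripped-down illustration with hypothetical class names:

    // Why the derived template needs 'using Base::...' (or 'this->'):
    template<class Scalar> struct SolverBase { int N = 0; Scalar tolerance{}; };

    template<class Scalar>
    struct Backend : SolverBase<Scalar>
    {
        using Base = SolverBase<Scalar>;
        using Base::N;              // without this, plain 'N' would not compile here
        Scalar scaled() const { return this->tolerance * N; } // 'this->' also works
    };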
@@ -57,13 +54,13 @@ private:
     bsrilu02Info_t info_M;
     bsrsv2Info_t info_L, info_U;
     // b: bsr matrix, m: preconditioner
-    double *d_bVals, *d_mVals;
+    Scalar *d_bVals, *d_mVals;
     int *d_bCols, *d_mCols;
     int *d_bRows, *d_mRows;
-    double *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
-    double *d_pw, *d_s, *d_t, *d_v;
+    Scalar *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
+    Scalar *d_pw, *d_s, *d_t, *d_v;
     void *d_buffer;
-    double *vals_contiguous; // only used if COPY_ROW_BY_ROW is true in cusparseSolverBackend.cpp
+    Scalar *vals_contiguous; // only used if COPY_ROW_BY_ROW is true in cusparseSolverBackend.cpp

     bool analysis_done = false;

@@ -76,13 +73,13 @@ private:
     /// Solve linear system using ilu0-bicgstab
     /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res summary of solver result
-    void gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res);
+    void gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res);

     /// Initialize GPU and allocate memory
     /// \param[in] matrix matrix for spmv
     /// \param[in] jacMatrix matrix for preconditioner
-    void initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
-                    std::shared_ptr<BlockedMatrix<double>> jacMatrix);
+    void initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                    std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);

     /// Clean memory
     void finalize();
@@ -92,18 +89,18 @@ private:
     /// \param[in] matrix matrix for spmv
     /// \param[in] b input vector, contains N values
     /// \param[in] jacMatrix matrix for preconditioner
-    void copy_system_to_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
-                            double *b,
-                            std::shared_ptr<BlockedMatrix<double>> jacMatrix);
+    void copy_system_to_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                            Scalar* b,
+                            std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);

     /// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
     /// also copy matrix for preconditioner if needed
     /// \param[in] matrix matrix for spmv
     /// \param[in] b input vector, contains N values
     /// \param[in] jacMatrix matrix for preconditioner
-    void update_system_on_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
-                              double *b,
-                              std::shared_ptr<BlockedMatrix<double>> jacMatrix);
+    void update_system_on_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                              Scalar* b,
+                              std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);

     /// Analyse sparsity pattern to extract parallelism
     /// \return true iff analysis was successful
@@ -116,17 +113,16 @@ private:
     /// Solve linear system
     /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res summary of solver result
-    void solve_system(WellContributions<double>& wellContribs, BdaResult &res);
+    void solve_system(WellContributions<Scalar>& wellContribs, BdaResult &res);

 public:


     /// Construct a cusparseSolver
     /// \param[in] linear_solver_verbosity verbosity of cusparseSolver
     /// \param[in] maxit maximum number of iterations for cusparseSolver
     /// \param[in] tolerance required relative tolerance for cusparseSolver
     /// \param[in] deviceID the device to be used
-    cusparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int deviceID);
+    cusparseSolverBackend(int linear_solver_verbosity, int maxit,
+                          Scalar tolerance, unsigned int deviceID);

     /// Destroy a cusparseSolver, and free memory
     ~cusparseSolverBackend();
@@ -138,20 +134,19 @@ public:
     /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res summary of solver result
     /// \return status code
-    SolverStatus solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
-                              double *b,
-                              std::shared_ptr<BlockedMatrix<double>> jacMatrix,
-                              WellContributions<double>& wellContribs,
+    SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                              Scalar* b,
+                              std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
+                              WellContributions<Scalar>& wellContribs,
                               BdaResult& res) override;

     /// Get resulting vector x after linear solve, also includes post processing if necessary
     /// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
-    void get_result(double *x) override;
+    void get_result(Scalar* x) override;

 }; // end class cusparseSolverBackend

-} // namespace Accelerator
-} // namespace Opm
+} // namespace Opm::Accelerator

 #endif
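For orientation, a hypothetical caller-side sketch of the templated interface after this commit; the numbers are invented, and in OPM the construction actually happens inside BdaBridge (see the first hunk above):

    #include <memory>

    void example_solve()
    {
        using CU = Opm::Accelerator::cusparseSolverBackend<double, 3>; // block_size = 3
        auto backend = std::make_unique<CU>(/*verbosity=*/1, /*maxit=*/200,
                                            /*tolerance=*/1e-2, /*deviceID=*/0);
        // per Newton iteration:
        //   backend->solve_system(matrix, b, jacMatrix, wellContribs, res);
        //   backend->get_result(x);
    }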