mirror of
https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
cusparseSolverBackend: template Scalar type
This commit is contained in:
parent
18f42b51b2
commit
23250b87e3
@ -73,7 +73,8 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
|
|||||||
if (accelerator_mode.compare("cusparse") == 0) {
|
if (accelerator_mode.compare("cusparse") == 0) {
|
||||||
#if HAVE_CUDA
|
#if HAVE_CUDA
|
||||||
use_gpu = true;
|
use_gpu = true;
|
||||||
backend.reset(new Opm::Accelerator::cusparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, deviceID));
|
using CU = Accelerator::cusparseSolverBackend<double,block_size>;
|
||||||
|
backend = std::make_unique<CU>(linear_solver_verbosity, maxit, tolerance, deviceID);
|
||||||
#else
|
#else
|
||||||
OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake");
|
OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake");
|
||||||
#endif
|
#endif
|
||||||
|
@ -44,23 +44,18 @@
|
|||||||
extern std::shared_ptr<std::thread> copyThread;
|
extern std::shared_ptr<std::thread> copyThread;
|
||||||
#endif // HAVE_OPENMP
|
#endif // HAVE_OPENMP
|
||||||
|
|
||||||
namespace Opm
|
namespace Opm::Accelerator {
|
||||||
{
|
|
||||||
namespace Accelerator
|
|
||||||
{
|
|
||||||
|
|
||||||
using Opm::OpmLog;
|
|
||||||
using Dune::Timer;
|
using Dune::Timer;
|
||||||
|
|
||||||
const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
|
const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
|
||||||
const cusparseOperation_t operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
|
const cusparseOperation_t operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
|
||||||
const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;
|
const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;
|
||||||
|
|
||||||
|
template<class Scalar, unsigned int block_size>
|
||||||
template <unsigned int block_size>
|
cusparseSolverBackend<Scalar, block_size>::
|
||||||
cusparseSolverBackend<block_size>::
|
|
||||||
cusparseSolverBackend(int verbosity_, int maxit_,
|
cusparseSolverBackend(int verbosity_, int maxit_,
|
||||||
double tolerance_, unsigned int deviceID_)
|
Scalar tolerance_, unsigned int deviceID_)
|
||||||
: Base(verbosity_, maxit_, tolerance_, deviceID_)
|
: Base(verbosity_, maxit_, tolerance_, deviceID_)
|
||||||
{
|
{
|
||||||
// initialize CUDA device, stream and libraries
|
// initialize CUDA device, stream and libraries
|
||||||
@ -70,7 +65,8 @@ cusparseSolverBackend(int verbosity_, int maxit_,
|
|||||||
cudaGetDeviceProperties(&props, deviceID);
|
cudaGetDeviceProperties(&props, deviceID);
|
||||||
cudaCheckLastError("Could not get device properties");
|
cudaCheckLastError("Could not get device properties");
|
||||||
std::ostringstream out;
|
std::ostringstream out;
|
||||||
out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor;
|
out << "Name GPU: " << props.name << ", Compute Capability: "
|
||||||
|
<< props.major << "." << props.minor;
|
||||||
OpmLog::info(out.str());
|
OpmLog::info(out.str());
|
||||||
|
|
||||||
cudaStreamCreate(&stream);
|
cudaStreamCreate(&stream);
|
||||||
@ -87,28 +83,29 @@ cusparseSolverBackend(int verbosity_, int maxit_,
|
|||||||
cudaCheckLastError("Could not set stream to cusparse");
|
cudaCheckLastError("Could not set stream to cusparse");
|
||||||
}
|
}
|
||||||
|
|
||||||
template <unsigned int block_size>
|
template<class Scalar, unsigned int block_size>
|
||||||
cusparseSolverBackend<block_size>::~cusparseSolverBackend() {
|
cusparseSolverBackend<Scalar,block_size>::~cusparseSolverBackend()
|
||||||
|
{
|
||||||
finalize();
|
finalize();
|
||||||
}
|
}
|
||||||
|
|
||||||
template <unsigned int block_size>
|
template<class Scalar, unsigned int block_size>
|
||||||
void cusparseSolverBackend<block_size>::
|
void cusparseSolverBackend<Scalar,block_size>::
|
||||||
gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
|
gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res)
|
||||||
{
|
{
|
||||||
Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
|
Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
|
||||||
int n = N;
|
int n = N;
|
||||||
double rho = 1.0, rhop;
|
Scalar rho = 1.0, rhop;
|
||||||
double alpha, nalpha, beta;
|
Scalar alpha, nalpha, beta;
|
||||||
double omega, nomega, tmp1, tmp2;
|
Scalar omega, nomega, tmp1, tmp2;
|
||||||
double norm, norm_0;
|
Scalar norm, norm_0;
|
||||||
double zero = 0.0;
|
Scalar zero = 0.0;
|
||||||
double one = 1.0;
|
Scalar one = 1.0;
|
||||||
double mone = -1.0;
|
Scalar mone = -1.0;
|
||||||
float it;
|
float it;
|
||||||
|
|
||||||
if (wellContribs.getNumWells() > 0) {
|
if (wellContribs.getNumWells() > 0) {
|
||||||
static_cast<WellContributionsCuda<double>&>(wellContribs).setCudaStream(stream);
|
static_cast<WellContributionsCuda<Scalar>&>(wellContribs).setCudaStream(stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
|
cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
|
||||||
@ -152,7 +149,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
|
|||||||
|
|
||||||
// apply wellContributions
|
// apply wellContributions
|
||||||
if (wellContribs.getNumWells() > 0) {
|
if (wellContribs.getNumWells() > 0) {
|
||||||
static_cast<WellContributionsCuda<double>&>(wellContribs).apply(d_pw, d_v);
|
static_cast<WellContributionsCuda<Scalar>&>(wellContribs).apply(d_pw, d_v);
|
||||||
}
|
}
|
||||||
|
|
||||||
cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
|
cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
|
||||||
@ -183,7 +180,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
|
|||||||
|
|
||||||
// apply wellContributions
|
// apply wellContributions
|
||||||
if (wellContribs.getNumWells() > 0) {
|
if (wellContribs.getNumWells() > 0) {
|
||||||
static_cast<WellContributionsCuda<double>&>(wellContribs).apply(d_s, d_t);
|
static_cast<WellContributionsCuda<Scalar>&>(wellContribs).apply(d_s, d_t);
|
||||||
}
|
}
|
||||||
|
|
||||||
cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
|
cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
|
||||||
@ -195,7 +192,6 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
|
|||||||
|
|
||||||
cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
|
cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
|
||||||
|
|
||||||
|
|
||||||
if (norm < tolerance * norm_0) {
|
if (norm < tolerance * norm_0) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -215,16 +211,17 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
|
|||||||
|
|
||||||
if (verbosity > 0) {
|
if (verbosity > 0) {
|
||||||
std::ostringstream out;
|
std::ostringstream out;
|
||||||
out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
|
out << "=== converged: " << res.converged << ", conv_rate: "
|
||||||
", time per iteration: " << res.elapsed / it << ", iterations: " << it;
|
<< res.conv_rate << ", time: " << res.elapsed
|
||||||
|
<< ", time per iteration: " << res.elapsed / it << ", iterations: " << it;
|
||||||
OpmLog::info(out.str());
|
OpmLog::info(out.str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <unsigned int block_size>
|
template<class Scalar, unsigned int block_size>
|
||||||
void cusparseSolverBackend<block_size>::
|
void cusparseSolverBackend<Scalar,block_size>::
|
||||||
initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
|
initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||||
std::shared_ptr<BlockedMatrix<double>> jacMatrix)
|
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
|
||||||
{
|
{
|
||||||
this->Nb = matrix->Nb;
|
this->Nb = matrix->Nb;
|
||||||
this->N = Nb * block_size;
|
this->N = Nb * block_size;
|
||||||
@ -239,46 +236,49 @@ initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::ostringstream out;
|
std::ostringstream out;
|
||||||
out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnz: " << nnzb << " blocks\n";
|
out << "Initializing GPU, matrix size: " << Nb
|
||||||
|
<< " blockrows, nnz: " << nnzb << " blocks\n";
|
||||||
if (useJacMatrix) {
|
if (useJacMatrix) {
|
||||||
out << "Blocks in ILU matrix: " << nnzbs_prec << "\n";
|
out << "Blocks in ILU matrix: " << nnzbs_prec << "\n";
|
||||||
}
|
}
|
||||||
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
|
out << "Maxit: " << maxit << std::scientific
|
||||||
|
<< ", tolerance: " << tolerance << "\n";
|
||||||
OpmLog::info(out.str());
|
OpmLog::info(out.str());
|
||||||
|
|
||||||
cudaMalloc((void**)&d_x, sizeof(double) * N);
|
cudaMalloc((void**)&d_x, sizeof(Scalar) * N);
|
||||||
cudaMalloc((void**)&d_b, sizeof(double) * N);
|
cudaMalloc((void**)&d_b, sizeof(Scalar) * N);
|
||||||
cudaMalloc((void**)&d_r, sizeof(double) * N);
|
cudaMalloc((void**)&d_r, sizeof(Scalar) * N);
|
||||||
cudaMalloc((void**)&d_rw, sizeof(double) * N);
|
cudaMalloc((void**)&d_rw, sizeof(Scalar) * N);
|
||||||
cudaMalloc((void**)&d_p, sizeof(double) * N);
|
cudaMalloc((void**)&d_p, sizeof(Scalar) * N);
|
||||||
cudaMalloc((void**)&d_pw, sizeof(double) * N);
|
cudaMalloc((void**)&d_pw, sizeof(Scalar) * N);
|
||||||
cudaMalloc((void**)&d_s, sizeof(double) * N);
|
cudaMalloc((void**)&d_s, sizeof(Scalar) * N);
|
||||||
cudaMalloc((void**)&d_t, sizeof(double) * N);
|
cudaMalloc((void**)&d_t, sizeof(Scalar) * N);
|
||||||
cudaMalloc((void**)&d_v, sizeof(double) * N);
|
cudaMalloc((void**)&d_v, sizeof(Scalar) * N);
|
||||||
cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
|
cudaMalloc((void**)&d_bVals, sizeof(Scalar) * nnz);
|
||||||
cudaMalloc((void**)&d_bCols, sizeof(int) * nnzb);
|
cudaMalloc((void**)&d_bCols, sizeof(int) * nnzb);
|
||||||
cudaMalloc((void**)&d_bRows, sizeof(int) * (Nb + 1));
|
cudaMalloc((void**)&d_bRows, sizeof(int) * (Nb + 1));
|
||||||
if (useJacMatrix) {
|
if (useJacMatrix) {
|
||||||
cudaMalloc((void**)&d_mVals, sizeof(double) * nnzbs_prec * block_size * block_size);
|
cudaMalloc((void**)&d_mVals, sizeof(Scalar) * nnzbs_prec * block_size * block_size);
|
||||||
cudaMalloc((void**)&d_mCols, sizeof(int) * nnzbs_prec);
|
cudaMalloc((void**)&d_mCols, sizeof(int) * nnzbs_prec);
|
||||||
cudaMalloc((void**)&d_mRows, sizeof(int) * (Nb + 1));
|
cudaMalloc((void**)&d_mRows, sizeof(int) * (Nb + 1));
|
||||||
} else {
|
} else {
|
||||||
cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
|
cudaMalloc((void**)&d_mVals, sizeof(Scalar) * nnz);
|
||||||
d_mCols = d_bCols;
|
d_mCols = d_bCols;
|
||||||
d_mRows = d_bRows;
|
d_mRows = d_bRows;
|
||||||
}
|
}
|
||||||
cudaCheckLastError("Could not allocate enough memory on GPU");
|
cudaCheckLastError("Could not allocate enough memory on GPU");
|
||||||
|
|
||||||
#if COPY_ROW_BY_ROW
|
#if COPY_ROW_BY_ROW
|
||||||
cudaMallocHost((void**)&vals_contiguous, sizeof(double) * nnz);
|
cudaMallocHost((void**)&vals_contiguous, sizeof(Scalar) * nnz);
|
||||||
cudaCheckLastError("Could not allocate pinned memory");
|
cudaCheckLastError("Could not allocate pinned memory");
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
initialized = true;
|
initialized = true;
|
||||||
} // end initialize()
|
} // end initialize()
|
||||||
|
|
||||||
template <unsigned int block_size>
|
template<class Scalar, unsigned int block_size>
|
||||||
void cusparseSolverBackend<block_size>::finalize() {
|
void cusparseSolverBackend<Scalar,block_size>::finalize()
|
||||||
|
{
|
||||||
if (initialized) {
|
if (initialized) {
|
||||||
cudaFree(d_x);
|
cudaFree(d_x);
|
||||||
cudaFree(d_b);
|
cudaFree(d_b);
|
||||||
@ -314,44 +314,54 @@ void cusparseSolverBackend<block_size>::finalize() {
|
|||||||
}
|
}
|
||||||
} // end finalize()
|
} // end finalize()
|
||||||
|
|
||||||
|
template<class Scalar, unsigned int block_size>
|
||||||
template <unsigned int block_size>
|
void cusparseSolverBackend<Scalar,block_size>::
|
||||||
void cusparseSolverBackend<block_size>::
|
copy_system_to_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||||
copy_system_to_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
|
Scalar* b,
|
||||||
double *b,
|
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
|
||||||
std::shared_ptr<BlockedMatrix<double>> jacMatrix)
|
|
||||||
{
|
{
|
||||||
Timer t;
|
Timer t;
|
||||||
|
|
||||||
cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream);
|
cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int),
|
||||||
cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
|
cudaMemcpyHostToDevice, stream);
|
||||||
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
|
cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int),
|
||||||
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
|
cudaMemcpyHostToDevice, stream);
|
||||||
|
cudaMemcpyAsync(d_b, b, N * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
|
||||||
|
cudaMemsetAsync(d_x, 0, N * sizeof(Scalar), stream);
|
||||||
|
|
||||||
#if COPY_ROW_BY_ROW
|
#if COPY_ROW_BY_ROW
|
||||||
int sum = 0;
|
int sum = 0;
|
||||||
for (int i = 0; i < Nb; ++i) {
|
for (int i = 0; i < Nb; ++i) {
|
||||||
int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
|
int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
|
||||||
memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
|
memcpy(vals_contiguous + sum, matrix->nnzValues + sum,
|
||||||
|
size_row * sizeof(Scalar) * block_size * block_size);
|
||||||
sum += size_row * block_size * block_size;
|
sum += size_row * block_size * block_size;
|
||||||
}
|
}
|
||||||
cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
|
cudaMemcpyAsync(d_bVals, vals_contiguous,
|
||||||
|
nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
|
||||||
#else
|
#else
|
||||||
cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
|
cudaMemcpyAsync(d_bVals, matrix->nnzValues,
|
||||||
|
nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
|
||||||
if (useJacMatrix) {
|
if (useJacMatrix) {
|
||||||
#if HAVE_OPENMP
|
#if HAVE_OPENMP
|
||||||
if(omp_get_max_threads() > 1)
|
if(omp_get_max_threads() > 1)
|
||||||
copyThread->join();
|
copyThread->join();
|
||||||
#endif
|
#endif
|
||||||
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
|
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
|
||||||
|
nnzbs_prec * block_size * block_size * sizeof(Scalar),
|
||||||
|
cudaMemcpyHostToDevice, stream);
|
||||||
} else {
|
} else {
|
||||||
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
|
cudaMemcpyAsync(d_mVals, d_bVals,
|
||||||
|
nnz * sizeof(Scalar),
|
||||||
|
cudaMemcpyDeviceToDevice, stream);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (useJacMatrix) {
|
if (useJacMatrix) {
|
||||||
cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int), cudaMemcpyHostToDevice, stream);
|
cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int),
|
||||||
cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
|
cudaMemcpyHostToDevice, stream);
|
||||||
|
cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int),
|
||||||
|
cudaMemcpyHostToDevice, stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (verbosity >= 3) {
|
if (verbosity >= 3) {
|
||||||
@ -364,37 +374,43 @@ copy_system_to_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
|
|||||||
}
|
}
|
||||||
} // end copy_system_to_gpu()
|
} // end copy_system_to_gpu()
|
||||||
|
|
||||||
|
|
||||||
// don't copy rowpointers and colindices, they stay the same
|
// don't copy rowpointers and colindices, they stay the same
|
||||||
template <unsigned int block_size>
|
template<class Scalar, unsigned int block_size>
|
||||||
void cusparseSolverBackend<block_size>::
|
void cusparseSolverBackend<Scalar,block_size>::
|
||||||
update_system_on_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
|
update_system_on_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||||
double *b,
|
Scalar* b,
|
||||||
std::shared_ptr<BlockedMatrix<double>> jacMatrix)
|
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
|
||||||
{
|
{
|
||||||
Timer t;
|
Timer t;
|
||||||
|
|
||||||
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
|
cudaMemcpyAsync(d_b, b, N * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
|
||||||
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
|
cudaMemsetAsync(d_x, 0, sizeof(Scalar) * N, stream);
|
||||||
|
|
||||||
#if COPY_ROW_BY_ROW
|
#if COPY_ROW_BY_ROW
|
||||||
int sum = 0;
|
int sum = 0;
|
||||||
for (int i = 0; i < Nb; ++i) {
|
for (int i = 0; i < Nb; ++i) {
|
||||||
int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
|
int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
|
||||||
memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
|
memcpy(vals_contiguous + sum, matrix->nnzValues + sum,
|
||||||
|
size_row * sizeof(Scalar) * block_size * block_size);
|
||||||
sum += size_row * block_size * block_size;
|
sum += size_row * block_size * block_size;
|
||||||
}
|
}
|
||||||
cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
|
cudaMemcpyAsync(d_bVals, vals_contiguous,
|
||||||
|
nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
|
||||||
#else
|
#else
|
||||||
cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
|
cudaMemcpyAsync(d_bVals, matrix->nnzValues,
|
||||||
|
nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
|
||||||
if (useJacMatrix) {
|
if (useJacMatrix) {
|
||||||
#if HAVE_OPENMP
|
#if HAVE_OPENMP
|
||||||
if(omp_get_max_threads() > 1)
|
if (omp_get_max_threads() > 1) {
|
||||||
copyThread->join();
|
copyThread->join();
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
|
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
|
||||||
|
nnzbs_prec * block_size * block_size * sizeof(Scalar),
|
||||||
|
cudaMemcpyHostToDevice, stream);
|
||||||
} else {
|
} else {
|
||||||
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
|
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(Scalar),
|
||||||
|
cudaMemcpyDeviceToDevice, stream);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -409,10 +425,9 @@ update_system_on_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
|
|||||||
}
|
}
|
||||||
} // end update_system_on_gpu()
|
} // end update_system_on_gpu()
|
||||||
|
|
||||||
|
template<class Scalar, unsigned int block_size>
|
||||||
template <unsigned int block_size>
|
bool cusparseSolverBackend<Scalar,block_size>::analyse_matrix()
|
||||||
bool cusparseSolverBackend<block_size>::analyse_matrix() {
|
{
|
||||||
|
|
||||||
int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
|
int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
|
||||||
Timer t;
|
Timer t;
|
||||||
|
|
||||||
@ -487,8 +502,9 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
|
|||||||
return true;
|
return true;
|
||||||
} // end analyse_matrix()
|
} // end analyse_matrix()
|
||||||
|
|
||||||
template <unsigned int block_size>
|
template<class Scalar, unsigned int block_size>
|
||||||
bool cusparseSolverBackend<block_size>::create_preconditioner() {
|
bool cusparseSolverBackend<Scalar,block_size>::create_preconditioner()
|
||||||
|
{
|
||||||
Timer t;
|
Timer t;
|
||||||
|
|
||||||
cusparseDbsrilu02(cusparseHandle, order, \
|
cusparseDbsrilu02(cusparseHandle, order, \
|
||||||
@ -512,10 +528,9 @@ bool cusparseSolverBackend<block_size>::create_preconditioner() {
|
|||||||
return true;
|
return true;
|
||||||
} // end create_preconditioner()
|
} // end create_preconditioner()
|
||||||
|
|
||||||
|
template<class Scalar, unsigned int block_size>
|
||||||
template <unsigned int block_size>
|
void cusparseSolverBackend<Scalar,block_size>::
|
||||||
void cusparseSolverBackend<block_size>::
|
solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res)
|
||||||
solve_system(WellContributions<double>& wellContribs, BdaResult& res)
|
|
||||||
{
|
{
|
||||||
// actually solve
|
// actually solve
|
||||||
gpu_pbicgstab(wellContribs, res);
|
gpu_pbicgstab(wellContribs, res);
|
||||||
@ -523,14 +538,14 @@ solve_system(WellContributions<double>& wellContribs, BdaResult& res)
|
|||||||
cudaCheckLastError("Something went wrong during the GPU solve");
|
cudaCheckLastError("Something went wrong during the GPU solve");
|
||||||
} // end solve_system()
|
} // end solve_system()
|
||||||
|
|
||||||
|
|
||||||
// copy result to host memory
|
// copy result to host memory
|
||||||
// caller must be sure that x is a valid array
|
// caller must be sure that x is a valid array
|
||||||
template <unsigned int block_size>
|
template<class Scalar, unsigned int block_size>
|
||||||
void cusparseSolverBackend<block_size>::get_result(double *x) {
|
void cusparseSolverBackend<Scalar,block_size>::get_result(Scalar* x)
|
||||||
|
{
|
||||||
Timer t;
|
Timer t;
|
||||||
|
|
||||||
cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream);
|
cudaMemcpyAsync(x, d_x, N * sizeof(Scalar), cudaMemcpyDeviceToHost, stream);
|
||||||
cudaStreamSynchronize(stream);
|
cudaStreamSynchronize(stream);
|
||||||
|
|
||||||
if (verbosity > 2) {
|
if (verbosity > 2) {
|
||||||
@ -540,12 +555,12 @@ void cusparseSolverBackend<block_size>::get_result(double *x) {
|
|||||||
}
|
}
|
||||||
} // end get_result()
|
} // end get_result()
|
||||||
|
|
||||||
template <unsigned int block_size>
|
template<class Scalar, unsigned int block_size>
|
||||||
SolverStatus cusparseSolverBackend<block_size>::
|
SolverStatus cusparseSolverBackend<Scalar,block_size>::
|
||||||
solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
|
solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||||
double *b,
|
Scalar* b,
|
||||||
std::shared_ptr<BlockedMatrix<double>> jacMatrix,
|
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
|
||||||
WellContributions<double>& wellContribs,
|
WellContributions<Scalar>& wellContribs,
|
||||||
BdaResult& res)
|
BdaResult& res)
|
||||||
{
|
{
|
||||||
if (initialized == false) {
|
if (initialized == false) {
|
||||||
@ -567,18 +582,14 @@ solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
|
|||||||
return SolverStatus::BDA_SOLVER_SUCCESS;
|
return SolverStatus::BDA_SOLVER_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define INSTANTIATE_TYPE(T) \
|
||||||
|
template class cusparseSolverBackend<T,1>; \
|
||||||
|
template class cusparseSolverBackend<T,2>; \
|
||||||
|
template class cusparseSolverBackend<T,3>; \
|
||||||
|
template class cusparseSolverBackend<T,4>; \
|
||||||
|
template class cusparseSolverBackend<T,5>; \
|
||||||
|
template class cusparseSolverBackend<T,6>;
|
||||||
|
|
||||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
INSTANTIATE_TYPE(double)
|
||||||
template cusparseSolverBackend<n>::cusparseSolverBackend(int, int, double, unsigned int); \
|
|
||||||
|
|
||||||
INSTANTIATE_BDA_FUNCTIONS(1);
|
} // namespace Opm::Accelerator
|
||||||
INSTANTIATE_BDA_FUNCTIONS(2);
|
|
||||||
INSTANTIATE_BDA_FUNCTIONS(3);
|
|
||||||
INSTANTIATE_BDA_FUNCTIONS(4);
|
|
||||||
INSTANTIATE_BDA_FUNCTIONS(5);
|
|
||||||
INSTANTIATE_BDA_FUNCTIONS(6);
|
|
||||||
|
|
||||||
#undef INSTANTIATE_BDA_FUNCTIONS
|
|
||||||
|
|
||||||
} // namespace Accelerator
|
|
||||||
} // namespace Opm
|
|
||||||
|
@ -28,16 +28,13 @@
|
|||||||
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
|
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
|
||||||
#include <opm/simulators/linalg/bda/WellContributions.hpp>
|
#include <opm/simulators/linalg/bda/WellContributions.hpp>
|
||||||
|
|
||||||
namespace Opm
|
namespace Opm::Accelerator {
|
||||||
{
|
|
||||||
namespace Accelerator
|
|
||||||
{
|
|
||||||
|
|
||||||
/// This class implements a cusparse-based ilu0-bicgstab solver on GPU
|
/// This class implements a cusparse-based ilu0-bicgstab solver on GPU
|
||||||
template <unsigned int block_size>
|
template<class Scalar, unsigned int block_size>
|
||||||
class cusparseSolverBackend : public BdaSolver<double,block_size> {
|
class cusparseSolverBackend : public BdaSolver<Scalar,block_size>
|
||||||
|
{
|
||||||
using Base = BdaSolver<double,block_size>;
|
using Base = BdaSolver<Scalar,block_size>;
|
||||||
|
|
||||||
using Base::N;
|
using Base::N;
|
||||||
using Base::Nb;
|
using Base::Nb;
|
||||||
@ -57,13 +54,13 @@ private:
|
|||||||
bsrilu02Info_t info_M;
|
bsrilu02Info_t info_M;
|
||||||
bsrsv2Info_t info_L, info_U;
|
bsrsv2Info_t info_L, info_U;
|
||||||
// b: bsr matrix, m: preconditioner
|
// b: bsr matrix, m: preconditioner
|
||||||
double *d_bVals, *d_mVals;
|
Scalar *d_bVals, *d_mVals;
|
||||||
int *d_bCols, *d_mCols;
|
int *d_bCols, *d_mCols;
|
||||||
int *d_bRows, *d_mRows;
|
int *d_bRows, *d_mRows;
|
||||||
double *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
|
Scalar *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
|
||||||
double *d_pw, *d_s, *d_t, *d_v;
|
Scalar *d_pw, *d_s, *d_t, *d_v;
|
||||||
void *d_buffer;
|
void *d_buffer;
|
||||||
double *vals_contiguous; // only used if COPY_ROW_BY_ROW is true in cusparseSolverBackend.cpp
|
Scalar *vals_contiguous; // only used if COPY_ROW_BY_ROW is true in cusparseSolverBackend.cpp
|
||||||
|
|
||||||
bool analysis_done = false;
|
bool analysis_done = false;
|
||||||
|
|
||||||
@ -76,13 +73,13 @@ private:
|
|||||||
/// Solve linear system using ilu0-bicgstab
|
/// Solve linear system using ilu0-bicgstab
|
||||||
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
|
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
|
||||||
/// \param[inout] res summary of solver result
|
/// \param[inout] res summary of solver result
|
||||||
void gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res);
|
void gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res);
|
||||||
|
|
||||||
/// Initialize GPU and allocate memory
|
/// Initialize GPU and allocate memory
|
||||||
/// \param[in] matrix matrix for spmv
|
/// \param[in] matrix matrix for spmv
|
||||||
/// \param[in] jacMatrix matrix for preconditioner
|
/// \param[in] jacMatrix matrix for preconditioner
|
||||||
void initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
|
void initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||||
std::shared_ptr<BlockedMatrix<double>> jacMatrix);
|
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);
|
||||||
|
|
||||||
/// Clean memory
|
/// Clean memory
|
||||||
void finalize();
|
void finalize();
|
||||||
@ -92,18 +89,18 @@ private:
|
|||||||
/// \param[in] matrix matrix for spmv
|
/// \param[in] matrix matrix for spmv
|
||||||
/// \param[in] b input vector, contains N values
|
/// \param[in] b input vector, contains N values
|
||||||
/// \param[in] jacMatrix matrix for preconditioner
|
/// \param[in] jacMatrix matrix for preconditioner
|
||||||
void copy_system_to_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
|
void copy_system_to_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||||
double *b,
|
Scalar* b,
|
||||||
std::shared_ptr<BlockedMatrix<double>> jacMatrix);
|
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);
|
||||||
|
|
||||||
/// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
|
/// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
|
||||||
/// also copy matrix for preconditioner if needed
|
/// also copy matrix for preconditioner if needed
|
||||||
/// \param[in] matrix matrix for spmv
|
/// \param[in] matrix matrix for spmv
|
||||||
/// \param[in] b input vector, contains N values
|
/// \param[in] b input vector, contains N values
|
||||||
/// \param[in] jacMatrix matrix for preconditioner
|
/// \param[in] jacMatrix matrix for preconditioner
|
||||||
void update_system_on_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
|
void update_system_on_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||||
double *b,
|
Scalar* b,
|
||||||
std::shared_ptr<BlockedMatrix<double>> jacMatrix);
|
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);
|
||||||
|
|
||||||
/// Analyse sparsity pattern to extract parallelism
|
/// Analyse sparsity pattern to extract parallelism
|
||||||
/// \return true iff analysis was successful
|
/// \return true iff analysis was successful
|
||||||
@ -116,17 +113,16 @@ private:
|
|||||||
/// Solve linear system
|
/// Solve linear system
|
||||||
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
|
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
|
||||||
/// \param[inout] res summary of solver result
|
/// \param[inout] res summary of solver result
|
||||||
void solve_system(WellContributions<double>& wellContribs, BdaResult &res);
|
void solve_system(WellContributions<Scalar>& wellContribs, BdaResult &res);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
|
||||||
/// Construct a cusparseSolver
|
/// Construct a cusparseSolver
|
||||||
/// \param[in] linear_solver_verbosity verbosity of cusparseSolver
|
/// \param[in] linear_solver_verbosity verbosity of cusparseSolver
|
||||||
/// \param[in] maxit maximum number of iterations for cusparseSolver
|
/// \param[in] maxit maximum number of iterations for cusparseSolver
|
||||||
/// \param[in] tolerance required relative tolerance for cusparseSolver
|
/// \param[in] tolerance required relative tolerance for cusparseSolver
|
||||||
/// \param[in] deviceID the device to be used
|
/// \param[in] deviceID the device to be used
|
||||||
cusparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int deviceID);
|
cusparseSolverBackend(int linear_solver_verbosity, int maxit,
|
||||||
|
Scalar tolerance, unsigned int deviceID);
|
||||||
|
|
||||||
/// Destroy a cusparseSolver, and free memory
|
/// Destroy a cusparseSolver, and free memory
|
||||||
~cusparseSolverBackend();
|
~cusparseSolverBackend();
|
||||||
@ -138,20 +134,19 @@ public:
|
|||||||
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
|
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
|
||||||
/// \param[inout] res summary of solver result
|
/// \param[inout] res summary of solver result
|
||||||
/// \return status code
|
/// \return status code
|
||||||
SolverStatus solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
|
SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||||
double *b,
|
Scalar* b,
|
||||||
std::shared_ptr<BlockedMatrix<double>> jacMatrix,
|
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
|
||||||
WellContributions<double>& wellContribs,
|
WellContributions<Scalar>& wellContribs,
|
||||||
BdaResult& res) override;
|
BdaResult& res) override;
|
||||||
|
|
||||||
/// Get resulting vector x after linear solve, also includes post processing if necessary
|
/// Get resulting vector x after linear solve, also includes post processing if necessary
|
||||||
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
|
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
|
||||||
void get_result(double *x) override;
|
void get_result(Scalar* x) override;
|
||||||
|
|
||||||
}; // end class cusparseSolverBackend
|
}; // end class cusparseSolverBackend
|
||||||
|
|
||||||
} // namespace Accelerator
|
} // namespace Opm::Accelerator
|
||||||
} // namespace Opm
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user