cusparseSolverBackend: add float Scalar support

This commit is contained in:
Arne Morten Kvarving 2024-04-16 10:29:33 +02:00
parent 3dbeed2199
commit 35fb78ea9a

View File

@ -39,6 +39,8 @@
#define COPY_ROW_BY_ROW 0 #define COPY_ROW_BY_ROW 0
#include <thread> #include <thread>
#include <type_traits>
extern std::shared_ptr<std::thread> copyThread; extern std::shared_ptr<std::thread> copyThread;
#if HAVE_OPENMP #if HAVE_OPENMP
@ -109,13 +111,27 @@ gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res)
static_cast<WellContributionsCuda<Scalar>&>(wellContribs).setCudaStream(stream); static_cast<WellContributionsCuda<Scalar>&>(wellContribs).setCudaStream(stream);
} }
cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r); if constexpr (std::is_same_v<Scalar,float>) {
cusparseSbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one,
descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
} else {
cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one,
descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
}
cublasDscal(cublasHandle, n, &mone, d_r, 1); if constexpr (std::is_same_v<Scalar,float>) {
cublasDaxpy(cublasHandle, n, &one, d_b, 1, d_r, 1); cublasSscal(cublasHandle, n, &mone, d_r, 1);
cublasDcopy(cublasHandle, n, d_r, 1, d_rw, 1); cublasSaxpy(cublasHandle, n, &one, d_b, 1, d_r, 1);
cublasDcopy(cublasHandle, n, d_r, 1, d_p, 1); cublasScopy(cublasHandle, n, d_r, 1, d_rw, 1);
cublasDnrm2(cublasHandle, n, d_r, 1, &norm_0); cublasScopy(cublasHandle, n, d_r, 1, d_p, 1);
cublasSnrm2(cublasHandle, n, d_r, 1, &norm_0);
} else {
cublasDscal(cublasHandle, n, &mone, d_r, 1);
cublasDaxpy(cublasHandle, n, &one, d_b, 1, d_r, 1);
cublasDcopy(cublasHandle, n, d_r, 1, d_rw, 1);
cublasDcopy(cublasHandle, n, d_r, 1, d_p, 1);
cublasDnrm2(cublasHandle, n, d_r, 1, &norm_0);
}
if (verbosity > 1) { if (verbosity > 1) {
std::ostringstream out; std::ostringstream out;
@ -125,40 +141,80 @@ gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res)
for (it = 0.5; it < maxit; it += 0.5) { for (it = 0.5; it < maxit; it += 0.5) {
rhop = rho; rhop = rho;
cublasDdot(cublasHandle, n, d_rw, 1, d_r, 1, &rho); if constexpr (std::is_same_v<Scalar,float>) {
cublasSdot(cublasHandle, n, d_rw, 1, d_r, 1, &rho);
} else {
cublasDdot(cublasHandle, n, d_rw, 1, d_r, 1, &rho);
}
if (it > 1) { if (it > 1) {
beta = (rho / rhop) * (alpha / omega); beta = (rho / rhop) * (alpha / omega);
nomega = -omega; nomega = -omega;
cublasDaxpy(cublasHandle, n, &nomega, d_v, 1, d_p, 1); if constexpr (std::is_same_v<Scalar,float>) {
cublasDscal(cublasHandle, n, &beta, d_p, 1); cublasSaxpy(cublasHandle, n, &nomega, d_v, 1, d_p, 1);
cublasDaxpy(cublasHandle, n, &one, d_r, 1, d_p, 1); cublasSscal(cublasHandle, n, &beta, d_p, 1);
cublasSaxpy(cublasHandle, n, &one, d_r, 1, d_p, 1);
} else {
cublasDaxpy(cublasHandle, n, &nomega, d_v, 1, d_p, 1);
cublasDscal(cublasHandle, n, &beta, d_p, 1);
cublasDaxpy(cublasHandle, n, &one, d_r, 1, d_p, 1);
}
} }
// apply ilu0 if constexpr (std::is_same_v<Scalar,float>) {
cusparseDbsrsv2_solve(cusparseHandle, order, \ // apply ilu0
operation, Nb, nnzbs_prec, &one, \ cusparseSbsrsv2_solve(cusparseHandle, order,
descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_p, d_t, policy, d_buffer); operation, Nb, nnzbs_prec, &one,
cusparseDbsrsv2_solve(cusparseHandle, order, \ descr_L, d_mVals, d_mRows, d_mCols, block_size,
operation, Nb, nnzbs_prec, &one, \ info_L, d_p, d_t, policy, d_buffer);
descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_pw, policy, d_buffer); cusparseSbsrsv2_solve(cusparseHandle, order,
operation, Nb, nnzbs_prec, &one,
// spmv descr_U, d_mVals, d_mRows, d_mCols, block_size,
cusparseDbsrmv(cusparseHandle, order, \ info_U, d_t, d_pw, policy, d_buffer);
operation, Nb, Nb, nnzb, \ // spmv
&one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_pw, &zero, d_v); cusparseSbsrmv(cusparseHandle, order,
operation, Nb, Nb, nnzb,
&one, descr_M, d_bVals, d_bRows,
d_bCols, block_size, d_pw, &zero, d_v);
} else {
// apply ilu0
cusparseDbsrsv2_solve(cusparseHandle, order,
operation, Nb, nnzbs_prec, &one,
descr_L, d_mVals, d_mRows, d_mCols, block_size,
info_L, d_p, d_t, policy, d_buffer);
cusparseDbsrsv2_solve(cusparseHandle, order,
operation, Nb, nnzbs_prec, &one,
descr_U, d_mVals, d_mRows, d_mCols, block_size,
info_U, d_t, d_pw, policy, d_buffer);
// spmv
cusparseDbsrmv(cusparseHandle, order,
operation, Nb, Nb, nnzb,
&one, descr_M, d_bVals, d_bRows, d_bCols, block_size,
d_pw, &zero, d_v);
}
// apply wellContributions // apply wellContributions
if (wellContribs.getNumWells() > 0) { if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsCuda<Scalar>&>(wellContribs).apply(d_pw, d_v); static_cast<WellContributionsCuda<Scalar>&>(wellContribs).apply(d_pw, d_v);
} }
cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1); if constexpr (std::is_same_v<Scalar,float>) {
cublasSdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
} else {
cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
}
alpha = rho / tmp1; alpha = rho / tmp1;
nalpha = -alpha; nalpha = -alpha;
cublasDaxpy(cublasHandle, n, &nalpha, d_v, 1, d_r, 1); if constexpr (std::is_same_v<Scalar,float>) {
cublasDaxpy(cublasHandle, n, &alpha, d_pw, 1, d_x, 1); cublasSaxpy(cublasHandle, n, &nalpha, d_v, 1, d_r, 1);
cublasDnrm2(cublasHandle, n, d_r, 1, &norm); cublasSaxpy(cublasHandle, n, &alpha, d_pw, 1, d_x, 1);
cublasSnrm2(cublasHandle, n, d_r, 1, &norm);
} else {
cublasDaxpy(cublasHandle, n, &nalpha, d_v, 1, d_r, 1);
cublasDaxpy(cublasHandle, n, &alpha, d_pw, 1, d_x, 1);
cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
}
if (norm < tolerance * norm_0) { if (norm < tolerance * norm_0) {
break; break;
@ -166,32 +222,65 @@ gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res)
it += 0.5; it += 0.5;
// apply ilu0 if constexpr (std::is_same_v<Scalar,float>) {
cusparseDbsrsv2_solve(cusparseHandle, order, \ // apply ilu0
operation, Nb, nnzbs_prec, &one, \ cusparseSbsrsv2_solve(cusparseHandle, order,
descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_r, d_t, policy, d_buffer); operation, Nb, nnzbs_prec, &one,
cusparseDbsrsv2_solve(cusparseHandle, order, \ descr_L, d_mVals, d_mRows, d_mCols, block_size,
operation, Nb, nnzbs_prec, &one, \ info_L, d_r, d_t, policy, d_buffer);
descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_s, policy, d_buffer);
// spmv cusparseSbsrsv2_solve(cusparseHandle, order,
cusparseDbsrmv(cusparseHandle, order, \ operation, Nb, nnzbs_prec, &one,
operation, Nb, Nb, nnzb, &one, descr_M, \ descr_U, d_mVals, d_mRows, d_mCols, block_size,
d_bVals, d_bRows, d_bCols, block_size, d_s, &zero, d_t); info_U, d_t, d_s, policy, d_buffer);
// spmv
cusparseSbsrmv(cusparseHandle, order,
operation, Nb, Nb, nnzb, &one, descr_M,
d_bVals, d_bRows, d_bCols, block_size, d_s, &zero, d_t);
} else {
// apply ilu0
cusparseDbsrsv2_solve(cusparseHandle, order,
operation, Nb, nnzbs_prec, &one,
descr_L, d_mVals, d_mRows, d_mCols, block_size,
info_L, d_r, d_t, policy, d_buffer);
cusparseDbsrsv2_solve(cusparseHandle, order,
operation, Nb, nnzbs_prec, &one,
descr_U, d_mVals, d_mRows, d_mCols, block_size,
info_U, d_t, d_s, policy, d_buffer);
// spmv
cusparseDbsrmv(cusparseHandle, order,
operation, Nb, Nb, nnzb, &one, descr_M,
d_bVals, d_bRows, d_bCols, block_size, d_s, &zero, d_t);
}
// apply wellContributions // apply wellContributions
if (wellContribs.getNumWells() > 0) { if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsCuda<Scalar>&>(wellContribs).apply(d_s, d_t); static_cast<WellContributionsCuda<Scalar>&>(wellContribs).apply(d_s, d_t);
} }
cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1); if constexpr (std::is_same_v<Scalar,float>) {
cublasDdot(cublasHandle, n, d_t, 1, d_t, 1, &tmp2); cublasSdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
cublasSdot(cublasHandle, n, d_t, 1, d_t, 1, &tmp2);
} else {
cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
cublasDdot(cublasHandle, n, d_t, 1, d_t, 1, &tmp2);
}
omega = tmp1 / tmp2; omega = tmp1 / tmp2;
nomega = -omega; nomega = -omega;
cublasDaxpy(cublasHandle, n, &omega, d_s, 1, d_x, 1);
cublasDaxpy(cublasHandle, n, &nomega, d_t, 1, d_r, 1);
cublasDnrm2(cublasHandle, n, d_r, 1, &norm); if constexpr (std::is_same_v<Scalar,float>) {
cublasSaxpy(cublasHandle, n, &omega, d_s, 1, d_x, 1);
cublasSaxpy(cublasHandle, n, &nomega, d_t, 1, d_r, 1);
cublasSnrm2(cublasHandle, n, d_r, 1, &norm);
} else {
cublasDaxpy(cublasHandle, n, &omega, d_s, 1, d_x, 1);
cublasDaxpy(cublasHandle, n, &nomega, d_t, 1, d_r, 1);
cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
}
if (norm < tolerance * norm_0) { if (norm < tolerance * norm_0) {
break; break;
@ -470,21 +559,42 @@ bool cusparseSolverBackend<Scalar,block_size>::analyse_matrix()
cusparseCreateBsrsv2Info(&info_U); cusparseCreateBsrsv2Info(&info_U);
cudaCheckLastError("Could not create analysis info"); cudaCheckLastError("Could not create analysis info");
cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzbs_prec, if constexpr (std::is_same_v<Scalar,float>) {
descr_M, d_mVals, d_mRows, d_mCols, block_size, info_M, &d_bufferSize_M); cusparseSbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzbs_prec,
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzbs_prec, descr_M, d_mVals, d_mRows, d_mCols, block_size,
descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, &d_bufferSize_L); info_M, &d_bufferSize_M);
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzbs_prec, cusparseSbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzbs_prec,
descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, &d_bufferSize_U); descr_L, d_mVals, d_mRows, d_mCols, block_size,
cudaCheckLastError(); info_L, &d_bufferSize_L);
cusparseSbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzbs_prec,
descr_U, d_mVals, d_mRows, d_mCols, block_size,
info_U, &d_bufferSize_U);
} else {
cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzbs_prec,
descr_M, d_mVals, d_mRows, d_mCols, block_size,
info_M, &d_bufferSize_M);
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzbs_prec,
descr_L, d_mVals, d_mRows, d_mCols, block_size,
info_L, &d_bufferSize_L);
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzbs_prec,
descr_U, d_mVals, d_mRows, d_mCols, block_size,
info_U, &d_bufferSize_U);
}
d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U)); d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
cudaMalloc((void**)&d_buffer, d_bufferSize); cudaMalloc((void**)&d_buffer, d_bufferSize);
// analysis of ilu LU decomposition // analysis of ilu LU decomposition
cusparseDbsrilu02_analysis(cusparseHandle, order, \ if constexpr (std::is_same_v<Scalar,float>) {
Nb, nnzbs_prec, descr_B, d_mVals, d_mRows, d_mCols, \ cusparseSbsrilu02_analysis(cusparseHandle, order,
block_size, info_M, policy, d_buffer); Nb, nnzbs_prec, descr_B, d_mVals, d_mRows, d_mCols,
block_size, info_M, policy, d_buffer);
} else {
cusparseDbsrilu02_analysis(cusparseHandle, order,
Nb, nnzbs_prec, descr_B, d_mVals, d_mRows, d_mCols,
block_size, info_M, policy, d_buffer);
}
int structural_zero; int structural_zero;
cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero); cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
@ -493,13 +603,21 @@ bool cusparseSolverBackend<Scalar,block_size>::analyse_matrix()
} }
// analysis of ilu apply // analysis of ilu apply
cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \ if constexpr (std::is_same_v<Scalar,float>) {
Nb, nnzbs_prec, descr_L, d_mVals, d_mRows, d_mCols, \ cusparseSbsrsv2_analysis(cusparseHandle, order, operation,
block_size, info_L, policy, d_buffer); Nb, nnzbs_prec, descr_L, d_mVals, d_mRows, d_mCols,
block_size, info_L, policy, d_buffer);
cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \ cusparseSbsrsv2_analysis(cusparseHandle, order, operation,
Nb, nnzbs_prec, descr_U, d_mVals, d_mRows, d_mCols, \ Nb, nnzbs_prec, descr_U, d_mVals, d_mRows, d_mCols,
block_size, info_U, policy, d_buffer); block_size, info_U, policy, d_buffer);
} else {
cusparseDbsrsv2_analysis(cusparseHandle, order, operation,
Nb, nnzbs_prec, descr_L, d_mVals, d_mRows, d_mCols,
block_size, info_L, policy, d_buffer);
cusparseDbsrsv2_analysis(cusparseHandle, order, operation,
Nb, nnzbs_prec, descr_U, d_mVals, d_mRows, d_mCols,
block_size, info_U, policy, d_buffer);
}
cudaCheckLastError("Could not analyse level information"); cudaCheckLastError("Could not analyse level information");
if (verbosity > 2) { if (verbosity > 2) {
@ -519,9 +637,15 @@ bool cusparseSolverBackend<Scalar,block_size>::create_preconditioner()
{ {
Timer t; Timer t;
cusparseDbsrilu02(cusparseHandle, order, \ if constexpr (std::is_same_v<Scalar,float>) {
Nb, nnzbs_prec, descr_M, d_mVals, d_mRows, d_mCols, \ cusparseSbsrilu02(cusparseHandle, order,
block_size, info_M, policy, d_buffer); Nb, nnzbs_prec, descr_M, d_mVals, d_mRows, d_mCols,
block_size, info_M, policy, d_buffer);
} else {
cusparseDbsrilu02(cusparseHandle, order,
Nb, nnzbs_prec, descr_M, d_mVals, d_mRows, d_mCols,
block_size, info_M, policy, d_buffer);
}
cudaCheckLastError("Could not perform ilu decomposition"); cudaCheckLastError("Could not perform ilu decomposition");
int structural_zero; int structural_zero;