diff --git a/opm/simulators/linalg/bda/BdaBridge.cpp b/opm/simulators/linalg/bda/BdaBridge.cpp index 366e3e486..9d69b23a9 100644 --- a/opm/simulators/linalg/bda/BdaBridge.cpp +++ b/opm/simulators/linalg/bda/BdaBridge.cpp @@ -73,7 +73,8 @@ BdaBridge::BdaBridge(std::string acceler if (accelerator_mode.compare("cusparse") == 0) { #if HAVE_CUDA use_gpu = true; - backend.reset(new Opm::Accelerator::cusparseSolverBackend(linear_solver_verbosity, maxit, tolerance, deviceID)); + using CU = Accelerator::cusparseSolverBackend; + backend = std::make_unique(linear_solver_verbosity, maxit, tolerance, deviceID); #else OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake"); #endif diff --git a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu index d4d579bcf..bf6f5090a 100644 --- a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu +++ b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu @@ -44,23 +44,18 @@ extern std::shared_ptr copyThread; #endif // HAVE_OPENMP -namespace Opm -{ -namespace Accelerator -{ +namespace Opm::Accelerator { -using Opm::OpmLog; using Dune::Timer; const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; const cusparseOperation_t operation = CUSPARSE_OPERATION_NON_TRANSPOSE; const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW; - -template -cusparseSolverBackend:: +template +cusparseSolverBackend:: cusparseSolverBackend(int verbosity_, int maxit_, - double tolerance_, unsigned int deviceID_) + Scalar tolerance_, unsigned int deviceID_) : Base(verbosity_, maxit_, tolerance_, deviceID_) { // initialize CUDA device, stream and libraries @@ -70,7 +65,8 @@ cusparseSolverBackend(int verbosity_, int maxit_, cudaGetDeviceProperties(&props, deviceID); cudaCheckLastError("Could not get device properties"); std::ostringstream out; - out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor; + out << "Name GPU: " << props.name << ", Compute Capability: " + << props.major << "." << props.minor; OpmLog::info(out.str()); cudaStreamCreate(&stream); @@ -87,28 +83,29 @@ cusparseSolverBackend(int verbosity_, int maxit_, cudaCheckLastError("Could not set stream to cusparse"); } -template -cusparseSolverBackend::~cusparseSolverBackend() { +template +cusparseSolverBackend::~cusparseSolverBackend() +{ finalize(); } -template -void cusparseSolverBackend:: -gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) +template +void cusparseSolverBackend:: +gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) { Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false); int n = N; - double rho = 1.0, rhop; - double alpha, nalpha, beta; - double omega, nomega, tmp1, tmp2; - double norm, norm_0; - double zero = 0.0; - double one = 1.0; - double mone = -1.0; + Scalar rho = 1.0, rhop; + Scalar alpha, nalpha, beta; + Scalar omega, nomega, tmp1, tmp2; + Scalar norm, norm_0; + Scalar zero = 0.0; + Scalar one = 1.0; + Scalar mone = -1.0; float it; if (wellContribs.getNumWells() > 0) { - static_cast&>(wellContribs).setCudaStream(stream); + static_cast&>(wellContribs).setCudaStream(stream); } cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r); @@ -152,7 +149,7 @@ gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) // apply wellContributions if (wellContribs.getNumWells() > 0) { - static_cast&>(wellContribs).apply(d_pw, d_v); + static_cast&>(wellContribs).apply(d_pw, d_v); } cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1); @@ -183,7 +180,7 @@ gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) // apply wellContributions if (wellContribs.getNumWells() > 0) { - static_cast&>(wellContribs).apply(d_s, d_t); + static_cast&>(wellContribs).apply(d_s, d_t); } cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1); @@ -195,7 +192,6 @@ gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) cublasDnrm2(cublasHandle, n, d_r, 1, &norm); - if (norm < tolerance * norm_0) { break; } @@ -215,16 +211,17 @@ gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) if (verbosity > 0) { std::ostringstream out; - out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \ - ", time per iteration: " << res.elapsed / it << ", iterations: " << it; + out << "=== converged: " << res.converged << ", conv_rate: " + << res.conv_rate << ", time: " << res.elapsed + << ", time per iteration: " << res.elapsed / it << ", iterations: " << it; OpmLog::info(out.str()); } } -template -void cusparseSolverBackend:: -initialize(std::shared_ptr> matrix, - std::shared_ptr> jacMatrix) +template +void cusparseSolverBackend:: +initialize(std::shared_ptr> matrix, + std::shared_ptr> jacMatrix) { this->Nb = matrix->Nb; this->N = Nb * block_size; @@ -239,46 +236,49 @@ initialize(std::shared_ptr> matrix, } std::ostringstream out; - out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnz: " << nnzb << " blocks\n"; + out << "Initializing GPU, matrix size: " << Nb + << " blockrows, nnz: " << nnzb << " blocks\n"; if (useJacMatrix) { out << "Blocks in ILU matrix: " << nnzbs_prec << "\n"; } - out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n"; + out << "Maxit: " << maxit << std::scientific + << ", tolerance: " << tolerance << "\n"; OpmLog::info(out.str()); - cudaMalloc((void**)&d_x, sizeof(double) * N); - cudaMalloc((void**)&d_b, sizeof(double) * N); - cudaMalloc((void**)&d_r, sizeof(double) * N); - cudaMalloc((void**)&d_rw, sizeof(double) * N); - cudaMalloc((void**)&d_p, sizeof(double) * N); - cudaMalloc((void**)&d_pw, sizeof(double) * N); - cudaMalloc((void**)&d_s, sizeof(double) * N); - cudaMalloc((void**)&d_t, sizeof(double) * N); - cudaMalloc((void**)&d_v, sizeof(double) * N); - cudaMalloc((void**)&d_bVals, sizeof(double) * nnz); + cudaMalloc((void**)&d_x, sizeof(Scalar) * N); + cudaMalloc((void**)&d_b, sizeof(Scalar) * N); + cudaMalloc((void**)&d_r, sizeof(Scalar) * N); + cudaMalloc((void**)&d_rw, sizeof(Scalar) * N); + cudaMalloc((void**)&d_p, sizeof(Scalar) * N); + cudaMalloc((void**)&d_pw, sizeof(Scalar) * N); + cudaMalloc((void**)&d_s, sizeof(Scalar) * N); + cudaMalloc((void**)&d_t, sizeof(Scalar) * N); + cudaMalloc((void**)&d_v, sizeof(Scalar) * N); + cudaMalloc((void**)&d_bVals, sizeof(Scalar) * nnz); cudaMalloc((void**)&d_bCols, sizeof(int) * nnzb); cudaMalloc((void**)&d_bRows, sizeof(int) * (Nb + 1)); if (useJacMatrix) { - cudaMalloc((void**)&d_mVals, sizeof(double) * nnzbs_prec * block_size * block_size); + cudaMalloc((void**)&d_mVals, sizeof(Scalar) * nnzbs_prec * block_size * block_size); cudaMalloc((void**)&d_mCols, sizeof(int) * nnzbs_prec); cudaMalloc((void**)&d_mRows, sizeof(int) * (Nb + 1)); } else { - cudaMalloc((void**)&d_mVals, sizeof(double) * nnz); + cudaMalloc((void**)&d_mVals, sizeof(Scalar) * nnz); d_mCols = d_bCols; d_mRows = d_bRows; } cudaCheckLastError("Could not allocate enough memory on GPU"); #if COPY_ROW_BY_ROW - cudaMallocHost((void**)&vals_contiguous, sizeof(double) * nnz); + cudaMallocHost((void**)&vals_contiguous, sizeof(Scalar) * nnz); cudaCheckLastError("Could not allocate pinned memory"); #endif initialized = true; } // end initialize() -template -void cusparseSolverBackend::finalize() { +template +void cusparseSolverBackend::finalize() +{ if (initialized) { cudaFree(d_x); cudaFree(d_b); @@ -314,44 +314,54 @@ void cusparseSolverBackend::finalize() { } } // end finalize() - -template -void cusparseSolverBackend:: -copy_system_to_gpu(std::shared_ptr> matrix, - double *b, - std::shared_ptr> jacMatrix) +template +void cusparseSolverBackend:: +copy_system_to_gpu(std::shared_ptr> matrix, + Scalar* b, + std::shared_ptr> jacMatrix) { Timer t; - cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream); - cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream); + cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(d_b, b, N * sizeof(Scalar), cudaMemcpyHostToDevice, stream); + cudaMemsetAsync(d_x, 0, N * sizeof(Scalar), stream); #if COPY_ROW_BY_ROW int sum = 0; for (int i = 0; i < Nb; ++i) { int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i]; - memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size); + memcpy(vals_contiguous + sum, matrix->nnzValues + sum, + size_row * sizeof(Scalar) * block_size * block_size); sum += size_row * block_size * block_size; } - cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(d_bVals, vals_contiguous, + nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream); #else - cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(d_bVals, matrix->nnzValues, + nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream); if (useJacMatrix) { #if HAVE_OPENMP if(omp_get_max_threads() > 1) copyThread->join(); #endif - cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, + nnzbs_prec * block_size * block_size * sizeof(Scalar), + cudaMemcpyHostToDevice, stream); } else { - cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream); + cudaMemcpyAsync(d_mVals, d_bVals, + nnz * sizeof(Scalar), + cudaMemcpyDeviceToDevice, stream); } #endif if (useJacMatrix) { - cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int), + cudaMemcpyHostToDevice, stream); } if (verbosity >= 3) { @@ -364,37 +374,43 @@ copy_system_to_gpu(std::shared_ptr> matrix, } } // end copy_system_to_gpu() - // don't copy rowpointers and colindices, they stay the same -template -void cusparseSolverBackend:: -update_system_on_gpu(std::shared_ptr> matrix, - double *b, - std::shared_ptr> jacMatrix) +template +void cusparseSolverBackend:: +update_system_on_gpu(std::shared_ptr> matrix, + Scalar* b, + std::shared_ptr> jacMatrix) { Timer t; - cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream); - cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream); + cudaMemcpyAsync(d_b, b, N * sizeof(Scalar), cudaMemcpyHostToDevice, stream); + cudaMemsetAsync(d_x, 0, sizeof(Scalar) * N, stream); #if COPY_ROW_BY_ROW int sum = 0; for (int i = 0; i < Nb; ++i) { int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i]; - memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size); + memcpy(vals_contiguous + sum, matrix->nnzValues + sum, + size_row * sizeof(Scalar) * block_size * block_size); sum += size_row * block_size * block_size; } - cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(d_bVals, vals_contiguous, + nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream); #else - cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(d_bVals, matrix->nnzValues, + nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream); if (useJacMatrix) { #if HAVE_OPENMP - if(omp_get_max_threads() > 1) - copyThread->join(); + if (omp_get_max_threads() > 1) { + copyThread->join(); + } #endif - cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, + nnzbs_prec * block_size * block_size * sizeof(Scalar), + cudaMemcpyHostToDevice, stream); } else { - cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream); + cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(Scalar), + cudaMemcpyDeviceToDevice, stream); } #endif @@ -409,10 +425,9 @@ update_system_on_gpu(std::shared_ptr> matrix, } } // end update_system_on_gpu() - -template -bool cusparseSolverBackend::analyse_matrix() { - +template +bool cusparseSolverBackend::analyse_matrix() +{ int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize; Timer t; @@ -487,8 +502,9 @@ bool cusparseSolverBackend::analyse_matrix() { return true; } // end analyse_matrix() -template -bool cusparseSolverBackend::create_preconditioner() { +template +bool cusparseSolverBackend::create_preconditioner() +{ Timer t; cusparseDbsrilu02(cusparseHandle, order, \ @@ -512,10 +528,9 @@ bool cusparseSolverBackend::create_preconditioner() { return true; } // end create_preconditioner() - -template -void cusparseSolverBackend:: -solve_system(WellContributions& wellContribs, BdaResult& res) +template +void cusparseSolverBackend:: +solve_system(WellContributions& wellContribs, BdaResult& res) { // actually solve gpu_pbicgstab(wellContribs, res); @@ -523,14 +538,14 @@ solve_system(WellContributions& wellContribs, BdaResult& res) cudaCheckLastError("Something went wrong during the GPU solve"); } // end solve_system() - // copy result to host memory // caller must be sure that x is a valid array -template -void cusparseSolverBackend::get_result(double *x) { +template +void cusparseSolverBackend::get_result(Scalar* x) +{ Timer t; - cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(x, d_x, N * sizeof(Scalar), cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); if (verbosity > 2) { @@ -540,12 +555,12 @@ void cusparseSolverBackend::get_result(double *x) { } } // end get_result() -template -SolverStatus cusparseSolverBackend:: -solve_system(std::shared_ptr> matrix, - double *b, - std::shared_ptr> jacMatrix, - WellContributions& wellContribs, +template +SolverStatus cusparseSolverBackend:: +solve_system(std::shared_ptr> matrix, + Scalar* b, + std::shared_ptr> jacMatrix, + WellContributions& wellContribs, BdaResult& res) { if (initialized == false) { @@ -567,18 +582,14 @@ solve_system(std::shared_ptr> matrix, return SolverStatus::BDA_SOLVER_SUCCESS; } +#define INSTANTIATE_TYPE(T) \ + template class cusparseSolverBackend; \ + template class cusparseSolverBackend; \ + template class cusparseSolverBackend; \ + template class cusparseSolverBackend; \ + template class cusparseSolverBackend; \ + template class cusparseSolverBackend; -#define INSTANTIATE_BDA_FUNCTIONS(n) \ -template cusparseSolverBackend::cusparseSolverBackend(int, int, double, unsigned int); \ +INSTANTIATE_TYPE(double) -INSTANTIATE_BDA_FUNCTIONS(1); -INSTANTIATE_BDA_FUNCTIONS(2); -INSTANTIATE_BDA_FUNCTIONS(3); -INSTANTIATE_BDA_FUNCTIONS(4); -INSTANTIATE_BDA_FUNCTIONS(5); -INSTANTIATE_BDA_FUNCTIONS(6); - -#undef INSTANTIATE_BDA_FUNCTIONS - -} // namespace Accelerator -} // namespace Opm +} // namespace Opm::Accelerator diff --git a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp index 748955192..48c9c9343 100644 --- a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp +++ b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp @@ -28,16 +28,13 @@ #include #include -namespace Opm -{ -namespace Accelerator -{ +namespace Opm::Accelerator { /// This class implements a cusparse-based ilu0-bicgstab solver on GPU -template -class cusparseSolverBackend : public BdaSolver { - - using Base = BdaSolver; +template +class cusparseSolverBackend : public BdaSolver +{ + using Base = BdaSolver; using Base::N; using Base::Nb; @@ -57,13 +54,13 @@ private: bsrilu02Info_t info_M; bsrsv2Info_t info_L, info_U; // b: bsr matrix, m: preconditioner - double *d_bVals, *d_mVals; + Scalar *d_bVals, *d_mVals; int *d_bCols, *d_mCols; int *d_bRows, *d_mRows; - double *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve - double *d_pw, *d_s, *d_t, *d_v; + Scalar *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve + Scalar *d_pw, *d_s, *d_t, *d_v; void *d_buffer; - double *vals_contiguous; // only used if COPY_ROW_BY_ROW is true in cusparseSolverBackend.cpp + Scalar *vals_contiguous; // only used if COPY_ROW_BY_ROW is true in cusparseSolverBackend.cpp bool analysis_done = false; @@ -76,13 +73,13 @@ private: /// Solve linear system using ilu0-bicgstab /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A /// \param[inout] res summary of solver result - void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res); + void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res); /// Initialize GPU and allocate memory /// \param[in] matrix matrix for spmv /// \param[in] jacMatrix matrix for preconditioner - void initialize(std::shared_ptr> matrix, - std::shared_ptr> jacMatrix); + void initialize(std::shared_ptr> matrix, + std::shared_ptr> jacMatrix); /// Clean memory void finalize(); @@ -92,18 +89,18 @@ private: /// \param[in] matrix matrix for spmv /// \param[in] b input vector, contains N values /// \param[in] jacMatrix matrix for preconditioner - void copy_system_to_gpu(std::shared_ptr> matrix, - double *b, - std::shared_ptr> jacMatrix); + void copy_system_to_gpu(std::shared_ptr> matrix, + Scalar* b, + std::shared_ptr> jacMatrix); /// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same /// also copy matrix for preconditioner if needed /// \param[in] matrix matrix for spmv /// \param[in] b input vector, contains N values /// \param[in] jacMatrix matrix for preconditioner - void update_system_on_gpu(std::shared_ptr> matrix, - double *b, - std::shared_ptr> jacMatrix); + void update_system_on_gpu(std::shared_ptr> matrix, + Scalar* b, + std::shared_ptr> jacMatrix); /// Analyse sparsity pattern to extract parallelism /// \return true iff analysis was successful @@ -116,17 +113,16 @@ private: /// Solve linear system /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A /// \param[inout] res summary of solver result - void solve_system(WellContributions& wellContribs, BdaResult &res); + void solve_system(WellContributions& wellContribs, BdaResult &res); public: - - /// Construct a cusparseSolver /// \param[in] linear_solver_verbosity verbosity of cusparseSolver /// \param[in] maxit maximum number of iterations for cusparseSolver /// \param[in] tolerance required relative tolerance for cusparseSolver /// \param[in] deviceID the device to be used - cusparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int deviceID); + cusparseSolverBackend(int linear_solver_verbosity, int maxit, + Scalar tolerance, unsigned int deviceID); /// Destroy a cusparseSolver, and free memory ~cusparseSolverBackend(); @@ -138,20 +134,19 @@ public: /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A /// \param[inout] res summary of solver result /// \return status code - SolverStatus solve_system(std::shared_ptr> matrix, - double *b, - std::shared_ptr> jacMatrix, - WellContributions& wellContribs, + SolverStatus solve_system(std::shared_ptr> matrix, + Scalar* b, + std::shared_ptr> jacMatrix, + WellContributions& wellContribs, BdaResult& res) override; /// Get resulting vector x after linear solve, also includes post processing if necessary /// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array - void get_result(double *x) override; + void get_result(Scalar* x) override; }; // end class cusparseSolverBackend -} // namespace Accelerator -} // namespace Opm +} // namespace Opm::Accelerator #endif