From 91a3e238ce8fc49cf2b014f76ff87496e8f6e663 Mon Sep 17 00:00:00 2001 From: Tong Dong Qiu Date: Wed, 18 Jan 2023 10:11:42 +0100 Subject: [PATCH] Add rocsparseSolver --- CMakeLists.txt | 13 + CMakeLists_files.cmake | 4 + opm-simulators-prereqs.cmake | 4 + opm/simulators/linalg/bda/BdaBridge.cpp | 11 + .../linalg/bda/WellContributions.cpp | 6 + .../linalg/bda/rocalutionSolverBackend.cpp | 6 + .../linalg/bda/rocsparseSolverBackend.cpp | 556 ++++++++++++++++++ .../linalg/bda/rocsparseSolverBackend.hpp | 153 +++++ 8 files changed, 753 insertions(+) create mode 100644 opm/simulators/linalg/bda/rocsparseSolverBackend.cpp create mode 100644 opm/simulators/linalg/bda/rocsparseSolverBackend.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 477d5c23c..a061f36cb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -209,6 +209,14 @@ else() endif() endif() +find_package(rocblas) +find_package(rocsparse) + +if(rocsparse_FOUND AND rocblas_FOUND) + set(HAVE_ROCSPARSE 1) + set(COMPILE_BDA_BRIDGE 1) +endif() + find_package(amgcl) if(amgcl_FOUND) set(HAVE_AMGCL 1) @@ -531,6 +539,11 @@ if(ROCALUTION_FOUND) target_include_directories(opmsimulators PUBLIC ${rocalution_INCLUDE_DIR}/rocalution) endif() +if(rocsparse_FOUND AND rocblas_FOUND) + target_link_libraries( opmsimulators PUBLIC rocsparse ) + target_link_libraries( opmsimulators PUBLIC rocblas ) +endif() + if(VexCL_FOUND) target_link_libraries( opmsimulators PUBLIC OPM::VexCL::OpenCL ) endif() diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake index 60b5c1603..836b09fad 100644 --- a/CMakeLists_files.cmake +++ b/CMakeLists_files.cmake @@ -161,6 +161,9 @@ endif() if(ROCALUTION_FOUND) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocalutionSolverBackend.cpp) endif() +if(rocsparse_FOUND AND rocblas_FOUND) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocsparseSolverBackend.cpp) +endif() if(COMPILE_BDA_BRIDGE) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp) endif() @@ -337,6 +340,7 @@ list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/bda/Matrix.hpp opm/simulators/linalg/bda/MultisegmentWellContribution.hpp opm/simulators/linalg/bda/rocalutionSolverBackend.hpp + opm/simulators/linalg/bda/rocsparseSolverBackend.hpp opm/simulators/linalg/bda/WellContributions.hpp opm/simulators/linalg/amgcpr.hh opm/simulators/linalg/twolevelmethodcpr.hh diff --git a/opm-simulators-prereqs.cmake b/opm-simulators-prereqs.cmake index dfb5f1711..6f00c8d0b 100644 --- a/opm-simulators-prereqs.cmake +++ b/opm-simulators-prereqs.cmake @@ -12,6 +12,7 @@ set (opm-simulators_CONFIG_VAR HAVE_AMGCL HAVE_VEXCL HAVE_ROCALUTION + HAVE_ROCSPARSE HAVE_SUITESPARSE_UMFPACK_H HAVE_DUNE_ISTL DUNE_ISTL_WITH_CHECKING @@ -44,6 +45,9 @@ set (opm-simulators_DEPS "SuperLU" # ROCALUTION from ROCM framework "rocalution" + # packages from ROCm framework + "rocblas" + "rocsparse" # OPM dependency "opm-common REQUIRED" "opm-grid REQUIRED" diff --git a/opm/simulators/linalg/bda/BdaBridge.cpp b/opm/simulators/linalg/bda/BdaBridge.cpp index 0d1a7e18c..e3752133e 100644 --- a/opm/simulators/linalg/bda/BdaBridge.cpp +++ b/opm/simulators/linalg/bda/BdaBridge.cpp @@ -45,6 +45,10 @@ #include #endif +#if HAVE_ROCSPARSE +#include +#endif + typedef Dune::InverseOperatorResult InverseOperatorResult; namespace Opm @@ -91,6 +95,13 @@ BdaBridge::BdaBridge(std::string acceler backend.reset(new Opm::Accelerator::rocalutionSolverBackend(linear_solver_verbosity, maxit, tolerance)); #else OPM_THROW(std::logic_error, "Error rocalutionSolver was chosen, but rocalution was not found by CMake"); +#endif + } else if (accelerator_mode.compare("rocsparse") == 0) { +#if HAVE_ROCSPARSE + use_gpu = true; // should be replaced by a 'use_bridge' boolean + backend.reset(new Opm::Accelerator::rocsparseSolverBackend(linear_solver_verbosity, maxit, tolerance, platformID, deviceID)); +#else + OPM_THROW(std::logic_error, "Error openclSolver was chosen, but rocsparse was not found by CMake"); #endif } else if (accelerator_mode.compare("none") == 0) { use_gpu = false; diff --git a/opm/simulators/linalg/bda/WellContributions.cpp b/opm/simulators/linalg/bda/WellContributions.cpp index bd3011c6f..4e0675b40 100644 --- a/opm/simulators/linalg/bda/WellContributions.cpp +++ b/opm/simulators/linalg/bda/WellContributions.cpp @@ -52,6 +52,12 @@ WellContributions::create(const std::string& accelerator_mode, bool useWellConn) OPM_THROW(std::runtime_error, "Cannot initialize well contributions: OpenCL is not enabled"); #endif } + else if(accelerator_mode.compare("rocsparse") == 0){ + if (!useWellConn) { + OPM_THROW(std::logic_error, "Error rocsparse requires --matrix-add-well-contributions=true"); + } + return std::make_unique(); + } else if(accelerator_mode.compare("amgcl") == 0){ if (!useWellConn) { OPM_THROW(std::logic_error, "Error amgcl requires --matrix-add-well-contributions=true"); diff --git a/opm/simulators/linalg/bda/rocalutionSolverBackend.cpp b/opm/simulators/linalg/bda/rocalutionSolverBackend.cpp index 95b372e67..c09546357 100644 --- a/opm/simulators/linalg/bda/rocalutionSolverBackend.cpp +++ b/opm/simulators/linalg/bda/rocalutionSolverBackend.cpp @@ -26,6 +26,12 @@ #include #include +// WellContributions are included via the solver +// MultisegmentWellContribution includes the cuda runtime if found by CMake +// this leads to inclusion of both amd_hip_vector_types.h and vector_types.h +// which both define vector types like uchar2, short3 and double4. +#undef HAVE_CUDA + #include #include diff --git a/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp b/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp new file mode 100644 index 000000000..e231d2390 --- /dev/null +++ b/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp @@ -0,0 +1,556 @@ +/* + Copyright 2020 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#include +#include +#include + +#include +#include +#include + +// WellContributions are included via the solver +// MultisegmentWellContribution includes the cuda runtime if found by CMake +// this leads to inclusion of both amd_hip_vector_types.h and vector_types.h +// which both define vector types like uchar2, short3 and double4. +#undef HAVE_CUDA + +#include + +#include + +#include + +#define HIP_CHECK(stat) \ + { \ + if(stat != hipSuccess) \ + { \ + OPM_THROW(std::logic_error, "HIP error"); \ + } \ + } + +#define ROCSPARSE_CHECK(stat) \ + { \ + if(stat != rocsparse_status_success) \ + { \ + OPM_THROW(std::logic_error, "rocsparse error"); \ + } \ + } + +#define ROCBLAS_CHECK(stat) \ + { \ + if(stat != rocblas_status_success) \ + { \ + OPM_THROW(std::logic_error, "rocblas error"); \ + } \ + } + + +namespace Opm +{ +namespace Accelerator +{ + +using Opm::OpmLog; +using Dune::Timer; + +template +rocsparseSolverBackend::rocsparseSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : BdaSolver(verbosity_, maxit_, tolerance_, platformID_, deviceID_) { + +} + + + +template +rocsparseSolverBackend::~rocsparseSolverBackend() { + try { + HIP_CHECK(hipStreamSynchronize(stream)); + HIP_CHECK(hipStreamDestroy(stream)); + ROCSPARSE_CHECK(rocsparse_destroy_handle(handle)); + ROCBLAS_CHECK(rocblas_destroy_handle(blas_handle)); + } catch (const std::logic_error& err) { + OpmLog::error(err.what()); + } +} + + +template +void rocsparseSolverBackend::gpu_pbicgstab([[maybe_unused]] WellContributions& wellContribs, + BdaResult& res) +{ + float it = 0.5; + double rho, rhop, beta, alpha, nalpha, omega, nomega, tmp1, tmp2; + double norm, norm_0; + double zero = 0.0; + double one = 1.0; + double mone = -1.0; + + Timer t_total, t_prec(false), t_spmv(false), t_rest(false); + + ROCSPARSE_CHECK(rocsparse_dbsrmv(handle, dir, operation, + Nb, Nb, nnzb, &one, descr_M, + d_Avals, d_Arows, d_Acols, block_size, + d_x, &zero, d_r)); + + ROCBLAS_CHECK(rocblas_dscal(blas_handle, N, &mone, d_r, 1)); + ROCBLAS_CHECK(rocblas_daxpy(blas_handle, N, &one, d_b, 1, d_r, 1)); + ROCBLAS_CHECK(rocblas_dcopy(blas_handle, N, d_r, 1, d_rw, 1)); + ROCBLAS_CHECK(rocblas_dcopy(blas_handle, N, d_r, 1, d_p, 1)); + ROCBLAS_CHECK(rocblas_dnrm2(blas_handle, N, d_r, 1, &norm_0)); + + if (verbosity >= 2) { + std::ostringstream out; + out << std::scientific << "rocsparseSolver initial norm: " << norm_0; + OpmLog::info(out.str()); + } + + if (verbosity >= 3) { + t_rest.start(); + } + for (it = 0.5; it < maxit; it += 0.5) { + rhop = rho; + ROCBLAS_CHECK(rocblas_ddot(blas_handle, N, d_rw, 1, d_r, 1, &rho)); + + if (it > 1) { + beta = (rho / rhop) * (alpha / omega); + nomega = -omega; + ROCBLAS_CHECK(rocblas_daxpy(blas_handle, N, &nomega, d_v, 1, d_p, 1)); + ROCBLAS_CHECK(rocblas_dscal(blas_handle, N, &beta, d_p, 1)); + ROCBLAS_CHECK(rocblas_daxpy(blas_handle, N, &one, d_r, 1, d_p, 1)); + } + if (verbosity >= 3) { + HIP_CHECK(hipStreamSynchronize(stream)); + t_rest.stop(); + t_prec.start(); + } + + // apply ilu0 + ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(handle, dir, \ + operation, Nb, nnzbs_prec, &one, \ + descr_L, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, d_p, d_t, rocsparse_solve_policy_auto, d_buffer)); + ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(handle, dir, \ + operation, Nb, nnzbs_prec, &one, \ + descr_U, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, d_t, d_pw, rocsparse_solve_policy_auto, d_buffer)); + if (verbosity >= 3) { + HIP_CHECK(hipStreamSynchronize(stream)); + t_prec.stop(); + t_spmv.start(); + } + + // spmv + ROCSPARSE_CHECK(rocsparse_dbsrmv(handle, dir, operation, + Nb, Nb, nnzb, &one, descr_M, + d_Avals, d_Arows, d_Acols, block_size, + d_pw, &zero, d_v)); + if (verbosity >= 3) { + HIP_CHECK(hipStreamSynchronize(stream)); + t_spmv.stop(); + t_rest.start(); + } + + // apply wellContributions + + ROCBLAS_CHECK(rocblas_ddot(blas_handle, N, d_rw, 1, d_v, 1, &tmp1)); + alpha = rho / tmp1; + nalpha = -alpha; + ROCBLAS_CHECK(rocblas_daxpy(blas_handle, N, &nalpha, d_v, 1, d_r, 1)); + ROCBLAS_CHECK(rocblas_daxpy(blas_handle, N, &alpha, d_pw, 1, d_x, 1)); + ROCBLAS_CHECK(rocblas_dnrm2(blas_handle, N, d_r, 1, &norm)); + if (verbosity >= 3) { + HIP_CHECK(hipStreamSynchronize(stream)); + t_rest.stop(); + } + + if (norm < tolerance * norm_0) { + break; + } + + it += 0.5; + + // apply ilu0 + if (verbosity >= 3) { + t_prec.start(); + } + ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(handle, dir, \ + operation, Nb, nnzbs_prec, &one, \ + descr_L, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, d_r, d_t, rocsparse_solve_policy_auto, d_buffer)); + ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(handle, dir, \ + operation, Nb, nnzbs_prec, &one, \ + descr_U, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, d_t, d_s, rocsparse_solve_policy_auto, d_buffer)); + if (verbosity >= 3) { + HIP_CHECK(hipStreamSynchronize(stream)); + t_prec.stop(); + t_spmv.start(); + } + + // spmv + ROCSPARSE_CHECK(rocsparse_dbsrmv(handle, dir, operation, + Nb, Nb, nnzb, &one, descr_M, + d_Avals, d_Arows, d_Acols, block_size, + d_s, &zero, d_t)); + if(verbosity >= 3){ + HIP_CHECK(hipStreamSynchronize(stream)); + t_spmv.stop(); + t_rest.start(); + } + + // apply wellContributions + + ROCBLAS_CHECK(rocblas_ddot(blas_handle, N, d_t, 1, d_r, 1, &tmp1)); + ROCBLAS_CHECK(rocblas_ddot(blas_handle, N, d_t, 1, d_t, 1, &tmp2)); + omega = tmp1 / tmp2; + nomega = -omega; + ROCBLAS_CHECK(rocblas_daxpy(blas_handle, N, &omega, d_s, 1, d_x, 1)); + ROCBLAS_CHECK(rocblas_daxpy(blas_handle, N, &nomega, d_t, 1, d_r, 1)); + + ROCBLAS_CHECK(rocblas_dnrm2(blas_handle, N, d_r, 1, &norm)); + if (verbosity >= 3) { + HIP_CHECK(hipStreamSynchronize(stream)); + t_rest.stop(); + } + + if (norm < tolerance * norm_0) { + break; + } + + if (verbosity > 1) { + std::ostringstream out; + out << "it: " << it << std::scientific << ", norm: " << norm; + OpmLog::info(out.str()); + } + } + + res.iterations = std::min(it, (float)maxit); + res.reduction = norm / norm_0; + res.conv_rate = static_cast(pow(res.reduction, 1.0 / it)); + res.elapsed = t_total.stop(); + res.converged = (it != (maxit + 0.5)); + + if (verbosity >= 1) { + std::ostringstream out; + out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \ + ", time per iteration: " << res.elapsed / it << ", iterations: " << it; + OpmLog::info(out.str()); + } + if (verbosity >= 3) { + std::ostringstream out; + out << "rocsparseSolver::prec_apply: " << t_prec.elapsed() << " s\n"; + out << "rocsparseSolver::spmv: " << t_spmv.elapsed() << " s\n"; + out << "rocsparseSolver::rest: " << t_rest.elapsed() << " s\n"; + out << "rocsparseSolver::total_solve: " << res.elapsed << " s\n"; + OpmLog::info(out.str()); + } +} + + +template +void rocsparseSolverBackend::initialize(std::shared_ptr matrix, std::shared_ptr jacMatrix) { + this->Nb = matrix->Nb; + this->N = Nb * block_size; + this->nnzb = matrix->nnzbs; + this->nnz = nnzb * block_size * block_size; + nnzbs_prec = nnzb; + + if (jacMatrix) { + useJacMatrix = true; + nnzbs_prec = jacMatrix->nnzbs; + } + + std::ostringstream out; + out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnzb: " << nnzb << "\n"; + if (useJacMatrix) { + out << "Blocks in ILU matrix: " << jacMatrix->nnzbs << "\n"; + } + out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n"; + out << "PlatformID: " << platformID << ", deviceID: " << deviceID << "\n"; + OpmLog::info(out.str()); + out.str(""); + out.clear(); + + mat = matrix; + jacMat = jacMatrix; + + HIP_CHECK(hipMalloc((void**)&d_r, sizeof(double) * N)); + HIP_CHECK(hipMalloc((void**)&d_rw, sizeof(double) * N)); + HIP_CHECK(hipMalloc((void**)&d_p, sizeof(double) * N)); + HIP_CHECK(hipMalloc((void**)&d_pw, sizeof(double) * N)); + HIP_CHECK(hipMalloc((void**)&d_s, sizeof(double) * N)); + HIP_CHECK(hipMalloc((void**)&d_t, sizeof(double) * N)); + HIP_CHECK(hipMalloc((void**)&d_v, sizeof(double) * N)); + + ROCSPARSE_CHECK(rocsparse_create_handle(&handle)); + ROCBLAS_CHECK(rocblas_create_handle(&blas_handle)); + + ROCSPARSE_CHECK(rocsparse_get_version(handle, &ver)); + ROCSPARSE_CHECK(rocsparse_get_git_rev(handle, rev)); + + out << "rocSPARSE version: " << ver / 100000 << "." << ver / 100 % 1000 << "." + << ver % 100 << "-" << rev << "\n"; + OpmLog::info(out.str()); + + HIP_CHECK(hipStreamCreate(&stream)); + ROCSPARSE_CHECK(rocsparse_set_stream(handle, stream)); + ROCBLAS_CHECK(rocblas_set_stream(blas_handle, stream)); + + HIP_CHECK(hipMalloc((void**)&d_Arows, sizeof(rocsparse_int) * (Nb + 1))); + HIP_CHECK(hipMalloc((void**)&d_Acols, sizeof(rocsparse_int) * nnzb)); + HIP_CHECK(hipMalloc((void**)&d_Avals, sizeof(double) * nnz)); + HIP_CHECK(hipMalloc((void**)&d_x, sizeof(double) * N)); + HIP_CHECK(hipMalloc((void**)&d_b, sizeof(double) * N)); + + if (useJacMatrix) { + HIP_CHECK(hipMalloc((void**)&d_Mrows, sizeof(rocsparse_int) * (Nb + 1))); + HIP_CHECK(hipMalloc((void**)&d_Mcols, sizeof(rocsparse_int) * nnzbs_prec)); + HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(double) * nnzbs_prec * block_size * block_size)); + } else { // preconditioner matrix is same + HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(double) * nnzbs_prec * block_size * block_size)); + d_Mcols = d_Acols; + d_Mrows = d_Arows; + } + + initialized = true; +} // end initialize() + +template +void rocsparseSolverBackend::copy_system_to_gpu(double *b) { + Timer t; + + HIP_CHECK(hipMemcpyAsync(d_Arows, mat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, stream)); + HIP_CHECK(hipMemcpyAsync(d_Acols, mat->colIndices, sizeof(rocsparse_int) * nnzb, hipMemcpyHostToDevice, stream)); + HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues, sizeof(double) * nnz, hipMemcpyHostToDevice, stream)); + if (useJacMatrix) { + HIP_CHECK(hipMemcpyAsync(d_Mrows, jacMat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, stream)); + HIP_CHECK(hipMemcpyAsync(d_Mcols, jacMat->colIndices, sizeof(rocsparse_int) * nnzbs_prec, hipMemcpyHostToDevice, stream)); + HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues, sizeof(double) * nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, stream)); + } else { + HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(double) * nnz, hipMemcpyDeviceToDevice, stream)); + } + HIP_CHECK(hipMemsetAsync(d_x, 0, sizeof(double) * N, stream)); + HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream)); + + if (verbosity >= 3) { + HIP_CHECK(hipStreamSynchronize(stream)); + std::ostringstream out; + out << "rocsparseSolver::copy_system_to_gpu(): " << t.stop() << " s"; + OpmLog::info(out.str()); + } +} // end copy_system_to_gpu() + +// don't copy rowpointers and colindices, they stay the same +template +void rocsparseSolverBackend::update_system_on_gpu(double *b) { + Timer t; + + HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues, sizeof(double) * nnz, hipMemcpyHostToDevice, stream)); + if (useJacMatrix) { + HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues, sizeof(double) * nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, stream)); + } else { + HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(double) * nnz, hipMemcpyDeviceToDevice, stream)); + } + HIP_CHECK(hipMemsetAsync(d_x, 0, sizeof(double) * N, stream)); + HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream)); + + if (verbosity >= 3) { + HIP_CHECK(hipStreamSynchronize(stream)); + std::ostringstream out; + out << "rocsparseSolver::update_system_on_gpu(): " << t.stop() << " s"; + OpmLog::info(out.str()); + } +} // end update_system_on_gpu() + +template +bool rocsparseSolverBackend::analyze_matrix() { + size_t d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize; + Timer t; + + ROCSPARSE_CHECK(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + + ROCSPARSE_CHECK(rocsparse_create_mat_info(&ilu_info)); + + ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_M)); + + ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_L)); + ROCSPARSE_CHECK(rocsparse_set_mat_fill_mode(descr_L, rocsparse_fill_mode_lower)); + ROCSPARSE_CHECK(rocsparse_set_mat_diag_type(descr_L, rocsparse_diag_type_unit)); + + ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_U)); + ROCSPARSE_CHECK(rocsparse_set_mat_fill_mode(descr_U, rocsparse_fill_mode_upper)); + ROCSPARSE_CHECK(rocsparse_set_mat_diag_type(descr_U, rocsparse_diag_type_non_unit)); + + ROCSPARSE_CHECK(rocsparse_dbsrilu0_buffer_size(handle, dir, Nb, nnzbs_prec, + descr_M, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_M)); + ROCSPARSE_CHECK(rocsparse_dbsrsv_buffer_size(handle, dir, operation, Nb, nnzbs_prec, + descr_L, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_L)); + ROCSPARSE_CHECK(rocsparse_dbsrsv_buffer_size(handle, dir, operation, Nb, nnzbs_prec, + descr_U, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_U)); + + d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U)); + + HIP_CHECK(hipMalloc((void**)&d_buffer, d_bufferSize)); + + // analysis of ilu LU decomposition + ROCSPARSE_CHECK(rocsparse_dbsrilu0_analysis(handle, dir, \ + Nb, nnzbs_prec, descr_M, d_Mvals, d_Mrows, d_Mcols, \ + block_size, ilu_info, rocsparse_analysis_policy_reuse, rocsparse_solve_policy_auto, d_buffer)); + + int zero_position = 0; + rocsparse_status status = rocsparse_bsrilu0_zero_pivot(handle, ilu_info, &zero_position); + if (rocsparse_status_success != status) { + printf("L has structural and/or numerical zero at L(%d,%d)\n", zero_position, zero_position); + return false; + } + + // analysis of ilu apply + ROCSPARSE_CHECK(rocsparse_dbsrsv_analysis(handle, dir, operation, \ + Nb, nnzbs_prec, descr_L, d_Mvals, d_Mrows, d_Mcols, \ + block_size, ilu_info, rocsparse_analysis_policy_reuse, rocsparse_solve_policy_auto, d_buffer)); + ROCSPARSE_CHECK(rocsparse_dbsrsv_analysis(handle, dir, operation, \ + Nb, nnzbs_prec, descr_U, d_Mvals, d_Mrows, d_Mcols, \ + block_size, ilu_info, rocsparse_analysis_policy_reuse, rocsparse_solve_policy_auto, d_buffer)); + + if (verbosity >= 3) { + HIP_CHECK(hipStreamSynchronize(stream)); + std::ostringstream out; + out << "rocsparseSolver::analyze_matrix(): " << t.stop() << " s"; + OpmLog::info(out.str()); + } + + analysis_done = true; + + return true; +} // end analyze_matrix() + + +template +bool rocsparseSolverBackend::create_preconditioner() { + Timer t; + + bool result = true; + ROCSPARSE_CHECK(rocsparse_dbsrilu0(handle, dir, Nb, nnzbs_prec, descr_M, + d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, rocsparse_solve_policy_auto, d_buffer)); + + // Check for zero pivot + int zero_position = 0; + rocsparse_status status = rocsparse_bsrilu0_zero_pivot(handle, ilu_info, &zero_position); + if(rocsparse_status_success != status) + { + printf("L has structural and/or numerical zero at L(%d,%d)\n", zero_position, zero_position); + return false; + } + + if (verbosity >= 3) { + HIP_CHECK(hipStreamSynchronize(stream)); + std::ostringstream out; + out << "rocsparseSolver::create_preconditioner(): " << t.stop() << " s"; + OpmLog::info(out.str()); + } + return result; +} // end create_preconditioner() + + +template +void rocsparseSolverBackend::solve_system(WellContributions &wellContribs, BdaResult &res) { + Timer t; + + // actually solve + try { + gpu_pbicgstab(wellContribs, res); + } catch (const cl::Error& error) { + std::ostringstream oss; + oss << "rocsparseSolverBackend::solve_system error: " << error.what() << "(" << error.err() << ")\n"; + oss << getErrorString(error.err()); + // rethrow exception + OPM_THROW(std::logic_error, oss.str()); + } catch (const std::logic_error& error) { + // rethrow exception by OPM_THROW in the try{}, without this, a segfault occurs + throw error; + } + + if (verbosity >= 3) { + HIP_CHECK(hipStreamSynchronize(stream)); + std::ostringstream out; + out << "rocsparseSolver::solve_system(): " << t.stop() << " s"; + OpmLog::info(out.str()); + } + +} // end solve_system() + + +// copy result to host memory +// caller must be sure that x is a valid array +template +void rocsparseSolverBackend::get_result(double *x) { + Timer t; + + HIP_CHECK(hipMemcpyAsync(x, d_x, sizeof(double) * N, hipMemcpyDeviceToHost, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); // always wait, caller might want to use x immediately + + if (verbosity >= 3) { + std::ostringstream out; + out << "rocsparseSolver::get_result(): " << t.stop() << " s"; + OpmLog::info(out.str()); + } +} // end get_result() + + +template +SolverStatus rocsparseSolverBackend::solve_system(std::shared_ptr matrix, + double *b, + std::shared_ptr jacMatrix, + WellContributions& wellContribs, + BdaResult &res) +{ + if (initialized == false) { + initialize(matrix, jacMatrix); + copy_system_to_gpu(b); + if (analysis_done == false) { + if (!analyze_matrix()) { + return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED; + } + } + if (!create_preconditioner()) { + return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; + } + } else { + update_system_on_gpu(b); + if (!create_preconditioner()) { + return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; + } + } + solve_system(wellContribs, res); + + return SolverStatus::BDA_SOLVER_SUCCESS; +} + + +#define INSTANTIATE_BDA_FUNCTIONS(n) \ +template rocsparseSolverBackend::rocsparseSolverBackend( \ + int, int, double, unsigned int, unsigned int); + +INSTANTIATE_BDA_FUNCTIONS(1); +INSTANTIATE_BDA_FUNCTIONS(2); +INSTANTIATE_BDA_FUNCTIONS(3); +INSTANTIATE_BDA_FUNCTIONS(4); +INSTANTIATE_BDA_FUNCTIONS(5); +INSTANTIATE_BDA_FUNCTIONS(6); + +#undef INSTANTIATE_BDA_FUNCTIONS + +} // namespace Accelerator +} // namespace Opm diff --git a/opm/simulators/linalg/bda/rocsparseSolverBackend.hpp b/opm/simulators/linalg/bda/rocsparseSolverBackend.hpp new file mode 100644 index 000000000..4563b671c --- /dev/null +++ b/opm/simulators/linalg/bda/rocsparseSolverBackend.hpp @@ -0,0 +1,153 @@ +/* + Copyright 2020 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#ifndef OPM_ROCSPARSESOLVER_BACKEND_HEADER_INCLUDED +#define OPM_ROCSPARSESOLVER_BACKEND_HEADER_INCLUDED + +#include + +#include +#include +#include +#include + +#include +#include + +namespace Opm +{ +namespace Accelerator +{ + +/// This class implements a rocsparse-based ilu0-bicgstab solver on GPU +template +class rocsparseSolverBackend : public BdaSolver +{ + typedef BdaSolver Base; + + using Base::N; + using Base::Nb; + using Base::nnz; + using Base::nnzb; + using Base::verbosity; + using Base::platformID; + using Base::deviceID; + using Base::maxit; + using Base::tolerance; + using Base::initialized; + +private: + + bool useJacMatrix = false; + + bool analysis_done = false; + std::shared_ptr mat = nullptr; // original matrix + std::shared_ptr jacMat = nullptr; // matrix for preconditioner + int nnzbs_prec = 0; // number of nnz blocks in preconditioner matrix M + + rocsparse_direction dir = rocsparse_direction_row; + rocsparse_operation operation = rocsparse_operation_none; + rocsparse_handle handle; + rocblas_handle blas_handle; + rocsparse_mat_descr descr_M, descr_L, descr_U; + rocsparse_mat_info ilu_info; + hipStream_t stream; + + rocsparse_int *d_Arows, *d_Mrows; + rocsparse_int *d_Acols, *d_Mcols; + double *d_Avals, *d_Mvals; + double *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve + double *d_pw, *d_s, *d_t, *d_v; + void *d_buffer; // buffer space, used by rocsparse ilu0 analysis + int ver; + char rev[64]; + + + /// Solve linear system using ilu0-bicgstab + /// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A + /// \param[inout] res summary of solver result + void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res); + + /// Initialize GPU and allocate memory + /// \param[in] matrix matrix A + /// \param[in] jacMatrix matrix for preconditioner + void initialize(std::shared_ptr matrix, std::shared_ptr jacMatrix); + + /// Copy linear system to GPU + /// \param[in] b input vector, contains N values + void copy_system_to_gpu(double *b); + + /// Update linear system to GPU + /// \param[in] b input vector, contains N values + void update_system_on_gpu(double *b); + + /// Analyze sparsity pattern to extract parallelism + /// \return true iff analysis was successful + bool analyze_matrix(); + + /// Perform ilu0-decomposition + /// \return true iff decomposition was successful + bool create_preconditioner(); + + /// Solve linear system + /// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A + /// \param[inout] res summary of solver result + void solve_system(WellContributions &wellContribs, BdaResult &res); + +public: + /// Construct a openclSolver + /// \param[in] linear_solver_verbosity verbosity of openclSolver + /// \param[in] maxit maximum number of iterations for openclSolver + /// \param[in] tolerance required relative tolerance for openclSolver + /// \param[in] platformID the OpenCL platform to be used + /// \param[in] deviceID the device to be used + rocsparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID); + + /// For the CPR coarse solver + // rocsparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder); + + /// Destroy a openclSolver, and free memory + ~rocsparseSolverBackend(); + + /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format + /// \param[in] matrix matrix A + /// \param[in] b input vector, contains N values + /// \param[in] jacMatrix matrix for preconditioner + /// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A + /// \param[inout] res summary of solver result + /// \return status code + SolverStatus solve_system(std::shared_ptr matrix, double *b, + std::shared_ptr jacMatrix, WellContributions& wellContribs, BdaResult &res) override; + + /// Solve scalar linear system, for example a coarse system of an AMG preconditioner + /// Data is already on the GPU + // SolverStatus solve_system(BdaResult &res); + + /// Get result after linear solve, and peform postprocessing if necessary + /// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array + void get_result(double *x) override; + +}; // end class rocsparseSolverBackend + +} // namespace Accelerator +} // namespace Opm + +#endif + +