Merge pull request #4406 from Tongdongq/rocsparse

Add rocsparseSolver
This commit is contained in:
Markus Blatt 2023-04-12 12:07:58 +02:00 committed by GitHub
commit 8142788b58
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 1057 additions and 6 deletions

View File

@ -240,12 +240,6 @@ if(OpenCL_FOUND)
endif()
endif()
find_package(rocalution)
if(ROCALUTION_FOUND)
set(HAVE_ROCALUTION 1)
set(COMPILE_BDA_BRIDGE 1)
endif()
macro (config_hook)
opm_need_version_of ("dune-common")
@ -278,6 +272,27 @@ macro (fortran_hook)
endmacro (fortran_hook)
macro (files_hook)
if(hip_FOUND)
get_filename_component(CXX_COMPILER ${CMAKE_CXX_COMPILER} NAME)
if(hip_VERSION VERSION_LESS "5.3")
if(ROCALUTION_FOUND AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
message(WARNING " Cannot use hipcc/clang for rocalution with rocm < 5.3\n Disabling rocalutionSolver")
unset(ROCALUTION_FOUND)
unset(HAVE_ROCALUTION)
endif()
else()
if(rocsparse_FOUND AND rocblas_FOUND)
set(HAVE_ROCSPARSE 1)
set(COMPILE_BDA_BRIDGE 1)
else()
unset(HAVE_ROCSPARSE)
endif()
endif()
if(ROCALUTION_FOUND)
set(HAVE_ROCALUTION 1)
set(COMPILE_BDA_BRIDGE 1)
endif()
endif()
if(MPI_FOUND AND HDF5_FOUND AND NOT HDF5_IS_PARALLEL)
message(WARNING "When building parallel OPM flow we need a "
"parallel version of hdf5, but found only a serial one. "
@ -531,6 +546,11 @@ if(ROCALUTION_FOUND)
target_include_directories(opmsimulators PUBLIC ${rocalution_INCLUDE_DIR}/rocalution)
endif()
if(rocsparse_FOUND AND rocblas_FOUND)
target_link_libraries( opmsimulators PUBLIC roc::rocsparse )
target_link_libraries( opmsimulators PUBLIC roc::rocblas )
endif()
if(VexCL_FOUND)
target_link_libraries( opmsimulators PUBLIC OPM::VexCL::OpenCL )
endif()

View File

@ -161,6 +161,9 @@ endif()
if(ROCALUTION_FOUND)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocalutionSolverBackend.cpp)
endif()
if(rocsparse_FOUND AND rocblas_FOUND)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocsparseSolverBackend.cpp)
endif()
if(COMPILE_BDA_BRIDGE)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
endif()
@ -226,6 +229,9 @@ endif()
if(ROCALUTION_FOUND)
list(APPEND TEST_SOURCE_FILES tests/test_rocalutionSolver.cpp)
endif()
if(rocsparse_FOUND AND rocblas_FOUND)
list(APPEND TEST_SOURCE_FILES tests/test_rocsparseSolver.cpp)
endif()
if(HDF5_FOUND)
list(APPEND TEST_SOURCE_FILES tests/test_HDF5File.cpp)
list(APPEND TEST_SOURCE_FILES tests/test_HDF5Serializer.cpp)
@ -337,6 +343,7 @@ list (APPEND PUBLIC_HEADER_FILES
opm/simulators/linalg/bda/Matrix.hpp
opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
opm/simulators/linalg/bda/rocalutionSolverBackend.hpp
opm/simulators/linalg/bda/rocsparseSolverBackend.hpp
opm/simulators/linalg/bda/WellContributions.hpp
opm/simulators/linalg/amgcpr.hh
opm/simulators/linalg/twolevelmethodcpr.hh

View File

@ -12,6 +12,7 @@ set (opm-simulators_CONFIG_VAR
HAVE_AMGCL
HAVE_VEXCL
HAVE_ROCALUTION
HAVE_ROCSPARSE
HAVE_SUITESPARSE_UMFPACK_H
HAVE_DUNE_ISTL
DUNE_ISTL_WITH_CHECKING
@ -44,6 +45,9 @@ set (opm-simulators_DEPS
"SuperLU"
# ROCALUTION from ROCM framework
"rocalution"
# packages from ROCm framework
"rocblas"
"rocsparse"
# OPM dependency
"opm-common REQUIRED"
"opm-grid REQUIRED"

View File

@ -45,6 +45,10 @@
#include <opm/simulators/linalg/bda/rocalutionSolverBackend.hpp>
#endif
#if HAVE_ROCSPARSE
#include <opm/simulators/linalg/bda/rocsparseSolverBackend.hpp>
#endif
typedef Dune::InverseOperatorResult InverseOperatorResult;
namespace Opm
@ -91,6 +95,13 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
backend.reset(new Opm::Accelerator::rocalutionSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance));
#else
OPM_THROW(std::logic_error, "Error rocalutionSolver was chosen, but rocalution was not found by CMake");
#endif
} else if (accelerator_mode.compare("rocsparse") == 0) {
#if HAVE_ROCSPARSE
use_gpu = true; // should be replaced by a 'use_bridge' boolean
backend.reset(new Opm::Accelerator::rocsparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID));
#else
OPM_THROW(std::logic_error, "Error rocsparseSolver was chosen, but rocsparse/rocblas was not found by CMake");
#endif
} else if (accelerator_mode.compare("none") == 0) {
use_gpu = false;

View File

@ -52,6 +52,12 @@ WellContributions::create(const std::string& accelerator_mode, bool useWellConn)
OPM_THROW(std::runtime_error, "Cannot initialize well contributions: OpenCL is not enabled");
#endif
}
else if(accelerator_mode.compare("rocsparse") == 0){
if (!useWellConn) {
OPM_THROW(std::logic_error, "Error rocsparse requires --matrix-add-well-contributions=true");
}
return std::make_unique<WellContributions>();
}
else if(accelerator_mode.compare("amgcl") == 0){
if (!useWellConn) {
OPM_THROW(std::logic_error, "Error amgcl requires --matrix-add-well-contributions=true");

View File

@ -26,11 +26,27 @@
#include <opm/common/ErrorMacros.hpp>
#include <dune/common/timer.hh>
// WellContributions are included via the solver
// MultisegmentWellContribution includes the cuda runtime if found by CMake
// this leads to inclusion of both amd_hip_vector_types.h and vector_types.h
// which both define vector types like uchar2, short3 and double4.
// Restore the value (if defined) afterwards.
#ifdef HAVE_CUDA
#define HIP_HAVE_CUDA_DEFINED HAVE_CUDA
#endif
#undef HAVE_CUDA
#include <opm/simulators/linalg/bda/rocalutionSolverBackend.hpp>
#include <rocalution.hpp>
#include <base/matrix_formats_ind.hpp> // check if blocks are interpreted as row-major or column-major
#ifdef HIP_HAVE_CUDA_DEFINED
#define HAVE_CUDA HIP_HAVE_CUDA_DEFINED
#undef HIP_HAVE_CUDA_DEFINED
#endif
namespace Opm
{
namespace Accelerator

View File

@ -0,0 +1,612 @@
/*
Copyright 2023 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <sstream>
#include <stdexcept>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <dune/common/timer.hh>
// WellContributions are included via the solver
// MultisegmentWellContribution includes the cuda runtime if found by CMake
// this leads to inclusion of both amd_hip_vector_types.h and vector_types.h
// which both define vector types like uchar2, short3 and double4.
// Restore the value (if defined) afterwards.
#ifdef HAVE_CUDA
#define HIP_HAVE_CUDA_DEFINED HAVE_CUDA
#endif
#undef HAVE_CUDA
#include <opm/simulators/linalg/bda/rocsparseSolverBackend.hpp>
#include <opm/simulators/linalg/bda/BdaResult.hpp>
#include <hip/hip_runtime_api.h>
#include <hip/hip_version.h>
#ifdef HIP_HAVE_CUDA_DEFINED
#define HAVE_CUDA HIP_HAVE_CUDA_DEFINED
#undef HIP_HAVE_CUDA_DEFINED
#endif
#define HIP_CHECK(stat) \
{ \
if(stat != hipSuccess) \
{ \
OPM_THROW(std::logic_error, "HIP error"); \
} \
}
#define ROCSPARSE_CHECK(stat) \
{ \
if(stat != rocsparse_status_success) \
{ \
OPM_THROW(std::logic_error, "rocsparse error"); \
} \
}
#define ROCBLAS_CHECK(stat) \
{ \
if(stat != rocblas_status_success) \
{ \
OPM_THROW(std::logic_error, "rocblas error"); \
} \
}
namespace Opm
{
namespace Accelerator
{
using Opm::OpmLog;
using Dune::Timer;
template <unsigned int block_size>
rocsparseSolverBackend<block_size>::rocsparseSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_) {
hipDevice_t device;
if(hipDeviceGet(&device, deviceID) != hipSuccess)
{
OPM_THROW(std::logic_error, "HIP Error: could not get device");
}
ROCSPARSE_CHECK(rocsparse_create_handle(&handle));
ROCBLAS_CHECK(rocblas_create_handle(&blas_handle));
ROCSPARSE_CHECK(rocsparse_get_version(handle, &ver));
ROCSPARSE_CHECK(rocsparse_get_git_rev(handle, rev));
std::ostringstream out;
out << "rocSPARSE version: " << ver / 100000 << "." << ver / 100 % 1000 << "."
<< ver % 100 << "-" << rev << "\n";
OpmLog::info(out.str());
HIP_CHECK(hipStreamCreate(&stream));
ROCSPARSE_CHECK(rocsparse_set_stream(handle, stream));
ROCBLAS_CHECK(rocblas_set_stream(blas_handle, stream));
}
template <unsigned int block_size>
rocsparseSolverBackend<block_size>::~rocsparseSolverBackend() {
hipError_t hipstatus = hipStreamSynchronize(stream);
if(hipstatus != hipSuccess){
OpmLog::error("Could not synchronize with hipStream");
}
hipstatus = hipStreamDestroy(stream);
if(hipstatus != hipSuccess){
OpmLog::error("Could not destroy hipStream");
}
rocsparse_status status1 = rocsparse_destroy_handle(handle);
if(status1 != rocsparse_status_success){
OpmLog::error("Could not destroy rocsparse handle");
}
rocblas_status status2 = rocblas_destroy_handle(blas_handle);
if(status2 != rocblas_status_success){
OpmLog::error("Could not destroy rocblas handle");
}
}
template <unsigned int block_size>
void rocsparseSolverBackend<block_size>::gpu_pbicgstab([[maybe_unused]] WellContributions& wellContribs,
BdaResult& res)
{
float it = 0.5;
double rho, rhop, beta, alpha, nalpha, omega, nomega, tmp1, tmp2;
double norm, norm_0;
double zero = 0.0;
double one = 1.0;
double mone = -1.0;
Timer t_total, t_prec(false), t_spmv(false), t_rest(false);
// HIP_VERSION is defined as (HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH)
#if HIP_VERSION >= 50400000
ROCSPARSE_CHECK(rocsparse_dbsrmv_ex(handle, dir, operation,
Nb, Nb, nnzb, &one, descr_M,
d_Avals, d_Arows, d_Acols, block_size,
spmv_info, d_x, &zero, d_r));
#else
ROCSPARSE_CHECK(rocsparse_dbsrmv(handle, dir, operation,
Nb, Nb, nnzb, &one, descr_M,
d_Avals, d_Arows, d_Acols, block_size,
d_x, &zero, d_r));
#endif
ROCBLAS_CHECK(rocblas_dscal(blas_handle, N, &mone, d_r, 1));
ROCBLAS_CHECK(rocblas_daxpy(blas_handle, N, &one, d_b, 1, d_r, 1));
ROCBLAS_CHECK(rocblas_dcopy(blas_handle, N, d_r, 1, d_rw, 1));
ROCBLAS_CHECK(rocblas_dcopy(blas_handle, N, d_r, 1, d_p, 1));
ROCBLAS_CHECK(rocblas_dnrm2(blas_handle, N, d_r, 1, &norm_0));
if (verbosity >= 2) {
std::ostringstream out;
out << std::scientific << "rocsparseSolver initial norm: " << norm_0;
OpmLog::info(out.str());
}
if (verbosity >= 3) {
t_rest.start();
}
for (it = 0.5; it < maxit; it += 0.5) {
rhop = rho;
ROCBLAS_CHECK(rocblas_ddot(blas_handle, N, d_rw, 1, d_r, 1, &rho));
if (it > 1) {
beta = (rho / rhop) * (alpha / omega);
nomega = -omega;
ROCBLAS_CHECK(rocblas_daxpy(blas_handle, N, &nomega, d_v, 1, d_p, 1));
ROCBLAS_CHECK(rocblas_dscal(blas_handle, N, &beta, d_p, 1));
ROCBLAS_CHECK(rocblas_daxpy(blas_handle, N, &one, d_r, 1, d_p, 1));
}
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
t_rest.stop();
t_prec.start();
}
// apply ilu0
ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(handle, dir, \
operation, Nb, nnzbs_prec, &one, \
descr_L, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, d_p, d_t, rocsparse_solve_policy_auto, d_buffer));
ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(handle, dir, \
operation, Nb, nnzbs_prec, &one, \
descr_U, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, d_t, d_pw, rocsparse_solve_policy_auto, d_buffer));
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
t_prec.stop();
t_spmv.start();
}
// spmv
#if HIP_VERSION >= 50400000
ROCSPARSE_CHECK(rocsparse_dbsrmv_ex(handle, dir, operation,
Nb, Nb, nnzb, &one, descr_M,
d_Avals, d_Arows, d_Acols, block_size,
spmv_info, d_pw, &zero, d_v));
#else
ROCSPARSE_CHECK(rocsparse_dbsrmv(handle, dir, operation,
Nb, Nb, nnzb, &one, descr_M,
d_Avals, d_Arows, d_Acols, block_size,
d_pw, &zero, d_v));
#endif
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
t_spmv.stop();
t_rest.start();
}
// apply wellContributions
ROCBLAS_CHECK(rocblas_ddot(blas_handle, N, d_rw, 1, d_v, 1, &tmp1));
alpha = rho / tmp1;
nalpha = -alpha;
ROCBLAS_CHECK(rocblas_daxpy(blas_handle, N, &nalpha, d_v, 1, d_r, 1));
ROCBLAS_CHECK(rocblas_daxpy(blas_handle, N, &alpha, d_pw, 1, d_x, 1));
ROCBLAS_CHECK(rocblas_dnrm2(blas_handle, N, d_r, 1, &norm));
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
t_rest.stop();
}
if (norm < tolerance * norm_0) {
break;
}
it += 0.5;
// apply ilu0
if (verbosity >= 3) {
t_prec.start();
}
ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(handle, dir, \
operation, Nb, nnzbs_prec, &one, \
descr_L, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, d_r, d_t, rocsparse_solve_policy_auto, d_buffer));
ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(handle, dir, \
operation, Nb, nnzbs_prec, &one, \
descr_U, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, d_t, d_s, rocsparse_solve_policy_auto, d_buffer));
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
t_prec.stop();
t_spmv.start();
}
// spmv
#if HIP_VERSION >= 50400000
ROCSPARSE_CHECK(rocsparse_dbsrmv_ex(handle, dir, operation,
Nb, Nb, nnzb, &one, descr_M,
d_Avals, d_Arows, d_Acols, block_size,
spmv_info, d_s, &zero, d_t));
#else
ROCSPARSE_CHECK(rocsparse_dbsrmv(handle, dir, operation,
Nb, Nb, nnzb, &one, descr_M,
d_Avals, d_Arows, d_Acols, block_size,
d_s, &zero, d_t));
#endif
if(verbosity >= 3){
HIP_CHECK(hipStreamSynchronize(stream));
t_spmv.stop();
t_rest.start();
}
// apply wellContributions
ROCBLAS_CHECK(rocblas_ddot(blas_handle, N, d_t, 1, d_r, 1, &tmp1));
ROCBLAS_CHECK(rocblas_ddot(blas_handle, N, d_t, 1, d_t, 1, &tmp2));
omega = tmp1 / tmp2;
nomega = -omega;
ROCBLAS_CHECK(rocblas_daxpy(blas_handle, N, &omega, d_s, 1, d_x, 1));
ROCBLAS_CHECK(rocblas_daxpy(blas_handle, N, &nomega, d_t, 1, d_r, 1));
ROCBLAS_CHECK(rocblas_dnrm2(blas_handle, N, d_r, 1, &norm));
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
t_rest.stop();
}
if (norm < tolerance * norm_0) {
break;
}
if (verbosity > 1) {
std::ostringstream out;
out << "it: " << it << std::scientific << ", norm: " << norm;
OpmLog::info(out.str());
}
}
res.iterations = std::min(it, (float)maxit);
res.reduction = norm / norm_0;
res.conv_rate = static_cast<double>(pow(res.reduction, 1.0 / it));
res.elapsed = t_total.stop();
res.converged = (it != (maxit + 0.5));
if (verbosity >= 1) {
std::ostringstream out;
out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
", time per iteration: " << res.elapsed / it << ", iterations: " << it;
OpmLog::info(out.str());
}
if (verbosity >= 3) {
std::ostringstream out;
out << "rocsparseSolver::prec_apply: " << t_prec.elapsed() << " s\n";
out << "rocsparseSolver::spmv: " << t_spmv.elapsed() << " s\n";
out << "rocsparseSolver::rest: " << t_rest.elapsed() << " s\n";
out << "rocsparseSolver::total_solve: " << res.elapsed << " s\n";
OpmLog::info(out.str());
}
}
template <unsigned int block_size>
void rocsparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix) {
this->Nb = matrix->Nb;
this->N = Nb * block_size;
this->nnzb = matrix->nnzbs;
this->nnz = nnzb * block_size * block_size;
nnzbs_prec = nnzb;
if (jacMatrix) {
useJacMatrix = true;
nnzbs_prec = jacMatrix->nnzbs;
}
std::ostringstream out;
out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnzb: " << nnzb << "\n";
if (useJacMatrix) {
out << "Blocks in ILU matrix: " << jacMatrix->nnzbs << "\n";
}
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
out << "PlatformID: " << platformID << ", deviceID: " << deviceID << "\n";
OpmLog::info(out.str());
out.str("");
out.clear();
mat = matrix;
jacMat = jacMatrix;
HIP_CHECK(hipMalloc((void**)&d_r, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_rw, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_p, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_pw, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_s, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_t, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_v, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_Arows, sizeof(rocsparse_int) * (Nb + 1)));
HIP_CHECK(hipMalloc((void**)&d_Acols, sizeof(rocsparse_int) * nnzb));
HIP_CHECK(hipMalloc((void**)&d_Avals, sizeof(double) * nnz));
HIP_CHECK(hipMalloc((void**)&d_x, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_b, sizeof(double) * N));
if (useJacMatrix) {
HIP_CHECK(hipMalloc((void**)&d_Mrows, sizeof(rocsparse_int) * (Nb + 1)));
HIP_CHECK(hipMalloc((void**)&d_Mcols, sizeof(rocsparse_int) * nnzbs_prec));
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(double) * nnzbs_prec * block_size * block_size));
} else { // preconditioner matrix is same
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(double) * nnzbs_prec * block_size * block_size));
d_Mcols = d_Acols;
d_Mrows = d_Arows;
}
initialized = true;
} // end initialize()
template <unsigned int block_size>
void rocsparseSolverBackend<block_size>::copy_system_to_gpu(double *b) {
Timer t;
HIP_CHECK(hipMemcpyAsync(d_Arows, mat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Acols, mat->colIndices, sizeof(rocsparse_int) * nnzb, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues, sizeof(double) * nnz, hipMemcpyHostToDevice, stream));
if (useJacMatrix) {
HIP_CHECK(hipMemcpyAsync(d_Mrows, jacMat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Mcols, jacMat->colIndices, sizeof(rocsparse_int) * nnzbs_prec, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues, sizeof(double) * nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, stream));
} else {
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(double) * nnz, hipMemcpyDeviceToDevice, stream));
}
HIP_CHECK(hipMemsetAsync(d_x, 0, sizeof(double) * N, stream));
HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream));
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
std::ostringstream out;
out << "rocsparseSolver::copy_system_to_gpu(): " << t.stop() << " s";
OpmLog::info(out.str());
}
} // end copy_system_to_gpu()
// don't copy rowpointers and colindices, they stay the same
template <unsigned int block_size>
void rocsparseSolverBackend<block_size>::update_system_on_gpu(double *b) {
Timer t;
HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues, sizeof(double) * nnz, hipMemcpyHostToDevice, stream));
if (useJacMatrix) {
HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues, sizeof(double) * nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, stream));
} else {
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(double) * nnz, hipMemcpyDeviceToDevice, stream));
}
HIP_CHECK(hipMemsetAsync(d_x, 0, sizeof(double) * N, stream));
HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream));
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
std::ostringstream out;
out << "rocsparseSolver::update_system_on_gpu(): " << t.stop() << " s";
OpmLog::info(out.str());
}
} // end update_system_on_gpu()
template <unsigned int block_size>
bool rocsparseSolverBackend<block_size>::analyze_matrix() {
size_t d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
Timer t;
ROCSPARSE_CHECK(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host));
ROCSPARSE_CHECK(rocsparse_create_mat_info(&ilu_info));
#if HIP_VERSION >= 50400000
ROCSPARSE_CHECK(rocsparse_create_mat_info(&spmv_info));
#endif
ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_A));
ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_M));
ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_L));
ROCSPARSE_CHECK(rocsparse_set_mat_fill_mode(descr_L, rocsparse_fill_mode_lower));
ROCSPARSE_CHECK(rocsparse_set_mat_diag_type(descr_L, rocsparse_diag_type_unit));
ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_U));
ROCSPARSE_CHECK(rocsparse_set_mat_fill_mode(descr_U, rocsparse_fill_mode_upper));
ROCSPARSE_CHECK(rocsparse_set_mat_diag_type(descr_U, rocsparse_diag_type_non_unit));
ROCSPARSE_CHECK(rocsparse_dbsrilu0_buffer_size(handle, dir, Nb, nnzbs_prec,
descr_M, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_M));
ROCSPARSE_CHECK(rocsparse_dbsrsv_buffer_size(handle, dir, operation, Nb, nnzbs_prec,
descr_L, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_L));
ROCSPARSE_CHECK(rocsparse_dbsrsv_buffer_size(handle, dir, operation, Nb, nnzbs_prec,
descr_U, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_U));
d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
HIP_CHECK(hipMalloc((void**)&d_buffer, d_bufferSize));
// analysis of ilu LU decomposition
ROCSPARSE_CHECK(rocsparse_dbsrilu0_analysis(handle, dir, \
Nb, nnzbs_prec, descr_M, d_Mvals, d_Mrows, d_Mcols, \
block_size, ilu_info, rocsparse_analysis_policy_reuse, rocsparse_solve_policy_auto, d_buffer));
int zero_position = 0;
rocsparse_status status = rocsparse_bsrilu0_zero_pivot(handle, ilu_info, &zero_position);
if (rocsparse_status_success != status) {
printf("L has structural and/or numerical zero at L(%d,%d)\n", zero_position, zero_position);
return false;
}
// analysis of ilu apply
ROCSPARSE_CHECK(rocsparse_dbsrsv_analysis(handle, dir, operation, \
Nb, nnzbs_prec, descr_L, d_Mvals, d_Mrows, d_Mcols, \
block_size, ilu_info, rocsparse_analysis_policy_reuse, rocsparse_solve_policy_auto, d_buffer));
ROCSPARSE_CHECK(rocsparse_dbsrsv_analysis(handle, dir, operation, \
Nb, nnzbs_prec, descr_U, d_Mvals, d_Mrows, d_Mcols, \
block_size, ilu_info, rocsparse_analysis_policy_reuse, rocsparse_solve_policy_auto, d_buffer));
#if HIP_VERSION >= 50400000
ROCSPARSE_CHECK(rocsparse_dbsrmv_ex_analysis(handle, dir, operation,
Nb, Nb, nnzb,
descr_A, d_Avals, d_Arows, d_Acols,
block_size, spmv_info));
#endif
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
std::ostringstream out;
out << "rocsparseSolver::analyze_matrix(): " << t.stop() << " s";
OpmLog::info(out.str());
}
analysis_done = true;
return true;
} // end analyze_matrix()
template <unsigned int block_size>
bool rocsparseSolverBackend<block_size>::create_preconditioner() {
Timer t;
bool result = true;
ROCSPARSE_CHECK(rocsparse_dbsrilu0(handle, dir, Nb, nnzbs_prec, descr_M,
d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, rocsparse_solve_policy_auto, d_buffer));
// Check for zero pivot
int zero_position = 0;
rocsparse_status status = rocsparse_bsrilu0_zero_pivot(handle, ilu_info, &zero_position);
if(rocsparse_status_success != status)
{
printf("L has structural and/or numerical zero at L(%d,%d)\n", zero_position, zero_position);
return false;
}
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
std::ostringstream out;
out << "rocsparseSolver::create_preconditioner(): " << t.stop() << " s";
OpmLog::info(out.str());
}
return result;
} // end create_preconditioner()
template <unsigned int block_size>
void rocsparseSolverBackend<block_size>::solve_system(WellContributions &wellContribs, BdaResult &res) {
Timer t;
// actually solve
try {
gpu_pbicgstab(wellContribs, res);
} catch (const cl::Error& error) {
std::ostringstream oss;
oss << "rocsparseSolverBackend::solve_system error: " << error.what() << "(" << error.err() << ")\n";
oss << getErrorString(error.err());
// rethrow exception
OPM_THROW(std::logic_error, oss.str());
} catch (const std::logic_error& error) {
// rethrow exception by OPM_THROW in the try{}, without this, a segfault occurs
throw error;
}
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
std::ostringstream out;
out << "rocsparseSolver::solve_system(): " << t.stop() << " s";
OpmLog::info(out.str());
}
} // end solve_system()
// copy result to host memory
// caller must be sure that x is a valid array
template <unsigned int block_size>
void rocsparseSolverBackend<block_size>::get_result(double *x) {
Timer t;
HIP_CHECK(hipMemcpyAsync(x, d_x, sizeof(double) * N, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipStreamSynchronize(stream)); // always wait, caller might want to use x immediately
if (verbosity >= 3) {
std::ostringstream out;
out << "rocsparseSolver::get_result(): " << t.stop() << " s";
OpmLog::info(out.str());
}
} // end get_result()
template <unsigned int block_size>
SolverStatus rocsparseSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
double *b,
std::shared_ptr<BlockedMatrix> jacMatrix,
WellContributions& wellContribs,
BdaResult &res)
{
if (initialized == false) {
initialize(matrix, jacMatrix);
copy_system_to_gpu(b);
if (analysis_done == false) {
if (!analyze_matrix()) {
return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
}
}
if (!create_preconditioner()) {
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
}
} else {
update_system_on_gpu(b);
if (!create_preconditioner()) {
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
}
}
solve_system(wellContribs, res);
return SolverStatus::BDA_SOLVER_SUCCESS;
}
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template rocsparseSolverBackend<n>::rocsparseSolverBackend( \
int, int, double, unsigned int, unsigned int);
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace Accelerator
} // namespace Opm

View File

@ -0,0 +1,158 @@
/*
Copyright 2023 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_ROCSPARSESOLVER_BACKEND_HEADER_INCLUDED
#define OPM_ROCSPARSESOLVER_BACKEND_HEADER_INCLUDED
#include <memory>
#include <opm/simulators/linalg/bda/opencl/opencl.hpp>
#include <opm/simulators/linalg/bda/BdaResult.hpp>
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
#include <opm/simulators/linalg/bda/WellContributions.hpp>
#include <rocblas/rocblas.h>
#include <rocsparse/rocsparse.h>
#include <hip/hip_version.h>
namespace Opm
{
namespace Accelerator
{
/// This class implements a rocsparse-based ilu0-bicgstab solver on GPU
template <unsigned int block_size>
class rocsparseSolverBackend : public BdaSolver<block_size>
{
typedef BdaSolver<block_size> Base;
using Base::N;
using Base::Nb;
using Base::nnz;
using Base::nnzb;
using Base::verbosity;
using Base::platformID;
using Base::deviceID;
using Base::maxit;
using Base::tolerance;
using Base::initialized;
private:
bool useJacMatrix = false;
bool analysis_done = false;
std::shared_ptr<BlockedMatrix> mat = nullptr; // original matrix
std::shared_ptr<BlockedMatrix> jacMat = nullptr; // matrix for preconditioner
int nnzbs_prec = 0; // number of nnz blocks in preconditioner matrix M
rocsparse_direction dir = rocsparse_direction_row;
rocsparse_operation operation = rocsparse_operation_none;
rocsparse_handle handle;
rocblas_handle blas_handle;
rocsparse_mat_descr descr_A, descr_M, descr_L, descr_U;
rocsparse_mat_info ilu_info;
#if HIP_VERSION >= 50400000
rocsparse_mat_info spmv_info;
#endif
hipStream_t stream;
rocsparse_int *d_Arows, *d_Mrows;
rocsparse_int *d_Acols, *d_Mcols;
double *d_Avals, *d_Mvals;
double *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
double *d_pw, *d_s, *d_t, *d_v;
void *d_buffer; // buffer space, used by rocsparse ilu0 analysis
int ver;
char rev[64];
/// Solve linear system using ilu0-bicgstab
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
/// Initialize GPU and allocate memory
/// \param[in] matrix matrix A
/// \param[in] jacMatrix matrix for preconditioner
void initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix);
/// Copy linear system to GPU
/// \param[in] b input vector, contains N values
void copy_system_to_gpu(double *b);
/// Update linear system to GPU
/// \param[in] b input vector, contains N values
void update_system_on_gpu(double *b);
/// Analyze sparsity pattern to extract parallelism
/// \return true iff analysis was successful
bool analyze_matrix();
/// Perform ilu0-decomposition
/// \return true iff decomposition was successful
bool create_preconditioner();
/// Solve linear system
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
void solve_system(WellContributions &wellContribs, BdaResult &res);
public:
/// Construct a openclSolver
/// \param[in] linear_solver_verbosity verbosity of openclSolver
/// \param[in] maxit maximum number of iterations for openclSolver
/// \param[in] tolerance required relative tolerance for openclSolver
/// \param[in] platformID the OpenCL platform to be used
/// \param[in] deviceID the device to be used
rocsparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID);
/// For the CPR coarse solver
// rocsparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder);
/// Destroy a openclSolver, and free memory
~rocsparseSolverBackend();
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
/// \param[in] matrix matrix A
/// \param[in] b input vector, contains N values
/// \param[in] jacMatrix matrix for preconditioner
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
/// \return status code
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
/// Solve scalar linear system, for example a coarse system of an AMG preconditioner
/// Data is already on the GPU
// SolverStatus solve_system(BdaResult &res);
/// Get result after linear solve, and peform postprocessing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void get_result(double *x) override;
}; // end class rocsparseSolverBackend
} // namespace Accelerator
} // namespace Opm
#endif

View File

@ -0,0 +1,217 @@
/*
Copyright 2019 SINTEF Digital, Mathematics and Cybernetics.
Copyright 2023 Equinor
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <stdexcept>
#include <fstream>
#include <memory>
#define BOOST_TEST_MODULE OPM_test_rocsparseSolver
#include <boost/test/unit_test.hpp>
#include <opm/simulators/linalg/bda/BdaBridge.hpp>
#include <opm/simulators/linalg/bda/WellContributions.hpp>
#include <dune/common/fvector.hh>
#include <dune/istl/bvector.hh>
#include <dune/istl/bcrsmatrix.hh>
#include <dune/istl/matrixmarket.hh>
#include <dune/istl/solvers.hh>
#include <dune/istl/preconditioners.hh>
#include <boost/property_tree/json_parser.hpp>
#include <boost/property_tree/ptree.hpp>
class HIPInitException : public std::logic_error
{
public:
HIPInitException(std::string msg) : logic_error(msg){};
};
template <int bz>
using Matrix = Dune::BCRSMatrix<Dune::FieldMatrix<double, bz, bz>>;
template <int bz>
using Vector = Dune::BlockVector<Dune::FieldVector<double, bz>>;
template <int bz>
void readLinearSystem(const std::string& matrix_filename, const std::string& rhs_filename, Matrix<bz>& matrix, Vector<bz>& rhs)
{
{
std::ifstream mfile(matrix_filename);
if (!mfile) {
throw std::runtime_error("Could not read matrix file");
}
readMatrixMarket(matrix, mfile);
}
{
std::ifstream rhsfile(rhs_filename);
if (!rhsfile) {
throw std::runtime_error("Could not read rhs file");
}
readMatrixMarket(rhs, rhsfile);
}
}
template <int bz>
Dune::BlockVector<Dune::FieldVector<double, bz>>
getDuneSolution(Matrix<bz>& matrix, Vector<bz>& rhs)
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
typedef Dune::MatrixAdapter<Matrix<bz>,Vector<bz>,Vector<bz> > Operator;
Operator fop(matrix);
double relaxation = 0.9;
Dune::SeqILU<Matrix<bz>,Vector<bz>,Vector<bz> > prec(matrix, relaxation);
double reduction = 1e-2;
int maxit = 10;
int verbosity = 0;
Dune::BiCGSTABSolver<Vector<bz> > solver(fop, prec, reduction, maxit, verbosity);
solver.apply(x, rhs, result);
return x;
}
template <int bz>
void
createBridge(const boost::property_tree::ptree& prm, std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >& bridge)
{
const int linear_solver_verbosity = prm.get<int>("verbosity");
const int maxit = prm.get<int>("maxiter");
const double tolerance = prm.get<double>("tol");
const bool opencl_ilu_parallel(true);
const int platformID = 0;
const int deviceID = 0;
const std::string accelerator_mode("rocsparse");
const std::string linsolver("ilu0");
try {
bridge = std::make_unique<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >(accelerator_mode,
linear_solver_verbosity,
maxit,
tolerance,
platformID,
deviceID,
opencl_ilu_parallel,
linsolver);
} catch (const std::logic_error& error) {
BOOST_WARN_MESSAGE(true, error.what());
if (strstr(error.what(), "HIP Error: could not get device") != nullptr)
throw HIPInitException(error.what());
else
throw error;
}
}
template <int bz>
Dune::BlockVector<Dune::FieldVector<double, bz>>
testRocsparseSolver(std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >& bridge, Matrix<bz>& matrix, Vector<bz>& rhs)
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("rocsparse", true);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory
bridge->solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result);
bridge->get_result(x);
return x;
}
template <int bz>
Dune::BlockVector<Dune::FieldVector<double, bz>>
testRocsparseSolverJacobi(std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >& bridge, Matrix<bz>& matrix, Vector<bz>& rhs)
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("rocsparse", true);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory
auto mat3 = matrix; // another deep copy, to make sure Jacobi matrix memory is different
// the sparsity pattern and values are actually the same
bridge->solve_system(&mat2, &mat3, /*numJacobiBlocks=*/2, rhs, *wellContribs, result);
bridge->get_result(x);
return x;
}
namespace pt = boost::property_tree;
void test3(const pt::ptree& prm)
{
const int bz = 3;
Matrix<bz> matrix;
Vector<bz> rhs;
readLinearSystem("matr33.txt", "rhs3.txt", matrix, rhs);
Vector<bz> rhs2 = rhs; // deep copy, getDuneSolution() changes values in rhs vector
auto duneSolution = getDuneSolution<bz>(matrix, rhs);
// create bridge twice, because rocsparseSolver allocates memory for
// the jacobi matrix if passed, during the first solve_system() call
// if not present, no memory is allocated, and subsequent calls
// with a jacobi matrix will cause nans
{
std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> > bridge;
createBridge(prm, bridge); // create bridge with rocsparseSolver
// test rocsparseSolver without Jacobi matrix
auto sol = testRocsparseSolver<bz>(bridge, matrix, rhs2);
BOOST_REQUIRE_EQUAL(sol.size(), duneSolution.size());
for (size_t i = 0; i < sol.size(); ++i) {
for (int row = 0; row < bz; ++row) {
BOOST_CHECK_CLOSE(sol[i][row], duneSolution[i][row], 1e-3);
}
}
}
{
std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> > bridge;
createBridge(prm, bridge); // create bridge with rocsparseSolver
// test rocsparseSolver with Jacobi matrix
auto solJacobi = testRocsparseSolverJacobi<bz>(bridge, matrix, rhs2);
BOOST_REQUIRE_EQUAL(solJacobi.size(), duneSolution.size());
for (size_t i = 0; i < solJacobi.size(); ++i) {
for (int row = 0; row < bz; ++row) {
BOOST_CHECK_CLOSE(solJacobi[i][row], duneSolution[i][row], 1e-3);
}
}
}
}
BOOST_AUTO_TEST_CASE(TestRocsparseSolver)
{
pt::ptree prm;
// Read parameters.
{
std::ifstream file("options_flexiblesolver.json");
pt::read_json(file, prm);
}
try {
// Test with 3x3 block solvers.
test3(prm);
} catch(const HIPInitException& ) {
BOOST_WARN_MESSAGE(true, "Problem with initializing HIP, skipping test");
}
}