refactor rocsparseSolverBackend to allow flexible preconditioner

This commit is contained in:
Razvan Nane 2024-06-04 12:17:49 +02:00
parent 071f009bf3
commit 3eb87b7b04
8 changed files with 650 additions and 157 deletions

View File

@ -264,6 +264,8 @@ if(USE_BDA_BRIDGE)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocalutionSolverBackend.cpp)
endif()
if(rocsparse_FOUND AND rocblas_FOUND)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseBILU0.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseWellContributions.cpp)
endif()
@ -668,6 +670,8 @@ if (USE_BDA_BRIDGE)
opm/simulators/linalg/bda/Matrix.hpp
opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
opm/simulators/linalg/bda/rocm/rocalutionSolverBackend.hpp
opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp
opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp
opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.hpp
opm/simulators/linalg/bda/rocm/rocsparseWellContributions.hpp
opm/simulators/linalg/bda/WellContributions.hpp

View File

@ -115,7 +115,7 @@ BdaBridge(std::string accelerator_mode_,
use_gpu = true; // should be replaced by a 'use_bridge' boolean
using ROCS = Accelerator::rocsparseSolverBackend<Scalar,block_size>;
backend = std::make_unique<ROCS>(linear_solver_verbosity, maxit,
tolerance, platformID, deviceID);
tolerance, platformID, deviceID, linsolver);
#else
OPM_THROW(std::logic_error, "Error rocsparseSolver was chosen, but rocsparse/rocblas was not found by CMake");
#endif

View File

@ -0,0 +1,317 @@
/*
Copyright 2024 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <algorithm>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <dune/common/timer.hh>
#include <opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp>
#include <opm/simulators/linalg/bda/Reorder.hpp>
#include <sstream>
#include <iostream> //Razvan
#include <hip/hip_runtime_api.h>
#define HIP_CHECK(STAT) \
do { \
const hipError_t stat = (STAT); \
if(stat != hipSuccess) \
{ \
std::ostringstream oss; \
oss << "rocsparseBILU0::hip "; \
oss << "error: " << hipGetErrorString(stat); \
OPM_THROW(std::logic_error, oss.str()); \
} \
} while(0)
#define ROCSPARSE_CHECK(STAT) \
do { \
const rocsparse_status stat = (STAT); \
if(stat != rocsparse_status_success) \
{ \
std::ostringstream oss; \
oss << "rocsparseBILU0::rocsparse "; \
oss << "error: " << stat; \
OPM_THROW(std::logic_error, oss.str()); \
} \
} while(0)
#define ROCBLAS_CHECK(STAT) \
do { \
const rocblas_status stat = (STAT); \
if(stat != rocblas_status_success) \
{ \
std::ostringstream oss; \
oss << "rocsparseBILU0::rocblas "; \
oss << "error: " << stat; \
OPM_THROW(std::logic_error, oss.str()); \
} \
} while(0)
#if HAVE_OPENMP
#include <thread>
#include <omp.h>
extern std::shared_ptr<std::thread> copyThread;
#endif //HAVE_OPENMP
namespace Opm::Accelerator {
using Opm::OpmLog;
using Dune::Timer;
template <class Scalar, unsigned int block_size>
rocsparseBILU0<Scalar, block_size>::
rocsparseBILU0(int verbosity_) :
Base(verbosity_)
{
}
template <class Scalar, unsigned int block_size>
bool rocsparseBILU0<Scalar, block_size>::
initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix, std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix, rocsparse_int *d_Arows, rocsparse_int *d_Acols) {
this->Nb = matrix->Nb;
this->N = Nb * block_size;
this->nnzb = matrix->nnzbs;
this->nnz = nnzb * block_size * block_size;
this->nnzbs_prec = this->nnzb;
if (jacMatrix) {
this->useJacMatrix = true;
this->nnzbs_prec = jacMatrix->nnzbs;
this->jacMat = jacMatrix;
}
HIP_CHECK(hipMalloc((void**)&d_t, sizeof(double) * this->N));
if (this->useJacMatrix) {
HIP_CHECK(hipMalloc((void**)&d_Mrows, sizeof(rocsparse_int) * (Nb + 1)));
HIP_CHECK(hipMalloc((void**)&d_Mcols, sizeof(rocsparse_int) * this->nnzbs_prec));
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(double) * this->nnzbs_prec * block_size * block_size));
} else { // preconditioner matrix is same
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(double) * this->nnzbs_prec * block_size * block_size));
d_Mcols = d_Acols;
d_Mrows = d_Arows;
}
return true;
} // end initialize()
template <class Scalar, unsigned int block_size>
bool rocsparseBILU0<Scalar, block_size>::
analyze_matrix(BlockedMatrix<Scalar> *mat) {
return analyze_matrix(mat, &(*this->jacMat));
}
template <class Scalar, unsigned int block_size>
bool rocsparseBILU0<Scalar, block_size>::
analyze_matrix(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat) {
std::size_t d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
Timer t;
#if HIP_VERSION >= 50400000
ROCSPARSE_CHECK(rocsparse_create_mat_info(&spmv_info));
#endif
ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_M));
ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_L));
ROCSPARSE_CHECK(rocsparse_set_mat_fill_mode(descr_L, rocsparse_fill_mode_lower));
ROCSPARSE_CHECK(rocsparse_set_mat_diag_type(descr_L, rocsparse_diag_type_unit));
ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_U));
ROCSPARSE_CHECK(rocsparse_set_mat_fill_mode(descr_U, rocsparse_fill_mode_upper));
ROCSPARSE_CHECK(rocsparse_set_mat_diag_type(descr_U, rocsparse_diag_type_non_unit));
ROCSPARSE_CHECK(rocsparse_dbsrilu0_buffer_size(this->handle, this->dir, Nb, this->nnzbs_prec, descr_M, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_M));
ROCSPARSE_CHECK(rocsparse_dbsrsv_buffer_size(this->handle, this->dir, this->operation, Nb, this->nnzbs_prec,
descr_L, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_L));
ROCSPARSE_CHECK(rocsparse_dbsrsv_buffer_size(this->handle, this->dir, this->operation, Nb, this->nnzbs_prec,
descr_U, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_U));
d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
HIP_CHECK(hipMalloc((void**)&d_buffer, d_bufferSize));
// analysis of ilu LU decomposition
ROCSPARSE_CHECK(rocsparse_dbsrilu0_analysis(this->handle, this->dir, \
Nb, this->nnzbs_prec, descr_M, d_Mvals, d_Mrows, d_Mcols, \
block_size, ilu_info, rocsparse_analysis_policy_reuse, rocsparse_solve_policy_auto, d_buffer));
int zero_position = 0;
rocsparse_status status = rocsparse_bsrilu0_zero_pivot(this->handle, ilu_info, &zero_position);
if (rocsparse_status_success != status) {
printf("L has structural and/or numerical zero at L(%d,%d)\n", zero_position, zero_position);
return false;
}
// analysis of ilu apply
ROCSPARSE_CHECK(rocsparse_dbsrsv_analysis(this->handle, this->dir, this->operation, \
Nb, this->nnzbs_prec, descr_L, d_Mvals, d_Mrows, d_Mcols, \
block_size, ilu_info, rocsparse_analysis_policy_reuse, rocsparse_solve_policy_auto, d_buffer));
ROCSPARSE_CHECK(rocsparse_dbsrsv_analysis(this->handle, this->dir, this->operation, \
Nb, this->nnzbs_prec, descr_U, d_Mvals, d_Mrows, d_Mcols, \
block_size, ilu_info, rocsparse_analysis_policy_reuse, rocsparse_solve_policy_auto, d_buffer));
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(this->stream));
std::ostringstream out;
out << "rocsparseBILU0::analyze_matrix(): " << t.stop() << " s";
OpmLog::info(out.str());
}
return true;
}
template <class Scalar, unsigned int block_size>
bool rocsparseBILU0<Scalar, block_size>::
create_preconditioner(BlockedMatrix<Scalar> *mat) {
return create_preconditioner(mat, &*this->jacMat);
}
template <class Scalar, unsigned int block_size>
bool rocsparseBILU0<Scalar, block_size>::
create_preconditioner(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat) {
Timer t;
bool result = true;
ROCSPARSE_CHECK(rocsparse_dbsrilu0(this->handle, this->dir, Nb, this->nnzbs_prec, descr_M, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, rocsparse_solve_policy_auto, d_buffer));
// Check for zero pivot
int zero_position = 0;
rocsparse_status status = rocsparse_bsrilu0_zero_pivot(this->handle, ilu_info, &zero_position);
if(rocsparse_status_success != status)
{
printf("L has structural and/or numerical zero at L(%d,%d)\n", zero_position, zero_position);
return false;
}
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(this->stream));
std::ostringstream out;
out << "rocsparseBILU0::create_preconditioner(): " << t.stop() << " s";
OpmLog::info(out.str());
}
return result;
} // end create_preconditioner()
template <class Scalar, unsigned int block_size>
void rocsparseBILU0<Scalar, block_size>::
copy_system_to_gpu(double *d_Avals) {
Timer t;
if (this->useJacMatrix) {
#if HAVE_OPENMP
if (omp_get_max_threads() > 1) {
copyThread->join();
}
#endif
HIP_CHECK(hipMemcpyAsync(d_Mrows, this->jacMat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, this->stream));
HIP_CHECK(hipMemcpyAsync(d_Mcols, this->jacMat->colIndices, sizeof(rocsparse_int) * this->nnzbs_prec, hipMemcpyHostToDevice, this->stream));
HIP_CHECK(hipMemcpyAsync(d_Mvals, this->jacMat->nnzValues, sizeof(double) * this->nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, this->stream));
} else {
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(double) * nnz, hipMemcpyDeviceToDevice, this->stream));
}
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(this->stream));
std::ostringstream out;
out << "rocsparseBILU0::copy_system_to_gpu(): " << t.stop() << " s";
OpmLog::info(out.str());
}
} // end copy_system_to_gpu()
// don't copy rowpointers and colindices, they stay the same
template <class Scalar, unsigned int block_size>
void rocsparseBILU0<Scalar, block_size>::
update_system_on_gpu(double *d_Avals) {
Timer t;
if (this->useJacMatrix) {
#if HAVE_OPENMP
if (omp_get_max_threads() > 1) {
copyThread->join();
}
#endif
HIP_CHECK(hipMemcpyAsync(d_Mvals, this->jacMat->nnzValues, sizeof(double) * this->nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, this->stream));
} else {
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(double) * nnz, hipMemcpyDeviceToDevice, this->stream));
}
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(this->stream));
std::ostringstream out;
out << "rocsparseSolver::update_system_on_gpu(): " << t.stop() << " s";
OpmLog::info(out.str());
}
} // end update_system_on_gpu()
template <class Scalar, unsigned int block_size>
void rocsparseBILU0<Scalar, block_size>::
apply(double& y, double& x) {
double zero = 0.0;
double one = 1.0;
Timer t_apply;
ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(this->handle, this->dir, \
this->operation, Nb, this->nnzbs_prec, &one, \
descr_L, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &y, d_t, rocsparse_solve_policy_auto, d_buffer));
ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(this->handle, this->dir, \
this->operation, Nb, this->nnzbs_prec, &one, \
descr_U, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, d_t, &x, rocsparse_solve_policy_auto, d_buffer));
if (verbosity >= 3) {
std::ostringstream out;
HIP_CHECK(hipStreamSynchronize(this->stream));
out << "rocsparseBILU0 apply: " << t_apply.stop() << " s";
OpmLog::info(out.str());
}
}
#define INSTANCE_TYPE(T) \
template class rocsparseBILU0<T,1>; \
template class rocsparseBILU0<T,2>; \
template class rocsparseBILU0<T,3>; \
template class rocsparseBILU0<T,4>; \
template class rocsparseBILU0<T,5>; \
template class rocsparseBILU0<T,6>;
INSTANCE_TYPE(double)
// #define INSTANTIATE_BDA_FUNCTIONS(n) \
// template class rocsparseBILU0<n>;
//
// INSTANTIATE_BDA_FUNCTIONS(1);
// INSTANTIATE_BDA_FUNCTIONS(2);
// INSTANTIATE_BDA_FUNCTIONS(3);
// INSTANTIATE_BDA_FUNCTIONS(4);
// INSTANTIATE_BDA_FUNCTIONS(5);
// INSTANTIATE_BDA_FUNCTIONS(6);
//
// #undef INSTANTIATE_BDA_FUNCTIONS
//
// } // namespace Accelerator
} // namespace Opm

View File

@ -0,0 +1,97 @@
/*
Copyright 2024 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_ROCSPARSEBILU0_HPP
#define OPM_ROCSPARSEBILU0_HPP
#include <mutex>
#include <vector>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
#include <opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp>
#include <rocblas/rocblas.h>
#include <rocsparse/rocsparse.h>
#include <hip/hip_version.h>
namespace Opm::Accelerator {
/// This class implements a Blocked ILU0 preconditioner
/// The decomposition is done on GPU, using exact decomposition, or ChowPatel decomposition
/// The preconditioner is applied via two exact triangular solves
template <class Scalar, unsigned int block_size>
class rocsparseBILU0 : public rocsparsePreconditioner<Scalar, block_size>
{
typedef rocsparsePreconditioner<Scalar, block_size> Base;
using Base::N;
using Base::Nb;
using Base::nnz;
using Base::nnzb;
using Base::verbosity;
private:
rocsparse_mat_descr descr_M, descr_L, descr_U;
rocsparse_mat_info ilu_info;
#if HIP_VERSION >= 50400000
rocsparse_mat_info spmv_info;
#endif
rocsparse_int *d_Mrows, *d_Mcols;
double *d_Mvals, *d_t;
void *d_buffer; // buffer space, used by rocsparse ilu0 analysis
public:
rocsparseBILU0(int verbosity_);
/// Initialize GPU and allocate memory
/// \param[in] matrix matrix A
/// \param[in] jacMatrix matrix for preconditioner
bool initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix, std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix, rocsparse_int *d_Arows, rocsparse_int *d_Acols);
// analysis, extract parallelism if specified
bool analyze_matrix(BlockedMatrix<Scalar> *mat);
bool analyze_matrix(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat);
// ilu_decomposition
bool create_preconditioner(BlockedMatrix<Scalar> *mat) override;
bool create_preconditioner(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat) override;
// apply preconditioner, x = prec(y)
// via Lz = y
// and Ux = z
void apply(double& y, double& x) override;
#if HAVE_OPENCL
// apply preconditioner, x = prec(y)
void apply(const cl::Buffer& y, cl::Buffer& x) {}
#endif
void copy_system_to_gpu(double *mVals);
void update_system(double *vals, double *b);
void update_system_on_gpu(double *b);
};
} // namespace Opm
#endif

View File

@ -0,0 +1,76 @@
/*
Copyright 2024 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <memory>
#include <opm/common/TimingMacros.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp>
// #include <opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp>
#include <opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp>
namespace Opm::Accelerator {
template <class Scalar, unsigned int block_size>
std::unique_ptr<rocsparsePreconditioner<Scalar,block_size> > rocsparsePreconditioner<Scalar,block_size>::
create(PreconditionerType type, int verbosity) {
if (type == PreconditionerType::BILU0) {
return std::make_unique<Opm::Accelerator::rocsparseBILU0<Scalar, block_size> >(verbosity);
// } else if (type == PreconditionerType::CPR) {
// return std::make_unique<Opm::Accelerator::rocsparseCPR<block_size> >(verbosity);
} else {
OPM_THROW(std::logic_error, "Invalid PreconditionerType");
}
}
template <class Scalar, unsigned int block_size>
void rocsparsePreconditioner<Scalar, block_size>::
set_matrix_analysis(rocsparse_mat_descr descr_L, rocsparse_mat_descr descr_U) {
descr_L = descr_L;
descr_U = descr_U;
}
template <class Scalar, unsigned int block_size>
void rocsparsePreconditioner<Scalar, block_size>::
set_context(rocsparse_handle handle, rocsparse_direction dir, rocsparse_operation operation, hipStream_t stream) {
this->handle = handle;
this->dir = dir;
this->operation = operation;
this->stream = stream;
}
template <class Scalar, unsigned int block_size>
void rocsparsePreconditioner<Scalar, block_size>::
setJacMat(BlockedMatrix<Scalar> jacMat) {
this->jacMat = std::make_shared<BlockedMatrix<Scalar>>(jacMat);
}
#define INSTANTIATE_TYPE(T) \
template class rocsparsePreconditioner<T,1>; \
template class rocsparsePreconditioner<T,2>; \
template class rocsparsePreconditioner<T,3>; \
template class rocsparsePreconditioner<T,4>; \
template class rocsparsePreconditioner<T,5>; \
template class rocsparsePreconditioner<T,6>;
INSTANTIATE_TYPE(double)
} //namespace Opm

View File

@ -0,0 +1,79 @@
/*
Copyright 2024 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_ROCSPARSEPRECONDITIONER_HEADER_INCLUDED
#define OPM_ROCSPARSEPRECONDITIONER_HEADER_INCLUDED
#include <opm/simulators/linalg/bda/Preconditioner.hpp>
#include <rocsparse/rocsparse.h>
namespace Opm::Accelerator {
template<class Scalar> class BlockedMatrix;
template <class Scalar, unsigned int block_size>
class rocsparsePreconditioner : public Preconditioner<Scalar, block_size>
{
protected:
rocsparse_handle handle;
rocsparse_direction dir = rocsparse_direction_row;
rocsparse_operation operation = rocsparse_operation_none;
rocsparse_mat_descr descr_L, descr_U;
hipStream_t stream;
rocsparsePreconditioner(int verbosity_) :
Preconditioner<Scalar, block_size>(verbosity_)
{};
public:
int nnzbs_prec = 0; // number of nnz blocks in preconditioner matrix M
bool useJacMatrix = false;
std::shared_ptr<BlockedMatrix<Scalar>> jacMat{}; // matrix for preconditioner
virtual ~rocsparsePreconditioner() = default;
static std::unique_ptr<rocsparsePreconditioner<Scalar, block_size>> create(PreconditionerType type, int verbosity);
// apply preconditioner, x = prec(y)
virtual void apply(double& y, double& x) = 0;
// create/update preconditioner, probably used every linear solve
// the version with two params can be overloaded, if not, it will default to using the one param version
virtual bool create_preconditioner(BlockedMatrix<Scalar> *mat) = 0;
virtual bool create_preconditioner(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat) = 0;
virtual bool initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix, std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix, rocsparse_int *d_Arows, rocsparse_int *d_Acols) = 0;
virtual void copy_system_to_gpu(double *b)=0;
/// Update linear system to GPU
/// \param[in] b input vector, contains N values
virtual void update_system_on_gpu(double *b)=0;
void set_matrix_analysis(rocsparse_mat_descr descr_L, rocsparse_mat_descr descr_U);
void set_context(rocsparse_handle handle, rocsparse_direction dir, rocsparse_operation operation, hipStream_t stream);
void setJacMat(BlockedMatrix<Scalar> jacMat);
};
} //namespace Opm
#endif

View File

@ -41,6 +41,8 @@
#include <opm/simulators/linalg/bda/BdaResult.hpp>
#include <opm/simulators/linalg/bda/Preconditioner.hpp>
#include <hip/hip_runtime_api.h>
#include <hip/hip_version.h>
@ -87,12 +89,6 @@
#include <cstddef>
#if HAVE_OPENMP
#include <thread>
#include <omp.h>
extern std::shared_ptr<std::thread> copyThread;
#endif //HAVE_OPENMP
namespace Opm::Accelerator {
using Dune::Timer;
@ -100,10 +96,26 @@ using Dune::Timer;
template<class Scalar, unsigned int block_size>
rocsparseSolverBackend<Scalar,block_size>::
rocsparseSolverBackend(int verbosity_, int maxit_, Scalar tolerance_,
unsigned int platformID_, unsigned int deviceID_)
unsigned int platformID_, unsigned int deviceID_, std::string linsolver)
: Base(verbosity_, maxit_, tolerance_, platformID_, deviceID_)
{
int numDevices = 0;
bool use_cpr, use_isai;
if (linsolver.compare("ilu0") == 0) {
use_cpr = false;
use_isai = false;
} else if (linsolver.compare("cpr_quasiimpes") == 0) {
use_cpr = true;
use_isai = false;
} else if (linsolver.compare("isai") == 0) {
OPM_THROW(std::logic_error, "Error rocsparseSolver does not support --linerar-solver=isai");
} else if (linsolver.compare("cpr_trueimpes") == 0) {
OPM_THROW(std::logic_error, "Error rocsparseSolver does not support --linerar-solver=cpr_trueimpes");
} else {
OPM_THROW(std::logic_error, "Error unknown value for argument --linear-solver, " + linsolver);
}
HIP_CHECK(hipGetDeviceCount(&numDevices));
if (static_cast<int>(deviceID) >= numDevices) {
OPM_THROW(std::runtime_error, "Invalid HIP device ID");
@ -124,6 +136,15 @@ rocsparseSolverBackend(int verbosity_, int maxit_, Scalar tolerance_,
HIP_CHECK(hipStreamCreate(&stream));
ROCSPARSE_CHECK(rocsparse_set_stream(handle, stream));
ROCBLAS_CHECK(rocblas_set_stream(blas_handle, stream));
using PreconditionerType = typename Opm::Accelerator::PreconditionerType;
// if (use_cpr) {
// prec = rocsparsePreconditioner<block_size>::create(PreconditionerType::CPR, verbosity);
// } else {
prec = rocsparsePreconditioner<Scalar, block_size>::create(PreconditionerType::BILU0, verbosity);
// }
prec->set_context(handle, dir, operation, stream);
}
template<class Scalar, unsigned int block_size>
@ -170,17 +191,17 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<Scalar>& wellContribs,
// HIP_VERSION is defined as (HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH)
#if HIP_VERSION >= 60000000
ROCSPARSE_CHECK(rocsparse_dbsrmv(handle, dir, operation,
Nb, Nb, nnzb, &one, descr_M,
Nb, Nb, nnzb, &one, descr_A,
d_Avals, d_Arows, d_Acols, block_size,
spmv_info, d_x, &zero, d_r));
#elif HIP_VERSION >= 50400000
ROCSPARSE_CHECK(rocsparse_dbsrmv_ex(handle, dir, operation,
Nb, Nb, nnzb, &one, descr_M,
Nb, Nb, nnzb, &one, descr_A,
d_Avals, d_Arows, d_Acols, block_size,
spmv_info, d_x, &zero, d_r));
#else
ROCSPARSE_CHECK(rocsparse_dbsrmv(handle, dir, operation,
Nb, Nb, nnzb, &one, descr_M,
Nb, Nb, nnzb, &one, descr_A,
d_Avals, d_Arows, d_Acols, block_size,
d_x, &zero, d_r));
#endif
@ -216,13 +237,9 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<Scalar>& wellContribs,
t_prec.start();
}
// apply ilu0
ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(handle, dir, \
operation, Nb, nnzbs_prec, &one, \
descr_L, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, d_p, d_t, rocsparse_solve_policy_auto, d_buffer));
ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(handle, dir, \
operation, Nb, nnzbs_prec, &one, \
descr_U, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, d_t, d_pw, rocsparse_solve_policy_auto, d_buffer));
// apply ilu0
prec->apply(*d_p, *d_pw);
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
t_prec.stop();
@ -232,17 +249,17 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<Scalar>& wellContribs,
// spmv
#if HIP_VERSION >= 60000000
ROCSPARSE_CHECK(rocsparse_dbsrmv(handle, dir, operation,
Nb, Nb, nnzb, &one, descr_M,
Nb, Nb, nnzb, &one, descr_A,
d_Avals, d_Arows, d_Acols, block_size,
spmv_info, d_pw, &zero, d_v));
#elif HIP_VERSION >= 50400000
ROCSPARSE_CHECK(rocsparse_dbsrmv_ex(handle, dir, operation,
Nb, Nb, nnzb, &one, descr_M,
Nb, Nb, nnzb, &one, descr_A,
d_Avals, d_Arows, d_Acols, block_size,
spmv_info, d_pw, &zero, d_v));
#else
ROCSPARSE_CHECK(rocsparse_dbsrmv(handle, dir, operation,
Nb, Nb, nnzb, &one, descr_M,
Nb, Nb, nnzb, &one, descr_A,
d_Avals, d_Arows, d_Acols, block_size,
d_pw, &zero, d_v));
#endif
@ -283,12 +300,9 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<Scalar>& wellContribs,
if (verbosity >= 3) {
t_prec.start();
}
ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(handle, dir, \
operation, Nb, nnzbs_prec, &one, \
descr_L, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, d_r, d_t, rocsparse_solve_policy_auto, d_buffer));
ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(handle, dir, \
operation, Nb, nnzbs_prec, &one, \
descr_U, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, d_t, d_s, rocsparse_solve_policy_auto, d_buffer));
prec->apply(*d_r, *d_s);
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
t_prec.stop();
@ -298,17 +312,17 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<Scalar>& wellContribs,
// spmv
#if HIP_VERSION >= 60000000
ROCSPARSE_CHECK(rocsparse_dbsrmv(handle, dir, operation,
Nb, Nb, nnzb, &one, descr_M,
Nb, Nb, nnzb, &one, descr_A,
d_Avals, d_Arows, d_Acols, block_size,
spmv_info, d_s, &zero, d_t));
#elif HIP_VERSION >= 50400000
ROCSPARSE_CHECK(rocsparse_dbsrmv_ex(handle, dir, operation,
Nb, Nb, nnzb, &one, descr_M,
Nb, Nb, nnzb, &one, descr_A,
d_Avals, d_Arows, d_Acols, block_size,
spmv_info, d_s, &zero, d_t));
#else
ROCSPARSE_CHECK(rocsparse_dbsrmv(handle, dir, operation,
Nb, Nb, nnzb, &one, descr_M,
Nb, Nb, nnzb, &one, descr_A,
d_Avals, d_Arows, d_Acols, block_size,
d_s, &zero, d_t));
#endif
@ -384,14 +398,18 @@ initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
{
this->Nb = matrix->Nb;
this->N = Nb * block_size;
this->N = this->Nb * block_size;
this->nnzb = matrix->nnzbs;
this->nnz = nnzb * block_size * block_size;
nnzbs_prec = nnzb;
this->nnz = this->nnzb * block_size * block_size;
if (jacMatrix) {
useJacMatrix = true;
nnzbs_prec = jacMatrix->nnzbs;
prec->useJacMatrix = true;
prec->jacMat = jacMatrix;
prec->nnzbs_prec = jacMatrix->nnzbs;
} else {
prec->useJacMatrix = false;
prec->jacMat = matrix;
prec->nnzbs_prec = this->nnzb;
}
std::ostringstream out;
@ -408,7 +426,6 @@ initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
out.clear();
mat = matrix;
jacMat = jacMatrix;
HIP_CHECK(hipMalloc((void**)&d_r, sizeof(Scalar) * N));
HIP_CHECK(hipMalloc((void**)&d_rw, sizeof(Scalar) * N));
@ -424,15 +441,7 @@ initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
HIP_CHECK(hipMalloc((void**)&d_x, sizeof(Scalar) * N));
HIP_CHECK(hipMalloc((void**)&d_b, sizeof(Scalar) * N));
if (useJacMatrix) {
HIP_CHECK(hipMalloc((void**)&d_Mrows, sizeof(rocsparse_int) * (Nb + 1)));
HIP_CHECK(hipMalloc((void**)&d_Mcols, sizeof(rocsparse_int) * nnzbs_prec));
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(Scalar) * nnzbs_prec * block_size * block_size));
} else { // preconditioner matrix is same
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(Scalar) * nnzbs_prec * block_size * block_size));
d_Mcols = d_Acols;
d_Mrows = d_Arows;
}
prec->initialize(matrix, jacMatrix, d_Arows, d_Acols);//TODO: do we need all parameters?
initialized = true;
} // end initialize()
@ -456,25 +465,7 @@ copy_system_to_gpu(Scalar *b)
HIP_CHECK(hipMemcpyAsync(d_b, b, N * sizeof(Scalar) * N,
hipMemcpyHostToDevice, stream));
if (useJacMatrix) {
#if HAVE_OPENMP
if (omp_get_max_threads() > 1) {
copyThread->join();
}
#endif
HIP_CHECK(hipMemcpyAsync(d_Mrows, jacMat->rowPointers,
sizeof(rocsparse_int) * (Nb + 1),
hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Mcols, jacMat->colIndices,
sizeof(rocsparse_int) * nnzbs_prec,
hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues,
sizeof(Scalar) * nnzbs_prec * block_size * block_size,
hipMemcpyHostToDevice, stream));
} else {
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals,
sizeof(Scalar) * nnz, hipMemcpyDeviceToDevice, stream));
}
prec->copy_system_to_gpu(d_Avals);
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
@ -500,19 +491,8 @@ update_system_on_gpu(Scalar* b)
HIP_CHECK(hipMemcpyAsync(d_b, b, N* sizeof(Scalar),
hipMemcpyHostToDevice, stream));
if (useJacMatrix) {
#if HAVE_OPENMP
if (omp_get_max_threads() > 1) {
copyThread->join();
}
#endif
HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues,
sizeof(Scalar) * nnzbs_prec * block_size * block_size,
hipMemcpyHostToDevice, stream));
} else {
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals,
sizeof(Scalar) * nnz, hipMemcpyDeviceToDevice, stream));
}
prec->update_system_on_gpu(d_Avals);
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
@ -528,58 +508,15 @@ template<class Scalar, unsigned int block_size>
bool rocsparseSolverBackend<Scalar,block_size>::
analyze_matrix()
{
std::size_t d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
Timer t;
ROCSPARSE_CHECK(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host));
ROCSPARSE_CHECK(rocsparse_create_mat_info(&ilu_info));
#if HIP_VERSION >= 50400000
ROCSPARSE_CHECK(rocsparse_create_mat_info(&spmv_info));
#endif
ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_A));
ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_M));
ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_L));
ROCSPARSE_CHECK(rocsparse_set_mat_fill_mode(descr_L, rocsparse_fill_mode_lower));
ROCSPARSE_CHECK(rocsparse_set_mat_diag_type(descr_L, rocsparse_diag_type_unit));
ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_U));
ROCSPARSE_CHECK(rocsparse_set_mat_fill_mode(descr_U, rocsparse_fill_mode_upper));
ROCSPARSE_CHECK(rocsparse_set_mat_diag_type(descr_U, rocsparse_diag_type_non_unit));
ROCSPARSE_CHECK(rocsparse_dbsrilu0_buffer_size(handle, dir, Nb, nnzbs_prec,
descr_M, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_M));
ROCSPARSE_CHECK(rocsparse_dbsrsv_buffer_size(handle, dir, operation, Nb, nnzbs_prec,
descr_L, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_L));
ROCSPARSE_CHECK(rocsparse_dbsrsv_buffer_size(handle, dir, operation, Nb, nnzbs_prec,
descr_U, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_U));
d_bufferSize = std::max(d_bufferSize_M,
std::max(d_bufferSize_L, d_bufferSize_U));
HIP_CHECK(hipMalloc((void**)&d_buffer, d_bufferSize));
// analysis of ilu LU decomposition
ROCSPARSE_CHECK(rocsparse_dbsrilu0_analysis(handle, dir, \
Nb, nnzbs_prec, descr_M, d_Mvals, d_Mrows, d_Mcols, \
block_size, ilu_info, rocsparse_analysis_policy_reuse, rocsparse_solve_policy_auto, d_buffer));
int zero_position = 0;
rocsparse_status status = rocsparse_bsrilu0_zero_pivot(handle, ilu_info, &zero_position);
if (rocsparse_status_success != status) {
printf("L has structural and/or numerical zero at L(%d,%d)\n", zero_position, zero_position);
return false;
}
// analysis of ilu apply
ROCSPARSE_CHECK(rocsparse_dbsrsv_analysis(handle, dir, operation, \
Nb, nnzbs_prec, descr_L, d_Mvals, d_Mrows, d_Mcols, \
block_size, ilu_info, rocsparse_analysis_policy_reuse, rocsparse_solve_policy_auto, d_buffer));
ROCSPARSE_CHECK(rocsparse_dbsrsv_analysis(handle, dir, operation, \
Nb, nnzbs_prec, descr_U, d_Mvals, d_Mrows, d_Mcols, \
block_size, ilu_info, rocsparse_analysis_policy_reuse, rocsparse_solve_policy_auto, d_buffer));
#if HIP_VERSION >= 60000000
ROCSPARSE_CHECK(rocsparse_dbsrmv_analysis(handle, dir, operation,
@ -593,6 +530,13 @@ analyze_matrix()
block_size, spmv_info));
#endif
if(!prec->analyze_matrix(&*mat)) {
std::ostringstream out;
out << "Warning: rocsparseSolver matrix analysis failed!";
OpmLog::info(out.str());
return false;
}
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
std::ostringstream out;
@ -609,28 +553,7 @@ template<class Scalar, unsigned int block_size>
bool rocsparseSolverBackend<Scalar,block_size>::
create_preconditioner()
{
Timer t;
bool result = true;
ROCSPARSE_CHECK(rocsparse_dbsrilu0(handle, dir, Nb, nnzbs_prec, descr_M,
d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, rocsparse_solve_policy_auto, d_buffer));
// Check for zero pivot
int zero_position = 0;
rocsparse_status status = rocsparse_bsrilu0_zero_pivot(handle, ilu_info, &zero_position);
if(rocsparse_status_success != status)
{
printf("L has structural and/or numerical zero at L(%d,%d)\n", zero_position, zero_position);
return false;
}
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
std::ostringstream out;
out << "rocsparseSolver::create_preconditioner(): " << t.stop() << " s";
OpmLog::info(out.str());
}
return result;
return prec->create_preconditioner(&*mat);
} // end create_preconditioner()
template<class Scalar, unsigned int block_size>

View File

@ -26,6 +26,8 @@
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
#include <opm/simulators/linalg/bda/WellContributions.hpp>
#include <opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp>
#include <rocblas/rocblas.h>
#include <rocsparse/rocsparse.h>
@ -57,29 +59,26 @@ private:
bool analysis_done = false;
std::shared_ptr<BlockedMatrix<Scalar>> mat{}; // original matrix
std::shared_ptr<BlockedMatrix<Scalar>> jacMat{}; // matrix for preconditioner
int nnzbs_prec = 0; // number of nnz blocks in preconditioner matrix M
rocsparse_direction dir = rocsparse_direction_row;
rocsparse_operation operation = rocsparse_operation_none;
rocsparse_handle handle;
rocblas_handle blas_handle;
rocsparse_mat_descr descr_A, descr_M, descr_L, descr_U;
rocsparse_mat_info ilu_info;
rocsparse_mat_descr descr_A;
#if HIP_VERSION >= 50400000
rocsparse_mat_info spmv_info;
#endif
hipStream_t stream;
rocsparse_int *d_Arows, *d_Mrows;
rocsparse_int *d_Acols, *d_Mcols;
Scalar *d_Avals, *d_Mvals;
Scalar *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
rocsparse_int *d_Arows, *d_Acols;
Scalar *d_Avals;
Scalar *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
Scalar *d_pw, *d_s, *d_t, *d_v;
void *d_buffer; // buffer space, used by rocsparse ilu0 analysis
int ver;
char rev[64];
std::unique_ptr<rocsparsePreconditioner<Scalar, block_size> > prec; // can perform blocked ILU0 and AMG on pressure component
/// Solve linear system using ilu0-bicgstab
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
@ -119,14 +118,16 @@ public:
/// \param[in] tolerance required relative tolerance for rocsparseSolver
/// \param[in] platformID the OpenCL platform to be used
/// \param[in] deviceID the device to be used
/// \param[in] linsolver indicating the preconditioner, equal to the --linear-solver cmdline argument
rocsparseSolverBackend(int linear_solver_verbosity,
int maxit,
Scalar tolerance,
unsigned int platformID,
unsigned int deviceID);
unsigned int deviceID,
std::string linsolver);
/// For the CPR coarse solver
// rocsparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder);
rocsparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, bool opencl_ilu_reorder);
/// Destroy a openclSolver, and free memory
~rocsparseSolverBackend();
@ -144,10 +145,6 @@ public:
WellContributions<Scalar>& wellContribs,
BdaResult& res) override;
/// Solve scalar linear system, for example a coarse system of an AMG preconditioner
/// Data is already on the GPU
// SolverStatus solve_system(BdaResult &res);
/// Get result after linear solve, and peform postprocessing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void get_result(Scalar* x) override;