cleanup and run fix

This commit is contained in:
Razvan Nane 2024-06-05 15:00:47 +02:00
parent 7a307fafa0
commit dcbd9be46a
15 changed files with 490 additions and 341 deletions

View File

@ -47,7 +47,8 @@ CprCreation<Scalar, block_size>::CprCreation()
template <class Scalar, unsigned int block_size>
void CprCreation<Scalar, block_size>::
create_preconditioner_amg(BlockedMatrix<Scalar> *mat_) {
create_preconditioner_amg(BlockedMatrix<Scalar> *mat_)
{
mat = mat_;
cprNb = mat_->Nb;
cprnnzb = mat_->nnzbs;
@ -183,7 +184,9 @@ create_preconditioner_amg(BlockedMatrix<Scalar> *mat_) {
}
template <class Scalar, unsigned int block_size>
void CprCreation<Scalar, block_size>::analyzeHierarchy() {
void CprCreation<Scalar, block_size>::
analyzeHierarchy()
{
const typename DuneAmg::ParallelMatrixHierarchy& matrixHierarchy = dune_amg->matrices();
// store coarsest AMG level in umfpack format, also performs LU decomposition
@ -237,8 +240,9 @@ void CprCreation<Scalar, block_size>::analyzeHierarchy() {
template <class Scalar, unsigned int block_size>
void CprCreation<Scalar, block_size>::analyzeAggregateMaps() {
void CprCreation<Scalar, block_size>::
analyzeAggregateMaps()
{
PcolIndices.resize(num_levels - 1);
Rmatrices.clear();

View File

@ -6,14 +6,16 @@
namespace Opm::Accelerator {
// divide A by B, and round up: return (int)ceil(A/B)
unsigned int ceilDivision(const unsigned int A, const unsigned int B)
unsigned int ceilDivision(const unsigned int A,
const unsigned int B)
{
return A / B + (A % B > 0);
}
// return the absolute value of the N elements for which the absolute value is highest
template<class Scalar>
Scalar get_absmax(const Scalar* data, const int N)
Scalar get_absmax(const Scalar* data,
const int N)
{
return std::abs(*std::max_element(data, data + N,
[](Scalar a, Scalar b)
@ -22,7 +24,9 @@ Scalar get_absmax(const Scalar* data, const int N)
// solve A^T * x = b
template<class Scalar>
void solve_transposed_3x3(const Scalar* A, const Scalar* b, Scalar* x)
void solve_transposed_3x3(const Scalar* A,
const Scalar* b,
Scalar* x)
{
const int B = 3;
// from dune-common/densematrix.hh, but transposed, so replace [r*B+c] with [r+c*B]

View File

@ -1,15 +1,58 @@
#ifndef OPM_MISC_HPP
#define OPM_MISC_HPP
#include <hip/hip_runtime_api.h>
#include <hip/hip_version.h>
#define HIP_CHECK(STAT) \
do { \
const hipError_t stat = (STAT); \
if(stat != hipSuccess) \
{ \
std::ostringstream oss; \
oss << "rocsparseSolverBackend::hip "; \
oss << "error: " << hipGetErrorString(stat); \
OPM_THROW(std::logic_error, oss.str()); \
} \
} while(0)
#define ROCSPARSE_CHECK(STAT) \
do { \
const rocsparse_status stat = (STAT); \
if(stat != rocsparse_status_success) \
{ \
std::ostringstream oss; \
oss << "rocsparseSolverBackend::rocsparse "; \
oss << "error: " << stat; \
OPM_THROW(std::logic_error, oss.str()); \
} \
} while(0)
#define ROCBLAS_CHECK(STAT) \
do { \
const rocblas_status stat = (STAT); \
if(stat != rocblas_status_success) \
{ \
std::ostringstream oss; \
oss << "rocsparseSolverBackend::rocblas "; \
oss << "error: " << stat; \
OPM_THROW(std::logic_error, oss.str()); \
} \
} while(0)
namespace Opm::Accelerator {
unsigned int ceilDivision(const unsigned int A, const unsigned int B);
unsigned int ceilDivision(const unsigned int A,
const unsigned int B);
template<class Scalar>
Scalar get_absmax(const Scalar *data, const int N);
Scalar get_absmax(const Scalar *data,
const int N);
template<class Scalar>
void solve_transposed_3x3(const Scalar *A, const Scalar *b, Scalar *x);
void solve_transposed_3x3(const Scalar *A,
const Scalar *b,
Scalar *x);
}

View File

@ -28,43 +28,26 @@
#include <opm/simulators/linalg/bda/rocm/hipKernels.hpp>
#include <opm/simulators/linalg/bda/Misc.hpp>
#include <hip/hip_runtime.h>
#include <hip/hip_version.h>
#define HIP_CHECK(STAT) \
do { \
const hipError_t stat = (STAT); \
if(stat != hipSuccess) \
{ \
std::ostringstream oss; \
oss << "rocsparseSolverBackend::hip "; \
oss << "error: " << hipGetErrorString(stat); \
OPM_THROW(std::logic_error, oss.str()); \
} \
} while(0)
namespace Opm
{
namespace Opm {
using Opm::OpmLog;
using Dune::Timer;
// define static variables and kernels
int HipKernels::verbosity;
double* HipKernels::tmp;
bool HipKernels::initialized = false;
std::size_t HipKernels::preferred_workgroup_size_multiple = 0;
template<class Scalar> int HipKernels<Scalar>::verbosity = 0;
template<class Scalar> bool HipKernels<Scalar>::initialized = false;
#ifdef __HIP__
/// HIP kernel to multiply vector with another vector and a scalar, element-wise
// add result to a third vector
__global__ void vmul_k(
const double alpha,
double const *in1,
double const *in2,
double *out,
const int N)
template<class Scalar>
__global__ void vmul_k(const Scalar alpha,
Scalar const *in1,
Scalar const *in2,
Scalar *out,
const int N)
{
unsigned int NUM_THREADS = gridDim.x;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
@ -77,18 +60,18 @@ __global__ void vmul_k(
/// HIP kernel to transform blocked vector to scalar vector using pressure-weights
// every workitem handles one blockrow
__global__ void full_to_pressure_restriction_k(
const double *fine_y,
const double *weights,
double *coarse_y,
const unsigned int Nb)
template<class Scalar>
__global__ void full_to_pressure_restriction_k(const Scalar *fine_y,
const Scalar *weights,
Scalar *coarse_y,
const unsigned int Nb)
{
const unsigned int NUM_THREADS = gridDim.x;
const unsigned int block_size = 3;
unsigned int target_block_row = blockDim.x * blockIdx.x + threadIdx.x;
while(target_block_row < Nb){
double sum = 0.0;
Scalar sum = 0.0;
unsigned int idx = block_size * target_block_row;
for (unsigned int i = 0; i < block_size; ++i) {
sum += fine_y[idx + i] * weights[idx + i];
@ -100,11 +83,11 @@ __global__ void full_to_pressure_restriction_k(
/// HIP kernel to add the coarse pressure solution back to the finer, complete solution
// every workitem handles one blockrow
__global__ void add_coarse_pressure_correction_k(
const double *coarse_x,
double *fine_x,
const unsigned int pressure_idx,
const unsigned int Nb)
template<class Scalar>
__global__ void add_coarse_pressure_correction_k(const Scalar *coarse_x,
Scalar *fine_x,
const unsigned int pressure_idx,
const unsigned int Nb)
{
const unsigned int NUM_THREADS = gridDim.x;
const unsigned int block_size = 3;
@ -118,11 +101,11 @@ __global__ void add_coarse_pressure_correction_k(
/// HIP kernel to prolongate vector during amg cycle
// every workitem handles one row
__global__ void prolongate_vector_k(
const double *in,
double *out,
const int *cols,
const unsigned int N)
template<class Scalar>
__global__ void prolongate_vector_k(const Scalar *in,
Scalar *out,
const int *cols,
const unsigned int N)
{
const unsigned int NUM_THREADS = gridDim.x;
unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
@ -137,18 +120,17 @@ __global__ void prolongate_vector_k(
// algorithm based on:
// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
__global__ void residual_blocked_k(
const double *vals,
const int *cols,
const int *rows,
const int Nb,
const double *x,
const double *rhs,
double *out,
const unsigned int block_size
)
template<class Scalar>
__global__ void residual_blocked_k(const Scalar *vals,
const int *cols,
const int *rows,
const int Nb,
const Scalar *x,
const Scalar *rhs,
Scalar *out,
const unsigned int block_size)
{
extern __shared__ double tmp[];
extern __shared__ Scalar tmp[];
const unsigned int warpsize = warpSize;
const unsigned int bsize = blockDim.x;
const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x;
@ -174,12 +156,12 @@ __global__ void residual_blocked_k(
unsigned int first_block = rows[target_block_row];
unsigned int last_block = rows[target_block_row+1];
unsigned int block = first_block + lane / (bs*bs);
double local_out = 0.0;
Scalar local_out = 0.0;
if(lane < num_active_threads){
for(; block < last_block; block += num_blocks_per_warp){
double x_elem = x[cols[block]*bs + c];
double A_elem = vals[block*bs*bs + c + r*bs];
Scalar x_elem = x[cols[block]*bs + c];
Scalar A_elem = vals[block*bs*bs + c + r*bs];
local_out += x_elem * A_elem;
}
}
@ -210,17 +192,16 @@ __global__ void residual_blocked_k(
// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
// template <unsigned shared_mem_size>
__global__ void residual_k(
const double *vals,
const int *cols,
const int *rows,
const int N,
const double *x,
const double *rhs,
double *out
)
template<class Scalar>
__global__ void residual_k(const Scalar *vals,
const int *cols,
const int *rows,
const int N,
const Scalar *x,
const Scalar *rhs,
Scalar *out)
{
extern __shared__ double tmp[];
extern __shared__ Scalar tmp[];
const unsigned int bsize = blockDim.x;
const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x;
const unsigned int idx_b = gid / bsize;
@ -233,7 +214,7 @@ __global__ void residual_k(
int rowStart = rows[row];
int rowEnd = rows[row+1];
int rowLength = rowEnd - rowStart;
double local_sum = 0.0;
Scalar local_sum = 0.0;
for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
int col = cols[j];
local_sum += vals[j] * x[col];
@ -259,16 +240,15 @@ __global__ void residual_k(
}
}
__global__ void spmv_k(
const double *vals,
const int *cols,
const int *rows,
const int N,
const double *x,
double *out
)
template<class Scalar>
__global__ void spmv_k(const Scalar *vals,
const int *cols,
const int *rows,
const int N,
const Scalar *x,
Scalar *out)
{
extern __shared__ double tmp[];
extern __shared__ Scalar tmp[];
const unsigned int bsize = blockDim.x;
const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x;
const unsigned int idx_b = gid / bsize;
@ -281,7 +261,7 @@ __global__ void spmv_k(
int rowStart = rows[row];
int rowEnd = rows[row+1];
int rowLength = rowEnd - rowStart;
double local_sum = 0.0;
Scalar local_sum = 0.0;
for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
int col = cols[j];
local_sum += vals[j] * x[col];
@ -308,7 +288,9 @@ __global__ void spmv_k(
}
#endif
void HipKernels::init(int verbosity_)
template<class Scalar>
void HipKernels<Scalar>::
init(int verbosity_)
{
if (initialized) {
OpmLog::debug("Warning HipKernels is already initialized");
@ -320,7 +302,14 @@ void HipKernels::init(int verbosity_)
initialized = true;
}
void HipKernels::vmul(const double alpha, double* in1, double* in2, double* out, int N, hipStream_t stream)
template<class Scalar>
void HipKernels<Scalar>::
vmul(const Scalar alpha,
Scalar* in1,
Scalar* in2,
Scalar* out,
int N,
hipStream_t stream)
{
Timer t_vmul;
#ifdef __HIP__
@ -342,7 +331,13 @@ void HipKernels::vmul(const double alpha, double* in1, double* in2, double* out,
}
}
void HipKernels::full_to_pressure_restriction(const double* fine_y, double* weights, double* coarse_y, int Nb, hipStream_t stream)
template<class Scalar>
void HipKernels<Scalar>::
full_to_pressure_restriction(const Scalar* fine_y,
Scalar* weights,
Scalar* coarse_y,
int Nb,
hipStream_t stream)
{
Timer t;
#ifdef __HIP__
@ -364,7 +359,13 @@ void HipKernels::full_to_pressure_restriction(const double* fine_y, double* weig
}
}
void HipKernels::add_coarse_pressure_correction(double* coarse_x, double* fine_x, int pressure_idx, int Nb, hipStream_t stream)
template<class Scalar>
void HipKernels<Scalar>::
add_coarse_pressure_correction(Scalar* coarse_x,
Scalar* fine_x,
int pressure_idx,
int Nb,
hipStream_t stream)
{
Timer t;
#ifdef __HIP__
@ -386,7 +387,13 @@ void HipKernels::add_coarse_pressure_correction(double* coarse_x, double* fine_x
}
}
void HipKernels::prolongate_vector(const double* in, double* out, const int* cols, int N, hipStream_t stream)
template<class Scalar>
void HipKernels<Scalar>::
prolongate_vector(const Scalar* in,
Scalar* out,
const int* cols,
int N,
hipStream_t stream)
{
Timer t;
@ -394,7 +401,7 @@ void HipKernels::prolongate_vector(const double* in, double* out, const int* col
unsigned blockDim = 64;
unsigned blocks = Accelerator::ceilDivision(N, blockDim);
unsigned gridDim = blocks * blockDim;
unsigned shared_mem_size = blockDim * sizeof(double);
unsigned shared_mem_size = blockDim * sizeof(Scalar);
// dim3(N) will create a vector {N, 1, 1}
prolongate_vector_k<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(in, out, cols, N);
@ -410,7 +417,17 @@ void HipKernels::prolongate_vector(const double* in, double* out, const int* col
}
}
void HipKernels::residual(double* vals, int* cols, int* rows, double* x, const double* rhs, double* out, int Nb, unsigned int block_size, hipStream_t stream)
template<class Scalar>
void HipKernels<Scalar>::
residual(Scalar* vals,
int* cols,
int* rows,
Scalar* x,
const Scalar* rhs,
Scalar* out,
int Nb,
unsigned int block_size,
hipStream_t stream)
{
Timer t_residual;
@ -418,7 +435,7 @@ void HipKernels::residual(double* vals, int* cols, int* rows, double* x, const d
unsigned blockDim = 64;
const unsigned int num_work_groups = Accelerator::ceilDivision(Nb, blockDim);
unsigned gridDim = num_work_groups * blockDim;
unsigned shared_mem_size = blockDim * sizeof(double);
unsigned shared_mem_size = blockDim * sizeof(Scalar);
if (block_size > 1) {
// dim3(N) will create a vector {N, 1, 1}
@ -439,14 +456,23 @@ void HipKernels::residual(double* vals, int* cols, int* rows, double* x, const d
}
}
void HipKernels::spmv(double* vals, int* cols, int* rows, double* x, double* y, int Nb, unsigned int block_size, hipStream_t stream)
template<class Scalar>
void HipKernels<Scalar>::
spmv(Scalar* vals,
int* cols,
int* rows,
Scalar* x,
Scalar* y,
int Nb,
unsigned int block_size,
hipStream_t stream)
{//NOTE: block_size not used since I use this kernel only for block sizes 1, other uses use rocsparse!
Timer t_spmv;
#ifdef __HIP__
unsigned blockDim = 64;
const unsigned int num_work_groups = Accelerator::ceilDivision(Nb, blockDim);
unsigned gridDim = num_work_groups * blockDim;
unsigned shared_mem_size = blockDim * sizeof(double);
unsigned shared_mem_size = blockDim * sizeof(Scalar);
spmv_k<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(vals, cols, rows, Nb, x, y);
@ -462,5 +488,6 @@ void HipKernels::spmv(double* vals, int* cols, int* rows, double* x, double* y,
}
}
template class HipKernels<double>;
} // namespace Opm

View File

@ -17,8 +17,8 @@
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef HIPKERNELS_HPP
#define HIPKERNELS_HPP
#ifndef OPM_HIPKERNELS_HPP
#define OPM_HIPKERNELS_HPP
#include <string>
#include <memory>
@ -27,29 +27,110 @@
#include <hip/hip_runtime_api.h>
#include <hip/hip_version.h>
// #include <opm/simulators/linalg/bda/opencl/opencl.hpp>
namespace Opm
{
namespace Opm {
template<class Scalar>
class HipKernels
{
private:
static int verbosity;
static double* tmp; // used as tmp CPU buffer for dot() and norm()
static int verbosity;
static bool initialized;
static std::size_t preferred_workgroup_size_multiple; // stores CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
HipKernels(){}; // disable instantiation
HipKernels();
public:
/// Initialize verbosity level for the HIP kernels
/// \param[in] verbosity verbosity level
static void init(int verbosity);
static void full_to_pressure_restriction(const double* fine_y, double* weights, double* coarse_y, int Nb, hipStream_t stream);
static void add_coarse_pressure_correction(double* coarse_x, double* fine_x, int pressure_idx, int Nb, hipStream_t stream);
static void vmul(const double alpha, double* in1, double* in2, double* out, int N, hipStream_t stream);
static void prolongate_vector(const double* in, double* out, const int* cols, int N, hipStream_t stream);
static void residual(double* vals, int* cols, int* rows, double* x, const double* rhs, double* out, int Nb, unsigned int block_size, hipStream_t stream);
static void spmv(double* vals, int* cols, int* rows, double* x, double* y, int Nb, unsigned int block_size, hipStream_t stream);
/// Transform blocked vector to scalar vector using pressure-weights, where every workitem handles one blockrow
/// \param[in] fine_y Input y vector
/// \param[in] weights Weights used to combine cells
/// \param[out] course_y Output y vector
/// \param[in] Nb Number of blocks in the original matrix
/// \param[in] stream Hip stream to use for the computations
static void full_to_pressure_restriction(const Scalar* fine_y,
Scalar* weights,
Scalar* coarse_y,
int Nb,
hipStream_t stream);
/// Add the coarse pressure solution back to the finer, complete solution; every workitem handles one blockrow
/// \param[in] coarse_x Input scalar x vector
/// \param[out] fine_x Output blocked x vector
/// \param[in] pressure_idx Pressure index
/// \param[in] Nb Number of blocks in the original matrix
/// \param[in] stream Hip stream to use for the computations
static void add_coarse_pressure_correction(Scalar* coarse_x,
Scalar* fine_x,
int pressure_idx,
int Nb,
hipStream_t stream);
/// Function to multiply vector with another vector and a scalar, element-wise and add the result to a third vector (out = alpha * in1 + in2)
/// \param[in] alpha Input scalar
/// \param[in] in1 First input vector
/// \param[in] in2 Second input vector
/// \param[out] out Output vector
/// \param[in] N Size of the vector
/// \param[in] stream Hip stream to use for the computations
static void vmul(const Scalar alpha,
Scalar* in1,
Scalar* in2,
Scalar* out,
int N,
hipStream_t stream);
/// Function to prolongate vector during amg cycle, every workitem handles one row
/// \param[in] in Input fine-grained vector
/// \param[out] out Output course-graned vector
/// \param[in] cols Column indexes
/// \param[in] N Size of the vector
/// \param[in] stream Hip stream to use for the computations
static void prolongate_vector(const Scalar* in,
Scalar* out,
const int* cols,
int N,
hipStream_t stream);
/// Function to perform res = rhs - mat * x
/// \param[in] vals Matrix values
/// \param[in] cols Column indexes
/// \param[in] rows Row pointers
/// \param[in] x X vector
/// \param[in] rhs Rhs vector
/// \param[out] out Output res vector
/// \param[in] Nb Number of non-zero blocks in the original matrix
/// \param[in] block_size Block size
/// \param[in] stream Hip stream to use for the computations
static void residual(Scalar* vals,
int* cols,
int* rows,
Scalar* x,
const Scalar* rhs,
Scalar* out,
int Nb,
unsigned int block_size,
hipStream_t stream);
/// Function to perform sparse matrix vector multipliation
/// \param[in] vals Matrix values
/// \param[in] cols Column indexes
/// \param[in] rows Row pointers
/// \param[in] x Input x vector
/// \param[out] y Output y vector
/// \param[in] Nb Number of non-zero blocks in the original matrix
/// \param[in] block_size Block size
/// \param[in] stream Hip stream to use for the computations
static void spmv(Scalar* vals,
int* cols,
int* rows,
Scalar* x,
Scalar* y,
int Nb,
unsigned int block_size,
hipStream_t stream);
};
} // namespace Opm

View File

@ -26,48 +26,11 @@
#include <opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp>
#include <opm/simulators/linalg/bda/Reorder.hpp>
#include <opm/simulators/linalg/bda/Misc.hpp>
#include <sstream>
#include <iostream> //Razvan
#include <hip/hip_runtime_api.h>
#define HIP_CHECK(STAT) \
do { \
const hipError_t stat = (STAT); \
if(stat != hipSuccess) \
{ \
std::ostringstream oss; \
oss << "rocsparseBILU0::hip "; \
oss << "error: " << hipGetErrorString(stat); \
OPM_THROW(std::logic_error, oss.str()); \
} \
} while(0)
#define ROCSPARSE_CHECK(STAT) \
do { \
const rocsparse_status stat = (STAT); \
if(stat != rocsparse_status_success) \
{ \
std::ostringstream oss; \
oss << "rocsparseBILU0::rocsparse "; \
oss << "error: " << stat; \
OPM_THROW(std::logic_error, oss.str()); \
} \
} while(0)
#define ROCBLAS_CHECK(STAT) \
do { \
const rocblas_status stat = (STAT); \
if(stat != rocblas_status_success) \
{ \
std::ostringstream oss; \
oss << "rocsparseBILU0::rocblas "; \
oss << "error: " << stat; \
OPM_THROW(std::logic_error, oss.str()); \
} \
} while(0)
#if HAVE_OPENMP
#include <thread>
#include <omp.h>
@ -88,7 +51,11 @@ rocsparseBILU0(int verbosity_) :
template <class Scalar, unsigned int block_size>
bool rocsparseBILU0<Scalar, block_size>::
initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix, std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix, rocsparse_int *d_Arows, rocsparse_int *d_Acols) {
initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
rocsparse_int *d_Arows,
rocsparse_int *d_Acols)
{
this->Nb = matrix->Nb;
this->N = Nb * block_size;
this->nnzb = matrix->nnzbs;
@ -101,14 +68,14 @@ initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix, std::shared_ptr<Blocke
this->jacMat = jacMatrix;
}
HIP_CHECK(hipMalloc((void**)&d_t, sizeof(double) * this->N));
HIP_CHECK(hipMalloc((void**)&d_t, sizeof(Scalar) * this->N));
if (this->useJacMatrix) {
HIP_CHECK(hipMalloc((void**)&d_Mrows, sizeof(rocsparse_int) * (Nb + 1)));
HIP_CHECK(hipMalloc((void**)&d_Mcols, sizeof(rocsparse_int) * this->nnzbs_prec));
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(double) * this->nnzbs_prec * block_size * block_size));
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(Scalar) * this->nnzbs_prec * block_size * block_size));
} else { // preconditioner matrix is same
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(double) * this->nnzbs_prec * block_size * block_size));
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(Scalar) * this->nnzbs_prec * block_size * block_size));
d_Mcols = d_Acols;
d_Mrows = d_Arows;
}
@ -125,10 +92,13 @@ analyze_matrix(BlockedMatrix<Scalar> *mat) {
template <class Scalar, unsigned int block_size>
bool rocsparseBILU0<Scalar, block_size>::
analyze_matrix(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat) {
analyze_matrix(BlockedMatrix<Scalar> *mat,
BlockedMatrix<Scalar> *jacMat)
{
std::size_t d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
Timer t;
ROCSPARSE_CHECK(rocsparse_create_mat_info(&ilu_info));
#if HIP_VERSION >= 50400000
ROCSPARSE_CHECK(rocsparse_create_mat_info(&spmv_info));
#endif
@ -193,7 +163,9 @@ create_preconditioner(BlockedMatrix<Scalar> *mat) {
template <class Scalar, unsigned int block_size>
bool rocsparseBILU0<Scalar, block_size>::
create_preconditioner(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat) {
create_preconditioner(BlockedMatrix<Scalar> *mat,
BlockedMatrix<Scalar> *jacMat)
{
Timer t;
bool result = true;
@ -219,7 +191,7 @@ create_preconditioner(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat)
template <class Scalar, unsigned int block_size>
void rocsparseBILU0<Scalar, block_size>::
copy_system_to_gpu(double *d_Avals) {
copy_system_to_gpu(Scalar *d_Avals) {
Timer t;
if (this->useJacMatrix) {
@ -230,9 +202,9 @@ copy_system_to_gpu(double *d_Avals) {
#endif
HIP_CHECK(hipMemcpyAsync(d_Mrows, this->jacMat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, this->stream));
HIP_CHECK(hipMemcpyAsync(d_Mcols, this->jacMat->colIndices, sizeof(rocsparse_int) * this->nnzbs_prec, hipMemcpyHostToDevice, this->stream));
HIP_CHECK(hipMemcpyAsync(d_Mvals, this->jacMat->nnzValues, sizeof(double) * this->nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, this->stream));
HIP_CHECK(hipMemcpyAsync(d_Mvals, this->jacMat->nnzValues, sizeof(Scalar) * this->nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, this->stream));
} else {
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(double) * nnz, hipMemcpyDeviceToDevice, this->stream));
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(Scalar) * nnz, hipMemcpyDeviceToDevice, this->stream));
}
if (verbosity >= 3) {
@ -246,7 +218,7 @@ copy_system_to_gpu(double *d_Avals) {
// don't copy rowpointers and colindices, they stay the same
template <class Scalar, unsigned int block_size>
void rocsparseBILU0<Scalar, block_size>::
update_system_on_gpu(double *d_Avals) {
update_system_on_gpu(Scalar *d_Avals) {
Timer t;
if (this->useJacMatrix) {
@ -255,9 +227,9 @@ update_system_on_gpu(double *d_Avals) {
copyThread->join();
}
#endif
HIP_CHECK(hipMemcpyAsync(d_Mvals, this->jacMat->nnzValues, sizeof(double) * this->nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, this->stream));
HIP_CHECK(hipMemcpyAsync(d_Mvals, this->jacMat->nnzValues, sizeof(Scalar) * this->nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, this->stream));
} else {
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(double) * nnz, hipMemcpyDeviceToDevice, this->stream));
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(Scalar) * nnz, hipMemcpyDeviceToDevice, this->stream));
}
if (verbosity >= 3) {
@ -270,9 +242,9 @@ update_system_on_gpu(double *d_Avals) {
template <class Scalar, unsigned int block_size>
void rocsparseBILU0<Scalar, block_size>::
apply(double& y, double& x) {
double zero = 0.0;
double one = 1.0;
apply(Scalar& y, Scalar& x) {
Scalar zero = 0.0;
Scalar one = 1.0;
Timer t_apply;

View File

@ -57,7 +57,7 @@ private:
#endif
rocsparse_int *d_Mrows, *d_Mcols;
double *d_Mvals, *d_t;
Scalar *d_Mvals, *d_t;
void *d_buffer; // buffer space, used by rocsparse ilu0 analysis
public:
@ -67,29 +67,57 @@ public:
/// Initialize GPU and allocate memory
/// \param[in] matrix matrix A
/// \param[in] jacMatrix matrix for preconditioner
bool initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix, std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix, rocsparse_int *d_Arows, rocsparse_int *d_Acols);
bool initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
rocsparse_int *d_Arows,
rocsparse_int *d_Acols);
// analysis, extract parallelism if specified
/// Analysis, extract parallelism if specified
/// \param[in] mat matrix A
bool analyze_matrix(BlockedMatrix<Scalar> *mat);
bool analyze_matrix(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat);
/// Analysis, extract parallelism if specified
/// \param[in] mat matrix A
/// \param[in] jacMat matrix for preconditioner, analyze this as well
bool analyze_matrix(BlockedMatrix<Scalar> *mat,
BlockedMatrix<Scalar> *jacMat);
// ilu_decomposition
/// ILU decomposition
/// \param[in] mat matrix A to decompose
bool create_preconditioner(BlockedMatrix<Scalar> *mat) override;
bool create_preconditioner(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat) override;
/// ILU decomposition
/// \param[in] mat matrix A
/// \param[in] jacMat matrix for preconditioner, decompose this one if used
bool create_preconditioner(BlockedMatrix<Scalar> *mat,
BlockedMatrix<Scalar> *jacMat) override;
// apply preconditioner, x = prec(y)
// via Lz = y
// and Ux = z
void apply(double& y, double& x) override;
/// Apply preconditioner, x = prec(y)
/// via Lz = y
/// and Ux = z
/// \param[in] y Input y vector
/// \param[out] x Output x vector
void apply(Scalar& y,
Scalar& x) override;
#if HAVE_OPENCL
// apply preconditioner, x = prec(y)
void apply(const cl::Buffer& y, cl::Buffer& x) {}
void apply(const cl::Buffer& y,
cl::Buffer& x) {}
#endif
void copy_system_to_gpu(double *mVals);
void update_system(double *vals, double *b);
void update_system_on_gpu(double *b);
/// Copy matrix A values to GPU
/// \param[in] mVals Input values
void copy_system_to_gpu(Scalar *mVals);
/// Reassign pointers, in case the addresses of the Dune variables have changed --> TODO: check when/if we need this method
// /// \param[in] vals New values
// /// \param[in] b New b vector
// void update_system(Scalar *vals, Scalar *b);
/// Update GPU values after a new assembly is done
/// \param[in] b New b vector
void update_system_on_gpu(Scalar *b);
};
} // namespace Opm

View File

@ -35,45 +35,6 @@
#include <opm/simulators/linalg/bda/Misc.hpp>
#include <hip/hip_runtime.h>
#include <hip/hip_version.h>
#define HIP_CHECK(STAT) \
do { \
const hipError_t stat = (STAT); \
if(stat != hipSuccess) \
{ \
std::ostringstream oss; \
oss << "rocsparseSolverBackend::hip "; \
oss << "error: " << hipGetErrorString(stat); \
OPM_THROW(std::logic_error, oss.str()); \
} \
} while(0)
#define ROCSPARSE_CHECK(STAT) \
do { \
const rocsparse_status stat = (STAT); \
if(stat != rocsparse_status_success) \
{ \
std::ostringstream oss; \
oss << "rocsparseSolverBackend::rocsparse "; \
oss << "error: " << stat; \
OPM_THROW(std::logic_error, oss.str()); \
} \
} while(0)
#define ROCBLAS_CHECK(STAT) \
do { \
const rocblas_status stat = (STAT); \
if(stat != rocblas_status_success) \
{ \
std::ostringstream oss; \
oss << "rocsparseSolverBackend::rocblas "; \
oss << "error: " << stat; \
OPM_THROW(std::logic_error, oss.str()); \
} \
} while(0)
namespace Opm::Accelerator {
using Opm::OpmLog;
@ -88,7 +49,11 @@ rocsparseCPR<Scalar, block_size>::rocsparseCPR(int verbosity_) :
template <class Scalar, unsigned int block_size>
bool rocsparseCPR<Scalar, block_size>::
initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix, std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix, rocsparse_int *d_Arows, rocsparse_int *d_Acols) {
initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
rocsparse_int *d_Arows,
rocsparse_int *d_Acols)
{
this->Nb = matrix->Nb;
this->nnzb = matrix->nnzbs;
this->N = Nb * block_size;
@ -129,7 +94,9 @@ analyze_matrix(BlockedMatrix<Scalar> *mat_) {
template <class Scalar, unsigned int block_size>
bool rocsparseCPR<Scalar, block_size>::
analyze_matrix(BlockedMatrix<Scalar> *mat_, BlockedMatrix<Scalar> *jacMat) {
analyze_matrix(BlockedMatrix<Scalar> *mat_,
BlockedMatrix<Scalar> *jacMat)
{
this->Nb = mat_->Nb;
this->nnzb = mat_->nnzbs;
this->N = Nb * block_size;
@ -142,7 +109,10 @@ analyze_matrix(BlockedMatrix<Scalar> *mat_, BlockedMatrix<Scalar> *jacMat) {
}
template <class Scalar, unsigned int block_size>
bool rocsparseCPR<Scalar, block_size>::create_preconditioner(BlockedMatrix<Scalar> *mat_, BlockedMatrix<Scalar> *jacMat) {
bool rocsparseCPR<Scalar, block_size>::
create_preconditioner(BlockedMatrix<Scalar> *mat_,
BlockedMatrix<Scalar> *jacMat)
{
Dune::Timer t_bilu0;
bool result = bilu0->create_preconditioner(mat_, jacMat);
if (verbosity >= 3) {
@ -205,7 +175,7 @@ init_rocm_buffers() {
d_Rmatrices.reserve(this->num_levels - 1);
d_invDiags.reserve(this->num_levels - 1);
for (Matrix<Scalar>& m : this->Amatrices) {
d_Amatrices.emplace_back(m.N, m.M, m.nnzs, 1);//TOCHECK: this memory might not be contigous!!
d_Amatrices.emplace_back(m.N, m.M, m.nnzs, 1);
}
for (Matrix<Scalar>& m : this->Rmatrices) {
@ -246,7 +216,10 @@ rocm_upload() {
template <class Scalar, unsigned int block_size>
void rocsparseCPR<Scalar, block_size>::
amg_cycle_gpu(const int level, Scalar &y, Scalar &x) {
amg_cycle_gpu(const int level,
Scalar &y,
Scalar &x)
{
RocmMatrix<Scalar> *A = &d_Amatrices[level];
RocmMatrix<Scalar> *R = &d_Rmatrices[level];
int Ncur = A->Nb;
@ -281,52 +254,56 @@ amg_cycle_gpu(const int level, Scalar &y, Scalar &x) {
// presmooth
Scalar jacobi_damping = 0.65; // default value in amgcl: 0.72
for (unsigned i = 0; i < this->num_pre_smooth_steps; ++i){
HipKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
HipKernels::vmul(jacobi_damping, d_invDiags[level].nnzValues, t.nnzValues, &x, Ncur, this->stream);
HipKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
HipKernels<Scalar>::vmul(jacobi_damping, d_invDiags[level].nnzValues, t.nnzValues, &x, Ncur, this->stream);
}
// move to coarser level
HipKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
HipKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
// TODO: understand why rocsparse spmv library function does not here.
// ROCSPARSE_CHECK(rocsparse_dbsrmv(this->handle, this->dir, this->operation,
// R->Nb, R->Mb, R->nnzbs, &one, descr_R,
// R->nnzValues, R->rowPointers, R->colIndices, 1,
// t.nnzValues, &zero, f.nnzValues));
HipKernels::spmv(R->nnzValues, R->colIndices, R->rowPointers, t.nnzValues, f.nnzValues, Nnext, 1, this->stream);
HipKernels<Scalar>::spmv(R->nnzValues, R->colIndices, R->rowPointers, t.nnzValues, f.nnzValues, Nnext, 1, this->stream);
amg_cycle_gpu(level + 1, *f.nnzValues, *u.nnzValues);
HipKernels::prolongate_vector(u.nnzValues, &x, d_PcolIndices[level].nnzValues, Ncur, this->stream);
HipKernels<Scalar>::prolongate_vector(u.nnzValues, &x, d_PcolIndices[level].nnzValues, Ncur, this->stream);
// postsmooth
for (unsigned i = 0; i < this->num_post_smooth_steps; ++i){
HipKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
HipKernels::vmul(jacobi_damping, d_invDiags[level].nnzValues, t.nnzValues, &x, Ncur, this->stream);
HipKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
HipKernels<Scalar>::vmul(jacobi_damping, d_invDiags[level].nnzValues, t.nnzValues, &x, Ncur, this->stream);
}
}
// x = prec(y)
template <class Scalar, unsigned int block_size>
void rocsparseCPR<Scalar, block_size>::
apply_amg(const Scalar& y, Scalar& x) {
apply_amg(const Scalar& y,
Scalar& x)
{
HIP_CHECK(hipMemsetAsync(d_coarse_x.data()->nnzValues, 0, sizeof(Scalar) * this->Nb, this->stream));
for (unsigned int i = 0; i < d_u.size(); ++i) {
d_u[i].upload(this->Rmatrices[i].nnzValues.data(), this->stream);
}
HipKernels::residual(d_mat->nnzValues, d_mat->colIndices, d_mat->rowPointers, &x, &y, d_rs.data()->nnzValues, this->Nb, block_size, this->stream);
HipKernels<Scalar>::residual(d_mat->nnzValues, d_mat->colIndices, d_mat->rowPointers, &x, &y, d_rs.data()->nnzValues, this->Nb, block_size, this->stream);
HipKernels::full_to_pressure_restriction(d_rs.data()->nnzValues, d_weights.data()->nnzValues, d_coarse_y.data()->nnzValues, Nb, this->stream);
HipKernels<Scalar>::full_to_pressure_restriction(d_rs.data()->nnzValues, d_weights.data()->nnzValues, d_coarse_y.data()->nnzValues, Nb, this->stream);
amg_cycle_gpu(0, *(d_coarse_y.data()->nnzValues), *(d_coarse_x.data()->nnzValues));
HipKernels::add_coarse_pressure_correction(d_coarse_x.data()->nnzValues, &x, this->pressure_idx, Nb, this->stream);
HipKernels<Scalar>::add_coarse_pressure_correction(d_coarse_x.data()->nnzValues, &x, this->pressure_idx, Nb, this->stream);
}
template <class Scalar, unsigned int block_size>
void rocsparseCPR<Scalar, block_size>::
apply(Scalar& y, Scalar& x) {
apply(Scalar& y,
Scalar& x)
{
Dune::Timer t_bilu0;
bilu0->apply(y, x);

View File

@ -64,25 +64,69 @@ private:
std::unique_ptr<rocsparseSolverBackend<Scalar, 1> > coarse_solver; // coarse solver is scalar
// Initialize and allocate matrices and vectors
void init_rocm_buffers();//TODO: rename to rocm/c and update in code
void init_rocm_buffers();
// Copy matrices and vectors to GPU
void rocm_upload();//TODO: rename to rocm/c and update in code
void rocm_upload();
// apply pressure correction to vector
void apply_amg(const Scalar& y, Scalar& x);
// Apply the AMG preconditioner
void amg_cycle_gpu(const int level, Scalar &y, Scalar &x);
public:
rocsparseCPR(int verbosity);
bool initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix, std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix, rocsparse_int *d_Arows, rocsparse_int *d_Acols);
/// Initialize GPU and allocate memory
/// \param[in] matrix matrix A
/// \param[in] jacMatrix matrix for preconditioner
bool initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
rocsparse_int *d_Arows,
rocsparse_int *d_Acols);
/// Analysis, extract parallelism if specified
/// \param[in] mat matrix A
bool analyze_matrix(BlockedMatrix<Scalar> *mat);
/// Analysis, extract parallelism if specified
/// \param[in] mat matrix A
/// \param[in] jacMat matrix for preconditioner, analyze this as well
bool analyze_matrix(BlockedMatrix<Scalar> *mat,
BlockedMatrix<Scalar> *jacMat);
/// Create AMG preconditioner and perform ILU decomposition
/// \param[in] mat matrix A
bool create_preconditioner(BlockedMatrix<Scalar> *mat);
/// Create AMG preconditioner and perform ILU decomposition
/// \param[in] mat matrix A
/// \param[in] jacMat matrix for preconditioner, decompose this one if used
bool create_preconditioner(BlockedMatrix<Scalar> *mat,
BlockedMatrix<Scalar> *jacMat);
/// Apply preconditioner, x = prec(y)
/// applies blocked ilu0
/// also applies amg for pressure component
/// \param[in] y Input y vector
/// \param[out] x Output x vector
void apply(Scalar& y,
Scalar& x) override;
#if HAVE_OPENCL
// apply preconditioner, x = prec(y)
void apply(const cl::Buffer& y,
cl::Buffer& x) {}
#endif
/// Copy matrix A values to GPU
/// \param[in] mVals Input values
void copy_system_to_gpu(Scalar *b);
/// Reassign pointers, in case the addresses of the Dune variables have changed
/// Reassign pointers, in case the addresses of the Dune variables have changed --> TODO: check when/if we need this method
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] b input vector b, contains N values
// void update_system(Scalar *vals, Scalar *b);
@ -90,20 +134,6 @@ public:
/// Update linear system to GPU
/// \param[in] b input vector, contains N values
void update_system_on_gpu(Scalar *b);
bool analyze_matrix(BlockedMatrix<Scalar> *mat);
bool analyze_matrix(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat);
bool create_preconditioner(BlockedMatrix<Scalar> *mat);
bool create_preconditioner(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat);
#if HAVE_OPENCL
// apply preconditioner, x = prec(y)
void apply(const cl::Buffer& y, cl::Buffer& x) {}
#endif
// applies blocked ilu0
// also applies amg for pressure component
void apply(Scalar& y, Scalar& x);
};
} // namespace Opm

View File

@ -25,29 +25,19 @@
#include <opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
#include <opm/simulators/linalg/bda/Matrix.hpp>
#include <opm/simulators/linalg/bda/Misc.hpp>
#include <sstream>
#include <iostream>
// #include <opm/simulators/linalg/bda/opencl/opencl.hpp>
#define HIP_CHECK(STAT) \
do { \
const hipError_t stat = (STAT); \
if(stat != hipSuccess) \
{ \
std::ostringstream oss; \
oss << "rocsparseSolverBackend::hip "; \
oss << "error: " << hipGetErrorString(stat); \
OPM_THROW(std::logic_error, oss.str()); \
} \
} while(0)
namespace Opm::Accelerator {
template<class Scalar>
RocmMatrix<Scalar>::
RocmMatrix(int Nb_, int Mb_, int nnzbs_, unsigned int block_size_)
RocmMatrix(int Nb_,
int Mb_,
int nnzbs_,
unsigned int block_size_)
: Nb(Nb_),
Mb(Mb_),
nnzbs(nnzbs_),
@ -62,7 +52,11 @@ RocmMatrix(int Nb_, int Mb_, int nnzbs_, unsigned int block_size_)
template <class Scalar>
void RocmMatrix<Scalar>::
upload(Scalar *vals, int *cols, int *rows, hipStream_t stream) {
upload(Scalar *vals,
int *cols,
int *rows,
hipStream_t stream)
{
HIP_CHECK(hipMemcpyAsync(nnzValues, vals, sizeof(Scalar) * block_size * block_size * nnzbs, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(colIndices, cols, sizeof(int) * nnzbs, hipMemcpyHostToDevice, stream));
@ -72,7 +66,9 @@ upload(Scalar *vals, int *cols, int *rows, hipStream_t stream) {
template <class Scalar>
void RocmMatrix<Scalar>::
upload(Matrix<Scalar> *matrix, hipStream_t stream) {
upload(Matrix<Scalar> *matrix,
hipStream_t stream)
{
if (block_size != 1) {
OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to RocmMatrix with different block_size");
}
@ -82,7 +78,9 @@ upload(Matrix<Scalar> *matrix, hipStream_t stream) {
template <class Scalar>
void RocmMatrix<Scalar>::
upload(BlockedMatrix<Scalar> *matrix, hipStream_t stream) {
upload(BlockedMatrix<Scalar> *matrix,
hipStream_t stream)
{
if (matrix->block_size != block_size) {
OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to RocmMatrix with different block_size");
}
@ -99,7 +97,9 @@ RocmVector<Scalar>::RocmVector(int N)
template <class Scalar>
void RocmVector<Scalar>::
upload(Scalar *vals, hipStream_t stream) {
upload(Scalar *vals,
hipStream_t stream)
{
HIP_CHECK(hipMemcpyAsync(nnzValues, vals, sizeof(Scalar) * size, hipMemcpyHostToDevice, stream));
}

View File

@ -21,24 +21,29 @@
#define OPM_ROCMMATRIX_HEADER_INCLUDED
#include <hip/hip_runtime_api.h>
#include <hip/hip_version.h>
namespace Opm::Accelerator {
template<class Scalar> class Matrix;
template<class Scalar> class BlockedMatrix;
/// This struct resembles a csr matrix, only doubles are supported
/// The matrix data is stored in OpenCL Buffers
/// This struct resembles a csr matrix
template<class Scalar>
class RocmMatrix {
public:
RocmMatrix(int Nb_, int Mb_, int nnzbs_, unsigned int block_size_);
void upload(Scalar *vals, int *cols, int *rows, hipStream_t stream);
void upload(Matrix<Scalar> *matrix, hipStream_t stream);
void upload(BlockedMatrix<Scalar> *matrix, hipStream_t stream);
void upload(Scalar *vals,
int *cols,
int *rows,
hipStream_t stream);
void upload(Matrix<Scalar> *matrix,
hipStream_t stream);
void upload(BlockedMatrix<Scalar> *matrix,
hipStream_t stream);
Scalar* nnzValues;
int* colIndices;
@ -54,8 +59,11 @@ public:
RocmVector(int N);
void upload(Scalar *vals, hipStream_t stream);
void upload(Matrix<Scalar> *matrix, hipStream_t stream);
void upload(Scalar *vals,
hipStream_t stream);
void upload(Matrix<Scalar> *matrix,
hipStream_t stream);
Scalar* nnzValues;
int size;

View File

@ -23,18 +23,20 @@
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp>
// #include <opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp>
#include <opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp>
#include <opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp>
namespace Opm::Accelerator {
template <class Scalar, unsigned int block_size>
std::unique_ptr<rocsparsePreconditioner<Scalar,block_size> > rocsparsePreconditioner<Scalar,block_size>::
create(PreconditionerType type, int verbosity) {
create(PreconditionerType type,
int verbosity)
{
if (type == PreconditionerType::BILU0) {
return std::make_unique<Opm::Accelerator::rocsparseBILU0<Scalar, block_size> >(verbosity);
// } else if (type == PreconditionerType::CPR) {
// return std::make_unique<Opm::Accelerator::rocsparseCPR<block_size> >(verbosity);
} else if (type == PreconditionerType::CPR) {
return std::make_unique<Opm::Accelerator::rocsparseCPR<Scalar, block_size> >(verbosity);
} else {
OPM_THROW(std::logic_error, "Invalid PreconditionerType");
}
@ -42,14 +44,20 @@ create(PreconditionerType type, int verbosity) {
template <class Scalar, unsigned int block_size>
void rocsparsePreconditioner<Scalar, block_size>::
set_matrix_analysis(rocsparse_mat_descr descr_L, rocsparse_mat_descr descr_U) {
set_matrix_analysis(rocsparse_mat_descr descr_L,
rocsparse_mat_descr descr_U)
{
descr_L = descr_L;
descr_U = descr_U;
}
template <class Scalar, unsigned int block_size>
void rocsparsePreconditioner<Scalar, block_size>::
set_context(rocsparse_handle handle, rocsparse_direction dir, rocsparse_operation operation, hipStream_t stream) {
set_context(rocsparse_handle handle,
rocsparse_direction dir,
rocsparse_operation operation,
hipStream_t stream)
{
this->handle = handle;
this->dir = dir;
this->operation = operation;

View File

@ -52,7 +52,8 @@ public:
virtual ~rocsparsePreconditioner() = default;
static std::unique_ptr<rocsparsePreconditioner<Scalar, block_size>> create(PreconditionerType type, int verbosity);
static std::unique_ptr<rocsparsePreconditioner<Scalar, block_size>> create(PreconditionerType type,
int verbosity);
// apply preconditioner, x = prec(y)
virtual void apply(double& y, double& x) = 0;
@ -60,9 +61,14 @@ public:
// create/update preconditioner, probably used every linear solve
// the version with two params can be overloaded, if not, it will default to using the one param version
virtual bool create_preconditioner(BlockedMatrix<Scalar> *mat) = 0;
virtual bool create_preconditioner(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat) = 0;
virtual bool initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix, std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix, rocsparse_int *d_Arows, rocsparse_int *d_Acols) = 0;
virtual bool create_preconditioner(BlockedMatrix<Scalar> *mat,
BlockedMatrix<Scalar> *jacMat) = 0;
virtual bool initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
rocsparse_int *d_Arows,
rocsparse_int *d_Acols) = 0;
virtual void copy_system_to_gpu(double *b)=0;
@ -70,8 +76,14 @@ public:
/// \param[in] b input vector, contains N values
virtual void update_system_on_gpu(double *b)=0;
void set_matrix_analysis(rocsparse_mat_descr descr_L, rocsparse_mat_descr descr_U);
void set_context(rocsparse_handle handle, rocsparse_direction dir, rocsparse_operation operation, hipStream_t stream);
void set_matrix_analysis(rocsparse_mat_descr descr_L,
rocsparse_mat_descr descr_U);
void set_context(rocsparse_handle handle,
rocsparse_direction dir,
rocsparse_operation operation,
hipStream_t stream);
void setJacMat(BlockedMatrix<Scalar> jacMat);
};
} //namespace Opm

View File

@ -43,50 +43,13 @@
#include <opm/simulators/linalg/bda/Preconditioner.hpp>
#include <hip/hip_runtime_api.h>
#include <hip/hip_version.h>
#include <opm/simulators/linalg/bda/Misc.hpp>
#ifdef HIP_HAVE_CUDA_DEFINED
#define HAVE_CUDA HIP_HAVE_CUDA_DEFINED
#undef HIP_HAVE_CUDA_DEFINED
#endif
#define HIP_CHECK(STAT) \
do { \
const hipError_t stat = (STAT); \
if(stat != hipSuccess) \
{ \
std::ostringstream oss; \
oss << "rocsparseSolverBackend::hip "; \
oss << "error: " << hipGetErrorString(stat); \
OPM_THROW(std::logic_error, oss.str()); \
} \
} while(0)
#define ROCSPARSE_CHECK(STAT) \
do { \
const rocsparse_status stat = (STAT); \
if(stat != rocsparse_status_success) \
{ \
std::ostringstream oss; \
oss << "rocsparseSolverBackend::rocsparse "; \
oss << "error: " << stat; \
OPM_THROW(std::logic_error, oss.str()); \
} \
} while(0)
#define ROCBLAS_CHECK(STAT) \
do { \
const rocblas_status stat = (STAT); \
if(stat != rocblas_status_success) \
{ \
std::ostringstream oss; \
oss << "rocsparseSolverBackend::rocblas "; \
oss << "error: " << stat; \
OPM_THROW(std::logic_error, oss.str()); \
} \
} while(0)
#include <cstddef>
namespace Opm::Accelerator {
@ -462,7 +425,7 @@ copy_system_to_gpu(Scalar *b)
sizeof(Scalar) * nnz,
hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemsetAsync(d_x, 0, N * sizeof(Scalar), stream));
HIP_CHECK(hipMemcpyAsync(d_b, b, N * sizeof(Scalar) * N,
HIP_CHECK(hipMemcpyAsync(d_b, b, N * sizeof(Scalar),
hipMemcpyHostToDevice, stream));
prec->copy_system_to_gpu(d_Avals);

View File

@ -40,17 +40,9 @@
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/bda/MultisegmentWellContribution.hpp>
#include <opm/simulators/linalg/bda/Misc.hpp>
#include <hip/hip_runtime.h>
#define HIP_CHECK(stat) \
{ \
if(stat != hipSuccess) \
{ \
OPM_THROW(std::logic_error, "HIP error"); \
} \
}
namespace Opm
{