add rocsparseCPR preconditioner

This commit is contained in:
Razvan Nane
2024-06-04 17:13:13 +02:00
parent 74b30e6bba
commit 7a307fafa0
9 changed files with 1184 additions and 17 deletions

View File

@@ -266,10 +266,13 @@ if(USE_BDA_BRIDGE)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocalutionSolverBackend.cpp)
endif()
if(rocsparse_FOUND AND rocblas_FOUND)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseCPR.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseBILU0.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseWellContributions.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/hipKernels.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseMatrix.cpp)
endif()
if(CUDA_FOUND)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu)
@@ -673,11 +676,14 @@ if (USE_BDA_BRIDGE)
opm/simulators/linalg/bda/opencl/openclWellContributions.hpp
opm/simulators/linalg/bda/Matrix.hpp
opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
opm/simulators/linalg/bda/rocm/hipKernels.hpp
opm/simulators/linalg/bda/rocm/rocalutionSolverBackend.hpp
opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp
opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp
opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp
opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.hpp
opm/simulators/linalg/bda/rocm/rocsparseWellContributions.hpp
opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp
opm/simulators/linalg/bda/WellContributions.hpp
opm/simulators/linalg/ISTLSolverBda.hpp
)

View File

@@ -0,0 +1,466 @@
/*
Copyright 2024 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <cmath>
#include <sstream>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <dune/common/timer.hh>
#include <opm/simulators/linalg/bda/rocm/hipKernels.hpp>
#include <opm/simulators/linalg/bda/Misc.hpp>
#include <hip/hip_runtime.h>
#include <hip/hip_version.h>
#define HIP_CHECK(STAT) \
do { \
const hipError_t stat = (STAT); \
if(stat != hipSuccess) \
{ \
std::ostringstream oss; \
oss << "rocsparseSolverBackend::hip "; \
oss << "error: " << hipGetErrorString(stat); \
OPM_THROW(std::logic_error, oss.str()); \
} \
} while(0)
namespace Opm
{
using Opm::OpmLog;
using Dune::Timer;
// define static variables and kernels
int HipKernels::verbosity;
double* HipKernels::tmp;
bool HipKernels::initialized = false;
std::size_t HipKernels::preferred_workgroup_size_multiple = 0;
#ifdef __HIP__
/// HIP kernel to multiply vector with another vector and a scalar, element-wise
// add result to a third vector
// out[i] += alpha * in1[i] * in2[i], for i in [0, N)
__global__ void vmul_k(
const double alpha,
double const *in1,
double const *in2,
double *out,
const int N)
{
// NOTE(review): the stride is gridDim.x (the number of blocks), not
// gridDim.x * blockDim.x. The host wrapper launches ceil(N/64)*64 blocks of
// 64 threads, so gridDim.x >= N and each element is visited at most once.
// With a conventional launch configuration this stride would make threads
// revisit (and double-accumulate into) the same elements — confirm before
// changing the launch parameters.
unsigned int NUM_THREADS = gridDim.x;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
while(idx < N){
out[idx] += alpha * in1[idx] * in2[idx];
idx += NUM_THREADS;
}
}
/// HIP kernel to transform blocked vector to scalar vector using pressure-weights
// every workitem handles one blockrow
// coarse_y[row] = sum over the block of fine_y * weights for that blockrow
__global__ void full_to_pressure_restriction_k(
const double *fine_y,
const double *weights,
double *coarse_y,
const unsigned int Nb)
{
// NOTE(review): stride is gridDim.x; the host wrapper oversizes the grid so
// that gridDim.x >= Nb (see vmul_k).
const unsigned int NUM_THREADS = gridDim.x;
// NOTE(review): block_size is hard-coded to 3 here, while the host-side API
// passes block_size around elsewhere — this kernel silently assumes
// 3-component (e.g. blackoil) systems; confirm for other block sizes.
const unsigned int block_size = 3;
unsigned int target_block_row = blockDim.x * blockIdx.x + threadIdx.x;
while(target_block_row < Nb){
double sum = 0.0;
unsigned int idx = block_size * target_block_row;
for (unsigned int i = 0; i < block_size; ++i) {
sum += fine_y[idx + i] * weights[idx + i];
}
coarse_y[target_block_row] = sum;
target_block_row += NUM_THREADS;
}
}
/// HIP kernel to add the coarse pressure solution back to the finer, complete solution
// every workitem handles one blockrow
// fine_x[row * 3 + pressure_idx] += coarse_x[row]
__global__ void add_coarse_pressure_correction_k(
const double *coarse_x,
double *fine_x,
const unsigned int pressure_idx,
const unsigned int Nb)
{
// NOTE(review): stride is gridDim.x; the host wrapper oversizes the grid so
// that gridDim.x >= Nb (see vmul_k).
const unsigned int NUM_THREADS = gridDim.x;
// NOTE(review): block_size hard-coded to 3, same caveat as
// full_to_pressure_restriction_k — confirm for other block sizes.
const unsigned int block_size = 3;
unsigned int target_block_row = blockDim.x * blockIdx.x + threadIdx.x;
while(target_block_row < Nb){
fine_x[target_block_row * block_size + pressure_idx] += coarse_x[target_block_row];
target_block_row += NUM_THREADS;
}
}
/// HIP kernel to prolongate a coarse vector during the AMG cycle:
/// out[row] += in[cols[row]], for every row in [0, N).
/// Every workitem handles one row; the loop strides by gridDim.x because the
/// host wrapper launches an oversized grid (gridDim.x >= N).
__global__ void prolongate_vector_k(
    const double *in,
    double *out,
    const int *cols,
    const unsigned int N)
{
    const unsigned int stride = gridDim.x;
    for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x; row < N; row += stride) {
        out[row] += in[cols[row]];
    }
}
// res = rhs - mat * x
// algorithm based on:
// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-Memory Parallel Architectures,
// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
// Blocked-CSR residual: each warp processes one block row at a time; within a
// warp, groups of bs*bs lanes each multiply one bs-by-bs block, then partial
// sums are combined through dynamic shared memory.
__global__ void residual_blocked_k(
const double *vals,
const int *cols,
const int *rows,
const int Nb,
const double *x,
const double *rhs,
double *out,
const unsigned int block_size
)
{
// dynamic shared memory, sized blockDim.x * sizeof(double) by the launcher
extern __shared__ double tmp[];
const unsigned int warpsize = warpSize;
const unsigned int bsize = blockDim.x;
const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x;
const unsigned int idx_b = gid / bsize;
const unsigned int idx_t = threadIdx.x;
// NOTE(review): idx == gid by construction (idx_b * bsize + idx_t)
unsigned int idx = idx_b * bsize + idx_t;
const unsigned int bs = block_size;
// only the first (warpsize/bs/bs)*bs*bs lanes of a warp do useful work
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
// NOTE(review): NUM_THREADS is gridDim.x (number of blocks); the host
// wrapper launches ceil(Nb/64)*64 blocks, so num_warps_in_grid covers all
// block rows. tmp[] is indexed by lane, which assumes blockDim.x ==
// warpsize (one warp per workgroup) — confirm if blockDim ever changes.
const unsigned int NUM_THREADS = gridDim.x;
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
unsigned int target_block_row = idx / warpsize;
const unsigned int lane = idx_t % warpsize;
const unsigned int c = (lane / bs) % bs;   // column inside the bs*bs block
const unsigned int r = lane % bs;          // row inside the bs*bs block
// for 3x3 blocks:
// num_active_threads: 27 (CUDA) vs 63 (ROCM)
// num_blocks_per_warp: 3 (CUDA) vs 7 (ROCM)
int offsetTarget = warpsize == 64 ? 48 : 32;
while(target_block_row < Nb){
unsigned int first_block = rows[target_block_row];
unsigned int last_block = rows[target_block_row+1];
unsigned int block = first_block + lane / (bs*bs);
double local_out = 0.0;
if(lane < num_active_threads){
for(; block < last_block; block += num_blocks_per_warp){
double x_elem = x[cols[block]*bs + c];
double A_elem = vals[block*bs*bs + c + r*bs];
local_out += x_elem * A_elem;
}
}
// do reduction in shared mem
// NOTE(review): the reduction starts at offset 3 (stride bs for 3x3
// blocks) and doubles up to offsetTarget — confirm correctness for
// block sizes other than 3.
tmp[lane] = local_out;
for(unsigned int offset = 3; offset <= offsetTarget; offset <<= 1)
{
if (lane + offset < warpsize)
{
tmp[lane] += tmp[lane + offset];
}
__syncthreads();
}
// lanes 0..bs-1 now hold the bs row sums of this block row
if(lane < bs){
unsigned int row = target_block_row*bs + lane;
out[row] = rhs[row] - tmp[lane];
}
target_block_row += num_warps_in_grid;
}
}
// KERNEL residual_k
// res = rhs - mat * x
// algorithm based on:
// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-Memory Parallel Architectures,
// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
// Scalar (block size 1) CSR residual: one workgroup per row at a time; the
// workgroup's threads partition the row's nonzeros with stride blockDim.x and
// combine their partial sums via a tree reduction in dynamic shared memory
// (blockDim.x * sizeof(double) bytes, supplied by the launcher).
__global__ void residual_k(
    const double *vals,
    const int *cols,
    const int *rows,
    const int N,
    const double *x,
    const double *rhs,
    double *out
    )
{
    extern __shared__ double tmp[];
    const unsigned int bsize = blockDim.x;
    const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int idx_b = gid / bsize;   // workgroup id == first row handled
    const unsigned int idx_t = threadIdx.x;
    const unsigned int num_workgroups = gridDim.x;
    int row = idx_b;
    while (row < N) {
        int rowStart = rows[row];
        int rowEnd = rows[row+1];
        // each thread accumulates a strided subset of this row's entries
        double local_sum = 0.0;
        for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
            int col = cols[j];
            local_sum += vals[j] * x[col];
        }
        tmp[idx_t] = local_sum;
        __syncthreads();
        // tree reduction over the workgroup; assumes bsize is a power of two
        int offset = bsize / 2;
        while(offset > 0) {
            if (idx_t < offset) {
                tmp[idx_t] += tmp[idx_t + offset];
            }
            __syncthreads();
            offset = offset / 2;
        }
        if (idx_t == 0) {
            out[row] = rhs[row] - tmp[idx_t];
        }
        row += num_workgroups;
    }
}
// KERNEL spmv_k
// out = mat * x, scalar (block size 1) CSR sparse matrix-vector product.
// Same workgroup-per-row scheme as residual_k: threads partition a row's
// nonzeros with stride blockDim.x, then tree-reduce in dynamic shared memory
// (blockDim.x * sizeof(double) bytes, supplied by the launcher).
__global__ void spmv_k(
    const double *vals,
    const int *cols,
    const int *rows,
    const int N,
    const double *x,
    double *out
    )
{
    extern __shared__ double tmp[];
    const unsigned int bsize = blockDim.x;
    const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int idx_b = gid / bsize;   // workgroup id == first row handled
    const unsigned int idx_t = threadIdx.x;
    const unsigned int num_workgroups = gridDim.x;
    int row = idx_b;
    while (row < N) {
        int rowStart = rows[row];
        int rowEnd = rows[row+1];
        // each thread accumulates a strided subset of this row's entries
        double local_sum = 0.0;
        for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
            int col = cols[j];
            local_sum += vals[j] * x[col];
        }
        tmp[idx_t] = local_sum;
        __syncthreads();
        // tree reduction over the workgroup; assumes bsize is a power of two
        int offset = bsize / 2;
        while(offset > 0) {
            if (idx_t < offset) {
                tmp[idx_t] += tmp[idx_t + offset];
            }
            __syncthreads();
            offset = offset / 2;
        }
        if (idx_t == 0) {
            out[row] = tmp[idx_t];
        }
        row += num_workgroups;
    }
}
#endif
/// Record the verbosity level once; subsequent calls are ignored with a
/// debug message so the first caller's setting wins.
void HipKernels::init(int verbosity_)
{
    if (!initialized) {
        verbosity = verbosity_;
        initialized = true;
        return;
    }
    OpmLog::debug("Warning HipKernels is already initialized");
}
// out[i] += alpha * in1[i] * in2[i] on the given stream; used by the damped
// Jacobi smoother of the AMG cycle.
void HipKernels::vmul(const double alpha, double* in1, double* in2, double* out, int N, hipStream_t stream)
{
Timer t_vmul;
#ifdef __HIP__
unsigned blockDim = 64;
unsigned blocks = Accelerator::ceilDivision(N, blockDim);
// NOTE(review): gridDim is the number of *blocks* launched, i.e.
// blocks * blockDim workgroups of blockDim threads. The kernel's loop
// stride is gridDim.x, so kernel and launch configuration must be changed
// together — confirm before "fixing" this oversized launch.
unsigned gridDim = blocks * blockDim;
// dim3(N) will create a vector {N, 1, 1}
vmul_k<<<dim3(gridDim), dim3(blockDim), 0, stream>>>(alpha, in1, in2, out, N);
// block until the kernel has finished so the timing below is meaningful
HIP_CHECK(hipStreamSynchronize(stream));
#else
OPM_THROW(std::logic_error, "Error vmul for rocsparse only supported when compiling with hipcc");
#endif
if (verbosity >= 4) {
std::ostringstream oss;
oss << std::scientific << "HipKernels vmul() time: " << t_vmul.stop() << " s";
OpmLog::info(oss.str());
}
}
// Restrict the blocked fine-level vector to a scalar pressure vector:
// coarse_y[row] = sum_i fine_y[3*row+i] * weights[3*row+i] (kernel assumes
// block size 3).
void HipKernels::full_to_pressure_restriction(const double* fine_y, double* weights, double* coarse_y, int Nb, hipStream_t stream)
{
Timer t;
#ifdef __HIP__
unsigned blockDim = 64;
unsigned blocks = Accelerator::ceilDivision(Nb, blockDim);
// oversized launch on purpose: the kernel strides by gridDim.x (see vmul)
unsigned gridDim = blocks * blockDim;
// dim3(N) will create a vector {N, 1, 1}
full_to_pressure_restriction_k<<<dim3(gridDim), dim3(blockDim), 0, stream>>>(fine_y, weights, coarse_y, Nb);
// block until the kernel has finished so the timing below is meaningful
HIP_CHECK(hipStreamSynchronize(stream));
#else
OPM_THROW(std::logic_error, "Error full_to_pressure_restriction for rocsparse only supported when compiling with hipcc");
#endif
if (verbosity >= 4) {
std::ostringstream oss;
oss << std::scientific << "HipKernels full_to_pressure_restriction() time: " << t.stop() << " s";
OpmLog::info(oss.str());
}
}
// Add the coarse pressure solution back into the blocked fine-level vector:
// fine_x[3*row + pressure_idx] += coarse_x[row] (kernel assumes block size 3).
void HipKernels::add_coarse_pressure_correction(double* coarse_x, double* fine_x, int pressure_idx, int Nb, hipStream_t stream)
{
Timer t;
#ifdef __HIP__
unsigned blockDim = 64;
unsigned blocks = Accelerator::ceilDivision(Nb, blockDim);
// oversized launch on purpose: the kernel strides by gridDim.x (see vmul)
unsigned gridDim = blocks * blockDim;
// dim3(N) will create a vector {N, 1, 1}
add_coarse_pressure_correction_k<<<dim3(gridDim), dim3(blockDim), 0, stream>>>(coarse_x, fine_x, pressure_idx, Nb);
// block until the kernel has finished so the timing below is meaningful
HIP_CHECK(hipStreamSynchronize(stream));
#else
OPM_THROW(std::logic_error, "Error add_coarse_pressure_correction for rocsparse only supported when compiling with hipcc");
#endif
if (verbosity >= 4) {
std::ostringstream oss;
oss << std::scientific << "HipKernels add_coarse_pressure_correction() time: " << t.stop() << " s";
OpmLog::info(oss.str());
}
}
// out[row] += in[cols[row]] on the given stream; prolongates a coarse AMG
// correction onto the finer level.
void HipKernels::prolongate_vector(const double* in, double* out, const int* cols, int N, hipStream_t stream)
{
    Timer t;
#ifdef __HIP__
    unsigned blockDim = 64;
    unsigned blocks = Accelerator::ceilDivision(N, blockDim);
    // oversized launch on purpose: the kernel strides by gridDim.x (see vmul)
    unsigned gridDim = blocks * blockDim;
    // prolongate_vector_k declares no extern __shared__ storage, so no dynamic
    // shared memory is requested (the previous allocation was unused and only
    // reduced occupancy).
    // dim3(N) will create a vector {N, 1, 1}
    prolongate_vector_k<<<dim3(gridDim), dim3(blockDim), 0, stream>>>(in, out, cols, N);
    // block until the kernel has finished so the timing below is meaningful
    HIP_CHECK(hipStreamSynchronize(stream));
#else
    OPM_THROW(std::logic_error, "Error prolongate_vector for rocsparse only supported when compiling with hipcc");
#endif
    if (verbosity >= 4) {
        std::ostringstream oss;
        oss << std::scientific << "HipKernels prolongate_vector() time: " << t.stop() << " s";
        OpmLog::info(oss.str());
    }
}
// out = rhs - mat * x on the given stream. Dispatches to the blocked-CSR
// kernel for block_size > 1, otherwise to the scalar CSR kernel. Both kernels
// need blockDim * sizeof(double) bytes of dynamic shared memory.
void HipKernels::residual(double* vals, int* cols, int* rows, double* x, const double* rhs, double* out, int Nb, unsigned int block_size, hipStream_t stream)
{
    Timer t_residual;
#ifdef __HIP__
    unsigned blockDim = 64;
    const unsigned int num_work_groups = Accelerator::ceilDivision(Nb, blockDim);
    // oversized launch on purpose: the kernels stride by gridDim.x (see vmul)
    unsigned gridDim = num_work_groups * blockDim;
    unsigned shared_mem_size = blockDim * sizeof(double);
    if (block_size > 1) {
        // dim3(N) will create a vector {N, 1, 1}
        residual_blocked_k<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(vals, cols, rows, Nb, x, rhs, out, block_size);
    } else {
        residual_k<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(vals, cols, rows, Nb, x, rhs, out);
    }
    // single synchronization point; also makes the timing below valid
    // (previously the verbosity branch synchronized a second, redundant time)
    HIP_CHECK(hipStreamSynchronize(stream));
#else
    OPM_THROW(std::logic_error, "Error residual for rocsparse only supported when compiling with hipcc");
#endif
    if (verbosity >= 4) {
        std::ostringstream oss;
        oss << std::scientific << "HipKernels residual() time: " << t_residual.stop() << " s";
        OpmLog::info(oss.str());
    }
}
// y = mat * x on the given stream, scalar CSR only.
// NOTE: block_size is intentionally unused — this kernel is only called for
// block size 1; blocked spmv goes through rocsparse instead.
void HipKernels::spmv([[maybe_unused]] double* vals, int* cols, int* rows, double* x, double* y, int Nb, unsigned int block_size, hipStream_t stream)
{
    Timer t_spmv;
#ifdef __HIP__
    unsigned blockDim = 64;
    const unsigned int num_work_groups = Accelerator::ceilDivision(Nb, blockDim);
    // oversized launch on purpose: the kernel strides by gridDim.x (see vmul)
    unsigned gridDim = num_work_groups * blockDim;
    unsigned shared_mem_size = blockDim * sizeof(double);
    spmv_k<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(vals, cols, rows, Nb, x, y);
    // block until the kernel has finished so the timing below is meaningful
    HIP_CHECK(hipStreamSynchronize(stream));
#else
    OPM_THROW(std::logic_error, "Error spmv for rocsparse only supported when compiling with hipcc");
#endif
    if (verbosity >= 4) {
        std::ostringstream oss;
        // label fixed: this is the scalar spmv, not spmv_blocked
        oss << std::scientific << "HipKernels spmv() time: " << t_spmv.stop() << " s";
        OpmLog::info(oss.str());
    }
}
} // namespace Opm

View File

@@ -0,0 +1,57 @@
/*
Copyright 2024 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef HIPKERNELS_HPP
#define HIPKERNELS_HPP
#include <string>
#include <memory>
#include <cstddef>
#include <hip/hip_runtime_api.h>
#include <hip/hip_version.h>
// #include <opm/simulators/linalg/bda/opencl/opencl.hpp>
namespace Opm
{
/// Static collection of HIP helper kernels used by the rocsparse-based CPR
/// preconditioner. All launchers enqueue on the caller-supplied hipStream_t
/// and synchronize the stream before returning. Call init() once before use.
class HipKernels
{
private:
static int verbosity;
// NOTE(review): declared as a tmp CPU buffer for dot()/norm(), but no such
// methods exist in this class yet — possibly copied from the OpenCL
// variant; confirm before relying on it.
static double* tmp; // used as tmp CPU buffer for dot() and norm()
static bool initialized;
// carried over from the OpenCL implementation
// (CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE); unused on the HIP side
static std::size_t preferred_workgroup_size_multiple; // stores CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
HipKernels(){}; // disable instantiation
public:
/// Store the verbosity level; subsequent calls are ignored.
static void init(int verbosity);
/// coarse_y[row] = sum_i fine_y[3*row+i] * weights[3*row+i] (block size 3)
static void full_to_pressure_restriction(const double* fine_y, double* weights, double* coarse_y, int Nb, hipStream_t stream);
/// fine_x[3*row + pressure_idx] += coarse_x[row] (block size 3)
static void add_coarse_pressure_correction(double* coarse_x, double* fine_x, int pressure_idx, int Nb, hipStream_t stream);
/// out[i] += alpha * in1[i] * in2[i]
static void vmul(const double alpha, double* in1, double* in2, double* out, int N, hipStream_t stream);
/// out[row] += in[cols[row]]
static void prolongate_vector(const double* in, double* out, const int* cols, int N, hipStream_t stream);
/// out = rhs - mat * x (CSR; blocked kernel when block_size > 1)
static void residual(double* vals, int* cols, int* rows, double* x, const double* rhs, double* out, int Nb, unsigned int block_size, hipStream_t stream);
/// y = mat * x (scalar CSR only; block_size is unused)
static void spmv(double* vals, int* cols, int* rows, double* x, double* y, int Nb, unsigned int block_size, hipStream_t stream);
};
} // namespace Opm
#endif

View File

@@ -301,17 +301,4 @@ apply(double& y, double& x) {
template class rocsparseBILU0<T,6>;
INSTANCE_TYPE(double)
// #define INSTANTIATE_BDA_FUNCTIONS(n) \
// template class rocsparseBILU0<n>;
//
// INSTANTIATE_BDA_FUNCTIONS(1);
// INSTANTIATE_BDA_FUNCTIONS(2);
// INSTANTIATE_BDA_FUNCTIONS(3);
// INSTANTIATE_BDA_FUNCTIONS(4);
// INSTANTIATE_BDA_FUNCTIONS(5);
// INSTANTIATE_BDA_FUNCTIONS(6);
//
// #undef INSTANTIATE_BDA_FUNCTIONS
//
// } // namespace Accelerator
} // namespace Opm

View File

@@ -0,0 +1,361 @@
/*
Copyright 2021 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/TimingMacros.hpp>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <dune/common/timer.hh>
#include <dune/common/shared_ptr.hh>
#include <opm/simulators/linalg/PreconditionerFactory.hpp>
#include <opm/simulators/linalg/PropertyTree.hpp>
#include <opm/simulators/linalg/bda/BdaBridge.hpp>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
#include <opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp>
#include <opm/simulators/linalg/bda/rocm/hipKernels.hpp>
#include <opm/simulators/linalg/bda/Misc.hpp>
#include <hip/hip_runtime.h>
#include <hip/hip_version.h>
#define HIP_CHECK(STAT) \
do { \
const hipError_t stat = (STAT); \
if(stat != hipSuccess) \
{ \
std::ostringstream oss; \
oss << "rocsparseSolverBackend::hip "; \
oss << "error: " << hipGetErrorString(stat); \
OPM_THROW(std::logic_error, oss.str()); \
} \
} while(0)
#define ROCSPARSE_CHECK(STAT) \
do { \
const rocsparse_status stat = (STAT); \
if(stat != rocsparse_status_success) \
{ \
std::ostringstream oss; \
oss << "rocsparseSolverBackend::rocsparse "; \
oss << "error: " << stat; \
OPM_THROW(std::logic_error, oss.str()); \
} \
} while(0)
#define ROCBLAS_CHECK(STAT) \
do { \
const rocblas_status stat = (STAT); \
if(stat != rocblas_status_success) \
{ \
std::ostringstream oss; \
oss << "rocsparseSolverBackend::rocblas "; \
oss << "error: " << stat; \
OPM_THROW(std::logic_error, oss.str()); \
} \
} while(0)
namespace Opm::Accelerator {
using Opm::OpmLog;
using Dune::Timer;
/// Construct the CPR preconditioner; the fine-level smoother is a blocked
/// ILU0 preconditioner created with the same verbosity level.
template <class Scalar, unsigned int block_size>
rocsparseCPR<Scalar, block_size>::rocsparseCPR(int verbosity_)
    : rocsparsePreconditioner<Scalar, block_size>(verbosity_)
{
    bilu0 = std::make_unique<rocsparseBILU0<Scalar, block_size>>(verbosity_);
}
// One-time setup: cache the system dimensions, hand the rocsparse context to
// the ILU0 smoother and forward initialization to it. Always returns true
// (failures surface as exceptions from the callees).
template <class Scalar, unsigned int block_size>
bool rocsparseCPR<Scalar, block_size>::
initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix, std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix, rocsparse_int *d_Arows, rocsparse_int *d_Acols) {
this->Nb = matrix->Nb;
this->nnzb = matrix->nnzbs;
this->N = Nb * block_size;
this->nnz = nnzb * block_size * block_size;
bilu0->set_context(this->handle, this->dir, this->operation, this->stream);
// NOTE(review): passes the member this->jacMat, not the jacMatrix argument
// received here — confirm both refer to the same matrix at this point.
bilu0->setJacMat(*this->jacMat.get());
bilu0->initialize(matrix,jacMatrix,d_Arows,d_Acols);
return true;
}
/// Forward the initial upload of the linear system to the ILU0 smoother.
template <class Scalar, unsigned int block_size>
rocsparseCPR<Scalar, block_size>::copy_system_to_gpu(Scalar* b)
{
    this->bilu0->copy_system_to_gpu(b);
}
/// Forward the per-iteration refresh of the matrix nonzeros to the ILU0
/// smoother.
template <class Scalar, unsigned int block_size>
rocsparseCPR<Scalar, block_size>::update_system_on_gpu(Scalar* vals)
{
    this->bilu0->update_system_on_gpu(vals);
}
/// Cache the system dimensions, run the smoother's sparsity analysis and
/// remember the matrix pointer for later CPR setup.
template <class Scalar, unsigned int block_size>
rocsparseCPR<Scalar, block_size>::analyze_matrix(BlockedMatrix<Scalar>* mat_)
{
    this->Nb = mat_->Nb;
    this->nnzb = mat_->nnzbs;
    this->N = Nb * block_size;
    this->nnz = nnzb * block_size * block_size;

    const bool success = bilu0->analyze_matrix(mat_);
    this->mat = mat_;
    return success;
}
/// Overload taking an additional Jacobian matrix; otherwise identical to the
/// single-argument analyze_matrix().
template <class Scalar, unsigned int block_size>
rocsparseCPR<Scalar, block_size>::analyze_matrix(BlockedMatrix<Scalar>* mat_, BlockedMatrix<Scalar>* jacMat)
{
    this->Nb = mat_->Nb;
    this->nnzb = mat_->nnzbs;
    this->N = Nb * block_size;
    this->nnz = nnzb * block_size * block_size;

    const bool success = bilu0->analyze_matrix(mat_, jacMat);
    this->mat = mat_;
    return success;
}
// Rebuild both CPR stages for the current matrix values: the ILU0 smoother
// (using the Jacobian variant) and the host-side AMG hierarchy, then upload
// the hierarchy to the GPU. Buffers are allocated only on the first call.
template <class Scalar, unsigned int block_size>
bool rocsparseCPR<Scalar, block_size>::create_preconditioner(BlockedMatrix<Scalar> *mat_, BlockedMatrix<Scalar> *jacMat) {
// NOTE(review): the timers below are not preceded/followed by a stream
// synchronize; if the callees enqueue asynchronous work the reported times
// may only cover the host-side part — confirm.
Dune::Timer t_bilu0;
bool result = bilu0->create_preconditioner(mat_, jacMat);
if (verbosity >= 3) {
std::ostringstream out;
out << "rocsparseCPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
OpmLog::info(out.str());
}
Dune::Timer t_amg;
// builds the AMG hierarchy (Amatrices/Rmatrices/weights) on the host
this->create_preconditioner_amg(this->mat);
if (verbosity >= 3) {
std::ostringstream out;
out << "rocsparseCPR create_preconditioner_amg(): " << t_amg.stop() << " s";
OpmLog::info(out.str());
}
// allocate GPU-side hierarchy buffers exactly once, on first use
auto init_func = std::bind(&rocsparseCPR::init_rocm_buffers, this);
std::call_once(rocm_buffers_allocated, init_func);
// upload matrices and vectors to GPU
rocm_upload();
return result;
}
// Rebuild both CPR stages for the current matrix values (no separate Jacobian
// matrix): the ILU0 smoother and the host-side AMG hierarchy, then upload the
// hierarchy to the GPU. Buffers are allocated only on the first call.
template <class Scalar, unsigned int block_size>
bool rocsparseCPR<Scalar, block_size>::
create_preconditioner(BlockedMatrix<Scalar> *mat_) {
// NOTE(review): the timers below are not preceded/followed by a stream
// synchronize; if the callees enqueue asynchronous work the reported times
// may only cover the host-side part — confirm.
Dune::Timer t_bilu0;
bool result = bilu0->create_preconditioner(mat_);
if (verbosity >= 3) {
std::ostringstream out;
out << "rocsparseCPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
OpmLog::info(out.str());
}
Dune::Timer t_amg;
// builds the AMG hierarchy (Amatrices/Rmatrices/weights) on the host
this->create_preconditioner_amg(this->mat);
if (verbosity >= 3) {
std::ostringstream out;
out << "rocsparseCPR create_preconditioner_amg(): " << t_amg.stop() << " s";
OpmLog::info(out.str());
}
// allocate GPU-side hierarchy buffers exactly once, on first use
auto init_func = std::bind(&rocsparseCPR::init_rocm_buffers, this);
std::call_once(rocm_buffers_allocated, init_func);
// upload matrices and vectors to GPU
rocm_upload();
return result;
}
// Allocate the GPU-side mirrors of the host AMG hierarchy. Called exactly
// once (guarded by rocm_buffers_allocated) after the first hierarchy build,
// so the buffer sizes are fixed to the first hierarchy's dimensions.
template <class Scalar, unsigned int block_size>
void rocsparseCPR<Scalar, block_size>::
init_rocm_buffers() {
d_Amatrices.reserve(this->num_levels);
d_Rmatrices.reserve(this->num_levels - 1);
d_invDiags.reserve(this->num_levels - 1);
// one device matrix per hierarchy level
for (Matrix<Scalar>& m : this->Amatrices) {
d_Amatrices.emplace_back(m.N, m.M, m.nnzs, 1);//TOCHECK: this memory might not be contigous!!
}
// restriction matrices plus the per-level work vectors of the V-cycle
for (Matrix<Scalar>& m : this->Rmatrices) {
d_Rmatrices.emplace_back(m.N, m.M, m.nnzs, 1);
d_f.emplace_back(m.N);
d_u.emplace_back(m.N);
d_PcolIndices.emplace_back(m.M);
d_invDiags.emplace_back(m.M);
d_t.emplace_back(m.M);
}
// fine-level quasi-IMPES weights and full residual (N scalar entries)
d_weights.emplace_back(this->N);
d_rs.emplace_back(this->N);
// blocked fine-level system matrix
d_mat = std::make_unique<RocmMatrix<Scalar>>(this->Nb, this->Nb, this->nnzb, block_size);
// scalar pressure-system vectors (one entry per block row)
d_coarse_y.emplace_back(this->Nb);
d_coarse_x.emplace_back(this->Nb);
}
// Copy the freshly built host hierarchy (fine matrix, weights, per-level A/R
// matrices, inverse diagonals and prolongation columns) to the GPU. All
// copies are asynchronous on this->stream.
template <class Scalar, unsigned int block_size>
void rocsparseCPR<Scalar, block_size>::
rocm_upload() {
d_mat->upload(this->mat, this->stream);
HIP_CHECK(hipMemcpyAsync(d_weights.data()->nnzValues, this->weights.data(), sizeof(Scalar) * this->N, hipMemcpyHostToDevice, this->stream));
// NOTE(review): the loop bound is Rmatrices.size() (= num_levels - 1), so
// the coarsest-level A matrix is never uploaded — presumably intentional
// because the coarsest level is solved on the CPU via umfpack; confirm.
for (unsigned int i = 0; i < this->Rmatrices.size(); ++i) {
d_Amatrices[i].upload(&this->Amatrices[i], this->stream);
HIP_CHECK(hipMemcpyAsync(d_invDiags[i].nnzValues, this->invDiags[i].data(), sizeof(Scalar) * this->Amatrices[i].N, hipMemcpyHostToDevice, this->stream));
HIP_CHECK(hipMemcpyAsync(d_PcolIndices[i].nnzValues, this->PcolIndices[i].data(), sizeof(int) * this->Amatrices[i].N, hipMemcpyHostToDevice, this->stream));
}
for (unsigned int i = 0; i < this->Rmatrices.size(); ++i) {
d_Rmatrices[i].upload(&this->Rmatrices[i], this->stream);
}
}
// One recursive AMG V-cycle step on the GPU: damped-Jacobi pre-smoothing,
// restriction of the residual to the next level, recursion, prolongation of
// the coarse correction, damped-Jacobi post-smoothing. The coarsest level is
// solved on the CPU with umfpack.
// \param[in]     level hierarchy level (0 = finest pressure system)
// \param[in]     y     device pointer to the right-hand side on this level
// \param[in,out] x     device pointer to the solution on this level
template <class Scalar, unsigned int block_size>
void rocsparseCPR<Scalar, block_size>::
amg_cycle_gpu(const int level, Scalar &y, Scalar &x)
{
    RocmMatrix<Scalar> *A = &d_Amatrices[level];
    RocmMatrix<Scalar> *R = &d_Rmatrices[level];
    int Ncur = A->Nb;

    if (level == this->num_levels - 1) {
        // solve coarsest level on the CPU using umfpack
        std::vector<Scalar> h_y(Ncur), h_x(Ncur, 0);
        HIP_CHECK(hipMemcpyAsync(h_y.data(), &y, sizeof(Scalar) * Ncur, hipMemcpyDeviceToHost, this->stream));
        // the copy above is asynchronous: wait until h_y is actually filled
        // before the host solver reads it
        HIP_CHECK(hipStreamSynchronize(this->stream));
        this->umfpack.apply(h_x.data(), h_y.data());
        HIP_CHECK(hipMemcpyAsync(&x, h_x.data(), sizeof(Scalar) * Ncur, hipMemcpyHostToDevice, this->stream));
        // h_x is a local buffer: make sure the upload has completed before it
        // is destroyed on return
        HIP_CHECK(hipStreamSynchronize(this->stream));
        return;
    }

    int Nnext = d_Amatrices[level+1].Nb;
    RocmVector<Scalar>& t = d_t[level];
    RocmVector<Scalar>& f = d_f[level];
    RocmVector<Scalar>& u = d_u[level]; // u was 0-initialized earlier

    // presmooth with damped Jacobi: x += damping * invDiag * (y - A x)
    Scalar jacobi_damping = 0.65; // default value in amgcl: 0.72
    for (unsigned i = 0; i < this->num_pre_smooth_steps; ++i){
        HipKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
        HipKernels::vmul(jacobi_damping, d_invDiags[level].nnzValues, t.nnzValues, &x, Ncur, this->stream);
    }

    // move the residual to the coarser level: f = R * (y - A x)
    HipKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
    // TODO: understand why the rocsparse spmv library function does not work here.
    // If this path is re-enabled, a rocsparse_mat_info/rocsparse_mat_descr
    // must be created (and destroyed!) for it; the previous code created them
    // unconditionally and never destroyed them, leaking on every cycle.
    // ROCSPARSE_CHECK(rocsparse_dbsrmv(this->handle, this->dir, this->operation,
    //                                  R->Nb, R->Mb, R->nnzbs, &one, descr_R,
    //                                  R->nnzValues, R->rowPointers, R->colIndices, 1,
    //                                  t.nnzValues, &zero, f.nnzValues));
    HipKernels::spmv(R->nnzValues, R->colIndices, R->rowPointers, t.nnzValues, f.nnzValues, Nnext, 1, this->stream);

    amg_cycle_gpu(level + 1, *f.nnzValues, *u.nnzValues);

    // prolongate the coarse correction: x += P * u
    HipKernels::prolongate_vector(u.nnzValues, &x, d_PcolIndices[level].nnzValues, Ncur, this->stream);

    // postsmooth with damped Jacobi
    for (unsigned i = 0; i < this->num_post_smooth_steps; ++i){
        HipKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
        HipKernels::vmul(jacobi_damping, d_invDiags[level].nnzValues, t.nnzValues, &x, Ncur, this->stream);
    }
}
// x = prec(y)
// Apply the AMG pressure correction on top of the current solution x:
// restrict the full residual to a scalar pressure system, run one V-cycle,
// and add the resulting pressure correction back into x.
template <class Scalar, unsigned int block_size>
void rocsparseCPR<Scalar, block_size>::
apply_amg(const Scalar& y, Scalar& x) {
// zero the coarse solution before the V-cycle
HIP_CHECK(hipMemsetAsync(d_coarse_x.data()->nnzValues, 0, sizeof(Scalar) * this->Nb, this->stream));
// NOTE(review): this fills each level's u vector from
// Rmatrices[i].nnzValues, i.e. with restriction-matrix values, while
// amg_cycle_gpu's comment says "u was 0-initialized earlier" — these two
// statements contradict each other; confirm what d_u should contain here.
for (unsigned int i = 0; i < d_u.size(); ++i) {
d_u[i].upload(this->Rmatrices[i].nnzValues.data(), this->stream);
}
// full blocked residual rs = y - A x
HipKernels::residual(d_mat->nnzValues, d_mat->colIndices, d_mat->rowPointers, &x, &y, d_rs.data()->nnzValues, this->Nb, block_size, this->stream);
// restrict to the scalar pressure system using the quasi-IMPES weights
HipKernels::full_to_pressure_restriction(d_rs.data()->nnzValues, d_weights.data()->nnzValues, d_coarse_y.data()->nnzValues, Nb, this->stream);
amg_cycle_gpu(0, *(d_coarse_y.data()->nnzValues), *(d_coarse_x.data()->nnzValues));
// add the pressure correction back into the blocked solution
HipKernels::add_coarse_pressure_correction(d_coarse_x.data()->nnzValues, &x, this->pressure_idx, Nb, this->stream);
}
// Apply the full CPR preconditioner, x = prec(y): first the blocked ILU0
// smoother, then the AMG pressure correction.
template <class Scalar, unsigned int block_size>
void rocsparseCPR<Scalar, block_size>::
apply(Scalar& y, Scalar& x)
{
    Dune::Timer t_bilu0;
    bilu0->apply(y, x);
    if (verbosity >= 3) {
        // flush the stream so the timer covers the GPU work
        HIP_CHECK(hipStreamSynchronize(this->stream));
        std::ostringstream out;
        out << "rocsparseCPR apply bilu0(): " << t_bilu0.stop() << " s";
        OpmLog::info(out.str());
    }

    Dune::Timer t_amg;
    apply_amg(y, x);
    if (verbosity >= 3) {
        // synchronize before reading the timer, consistent with the bilu0
        // branch above — apply_amg enqueues asynchronous work on the stream
        HIP_CHECK(hipStreamSynchronize(this->stream));
        std::ostringstream out;
        out << "rocsparseCPR apply amg(): " << t_amg.stop() << " s";
        OpmLog::info(out.str());
    }
}
#define INSTANCE_TYPE(T) \
template class rocsparseCPR<T,1>; \
template class rocsparseCPR<T,2>; \
template class rocsparseCPR<T,3>; \
template class rocsparseCPR<T,4>; \
template class rocsparseCPR<T,5>; \
template class rocsparseCPR<T,6>;
INSTANCE_TYPE(double)
} // namespace Opm

View File

@@ -0,0 +1,112 @@
/*
Copyright 2024 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_ROCSPARSECPR_HPP
#define OPM_ROCSPARSECPR_HPP
#include <mutex>
#include <opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp>
#include <opm/simulators/linalg/bda/Matrix.hpp>
#include <opm/simulators/linalg/bda/CprCreation.hpp>
#include <opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp>
#include <opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp>
#include <opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.hpp>
namespace Opm::Accelerator {
template<class Scalar> class BlockedMatrix;
/// This class implements a Constrained Pressure Residual (CPR) preconditioner
/// on top of rocsparse/HIP: a blocked ILU0 smoother combined with an AMG
/// cycle on the weighted pressure system.
template <class Scalar, unsigned int block_size>
class rocsparseCPR : public rocsparsePreconditioner<Scalar, block_size>, public CprCreation<Scalar, block_size>
{
typedef rocsparsePreconditioner<Scalar, block_size> Base;
using Base::N;
using Base::Nb;
using Base::nnz;
using Base::nnzb;
using Base::verbosity;
private:
std::vector<RocmMatrix<Scalar>> d_Amatrices, d_Rmatrices; // scalar matrices that represent the AMG hierarchy
std::vector<RocmVector<int>> d_PcolIndices; // prolongation does not need a full matrix, only store colIndices
std::vector<RocmVector<Scalar>> d_invDiags; // inverse of diagonal of Amatrices
std::vector<RocmVector<Scalar>> d_t, d_f; // intermediate vectors used during amg cycle
std::vector<RocmVector<Scalar>> d_u; // intermediate vectors used during amg cycle
std::vector<RocmVector<Scalar>> d_rs; // use before extracting the pressure
std::vector<RocmVector<Scalar>> d_weights; // the quasiimpes weights, used to extract pressure
std::unique_ptr<RocmMatrix<Scalar>> d_mat; // stores blocked matrix
std::vector<RocmVector<Scalar>> d_coarse_y, d_coarse_x; // stores the scalar vectors
std::once_flag rocm_buffers_allocated; // only allocate ROCm/HIP buffers once
std::unique_ptr<rocsparseBILU0<Scalar, block_size> > bilu0; // Blocked ILU0 preconditioner
// NOTE(review): coarse_solver is declared but not referenced anywhere in the
// visible implementation (the coarsest level is solved with umfpack) —
// confirm whether it is still needed.
std::unique_ptr<rocsparseSolverBackend<Scalar, 1> > coarse_solver; // coarse solver is scalar
// Initialize and allocate matrices and vectors
void init_rocm_buffers();//TODO: rename to rocm/c and update in code
// Copy matrices and vectors to GPU
void rocm_upload();//TODO: rename to rocm/c and update in code
// apply pressure correction to vector
void apply_amg(const Scalar& y, Scalar& x);
// one recursive AMG V-cycle step; level 0 is the finest pressure system
void amg_cycle_gpu(const int level, Scalar &y, Scalar &x);
public:
rocsparseCPR(int verbosity);
// One-time setup: caches dimensions and initializes the ILU0 smoother
bool initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix, std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix, rocsparse_int *d_Arows, rocsparse_int *d_Acols);
// Initial upload of the linear system (delegated to the ILU0 smoother)
void copy_system_to_gpu(Scalar *b);
/// Reassign pointers, in case the addresses of the Dune variables have changed
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] b input vector b, contains N values
// void update_system(Scalar *vals, Scalar *b);
/// Update linear system on GPU (delegated to the ILU0 smoother)
/// \param[in] b array of matrix nonzero values to upload
// NOTE(review): the implementation names this parameter 'vals' and forwards
// it as matrix values, not as a right-hand-side vector
void update_system_on_gpu(Scalar *b);
// Analyze the sparsity pattern; also caches the matrix pointer
bool analyze_matrix(BlockedMatrix<Scalar> *mat);
bool analyze_matrix(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat);
// Rebuild ILU0 factors and the AMG hierarchy, then upload the hierarchy
bool create_preconditioner(BlockedMatrix<Scalar> *mat);
bool create_preconditioner(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat);
#if HAVE_OPENCL
// apply preconditioner, x = prec(y)
void apply(const cl::Buffer& y, cl::Buffer& x) {}
#endif
// applies blocked ilu0
// also applies amg for pressure component
void apply(Scalar& y, Scalar& x);
};
} // namespace Opm
#endif

View File

@@ -0,0 +1,113 @@
/*
Copyright 2024 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
#include <opm/simulators/linalg/bda/Matrix.hpp>
#include <sstream>
#include <iostream>
// #include <opm/simulators/linalg/bda/opencl/opencl.hpp>
// Evaluate a HIP runtime call and throw a std::logic_error with the HIP error
// string on failure. Comments must stay outside the macro body: backslash
// line-splicing happens before comment removal, so a '//' inside would swallow
// the continuation. The error prefix previously said "rocsparseSolverBackend"
// (copy-paste from the solver backend file); corrected to name this unit.
#define HIP_CHECK(STAT) \
do { \
const hipError_t stat = (STAT); \
if(stat != hipSuccess) \
{ \
std::ostringstream oss; \
oss << "rocsparseMatrix::hip "; \
oss << "error: " << hipGetErrorString(stat); \
OPM_THROW(std::logic_error, oss.str()); \
} \
} while(0)
namespace Opm::Accelerator {
// Allocate HIP device storage for a BSR matrix with Nb_ block rows, Mb_ block
// columns, nnzbs_ nonzero blocks and the given dense block size.
// Fix: the value buffer was allocated with sizeof(double) even though the
// class is templated on Scalar (and instantiated for int as well), while
// upload() copies sizeof(Scalar) bytes; allocate with sizeof(Scalar) so the
// allocation and the copies always agree.
template<class Scalar>
RocmMatrix<Scalar>::
RocmMatrix(int Nb_, int Mb_, int nnzbs_, unsigned int block_size_)
: Nb(Nb_),
Mb(Mb_),
nnzbs(nnzbs_),
block_size(block_size_)
{
// one block_size x block_size dense block of values per nonzero block
HIP_CHECK(hipMalloc((void**)&nnzValues, sizeof(Scalar) * block_size * block_size * nnzbs));
// one column index per nonzero block (BSR layout)
HIP_CHECK(hipMalloc((void**)&colIndices, sizeof(int) * nnzbs));
// Nb + 1 row offsets
HIP_CHECK(hipMalloc((void**)&rowPointers, sizeof(int) * (Nb + 1)));
}
// Asynchronously copy the host-side BSR arrays (values, column indices, row
// pointers) into the corresponding device buffers on the given HIP stream.
template <class Scalar>
void RocmMatrix<Scalar>::
upload(Scalar *vals, int *cols, int *rows, hipStream_t stream) {
    const size_t valueBytes = sizeof(Scalar) * block_size * block_size * nnzbs;
    const size_t colBytes = sizeof(int) * nnzbs;
    const size_t rowBytes = sizeof(int) * (Nb + 1);
    HIP_CHECK(hipMemcpyAsync(nnzValues, vals, valueBytes, hipMemcpyHostToDevice, stream));
    HIP_CHECK(hipMemcpyAsync(colIndices, cols, colBytes, hipMemcpyHostToDevice, stream));
    HIP_CHECK(hipMemcpyAsync(rowPointers, rows, rowBytes, hipMemcpyHostToDevice, stream));
}
// Upload a plain (non-blocked) Matrix. Only valid when this RocmMatrix was
// constructed with block_size == 1; forwards to the raw-pointer overload.
// Fix: the error message wrongly mentioned "a BlockedMatrix ... with different
// block_size" although the argument is an unblocked Matrix and the failing
// condition is block_size != 1.
template <class Scalar>
void RocmMatrix<Scalar>::
upload(Matrix<Scalar> *matrix, hipStream_t stream) {
if (block_size != 1) {
OPM_THROW(std::logic_error, "Error trying to upload a Matrix to RocmMatrix with block_size != 1");
}
upload(matrix->nnzValues.data(), matrix->colIndices.data(), matrix->rowPointers.data(), stream);
}
// Upload a BlockedMatrix after checking that its block size matches this
// matrix's; forwards the raw host pointers to the generic upload overload.
template <class Scalar>
void RocmMatrix<Scalar>::
upload(BlockedMatrix<Scalar> *matrix, hipStream_t stream) {
    if (block_size != matrix->block_size) {
        OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to RocmMatrix with different block_size");
    }
    upload(matrix->nnzValues, matrix->colIndices, matrix->rowPointers, stream);
}
// Allocate HIP device storage for a vector of N Scalar entries.
template <class Scalar>
RocmVector<Scalar>::RocmVector(int N)
: size(N)
{
    const size_t bytes = sizeof(Scalar) * size;
    HIP_CHECK(hipMalloc((void**)&nnzValues, bytes));
}
// Asynchronously copy 'size' Scalar values from host memory into the device
// buffer on the given HIP stream.
template <class Scalar>
void RocmVector<Scalar>::
upload(Scalar *vals, hipStream_t stream) {
    const size_t bytes = sizeof(Scalar) * size;
    HIP_CHECK(hipMemcpyAsync(nnzValues, vals, bytes, hipMemcpyHostToDevice, stream));
}
// Explicitly instantiate RocmVector and RocmMatrix for the element types
// used in this backend (int is used for index vectors, double for values).
#define INSTANCE_TYPE(T) \
template class RocmVector<T>;\
template class RocmMatrix<T>;
INSTANCE_TYPE(int);
INSTANCE_TYPE(double);
} // namespace Opm::Accelerator

View File

@@ -0,0 +1,65 @@
/*
Copyright 2024 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_ROCMMATRIX_HEADER_INCLUDED
#define OPM_ROCMMATRIX_HEADER_INCLUDED
#include <hip/hip_runtime_api.h>
#include <hip/hip_version.h>
namespace Opm::Accelerator {
template<class Scalar> class Matrix;
template<class Scalar> class BlockedMatrix;
/// This class resembles a BSR/CSR matrix whose data lives in HIP device memory.
/// The element type is the Scalar template parameter; with block_size == 1 it
/// is a plain CSR matrix, otherwise block_size x block_size dense blocks.
template<class Scalar>
class RocmMatrix {
public:
    /// Allocate device buffers for Nb_ block rows, Mb_ block columns,
    /// nnzbs_ nonzero blocks of size block_size_ x block_size_
RocmMatrix(int Nb_, int Mb_, int nnzbs_, unsigned int block_size_);
    /// Asynchronously copy raw host arrays (values, cols, rows) to the device
void upload(Scalar *vals, int *cols, int *rows, hipStream_t stream);
    /// Upload a non-blocked Matrix; requires block_size == 1
void upload(Matrix<Scalar> *matrix, hipStream_t stream);
    /// Upload a BlockedMatrix; its block_size must match this matrix's
void upload(BlockedMatrix<Scalar> *matrix, hipStream_t stream);
Scalar* nnzValues; // device pointer: block_size*block_size*nnzbs values
int* colIndices; // device pointer: nnzbs column indices
int* rowPointers; // device pointer: Nb+1 row offsets
int Nb, Mb; // number of block rows and block columns
int nnzbs; // number of nonzero blocks
unsigned int block_size; // dense block dimension
};
/// A simple vector of Scalar values stored in HIP device memory.
template <typename Scalar>
class RocmVector {
public:
    /// Allocate device storage for N entries
RocmVector(int N);
    /// Asynchronously copy 'size' values from host to device
void upload(Scalar *vals, hipStream_t stream);
    // NOTE(review): declared but no definition is visible in the accompanying
    // source file -- implement or remove before first use; TODO confirm
void upload(Matrix<Scalar> *matrix, hipStream_t stream);
Scalar* nnzValues; // device pointer to the data (name kept for symmetry with RocmMatrix)
int size; // number of entries
};
} // namespace Opm::Accelerator
#endif

View File

@@ -138,11 +138,11 @@ rocsparseSolverBackend(int verbosity_, int maxit_, Scalar tolerance_,
ROCBLAS_CHECK(rocblas_set_stream(blas_handle, stream));
using PreconditionerType = typename Opm::Accelerator::PreconditionerType;
// if (use_cpr) {
// prec = rocsparsePreconditioner<block_size>::create(PreconditionerType::CPR, verbosity);
// } else {
if (use_cpr) {
prec = rocsparsePreconditioner<Scalar, block_size>::create(PreconditionerType::CPR, verbosity);
} else {
prec = rocsparsePreconditioner<Scalar, block_size>::create(PreconditionerType::BILU0, verbosity);
// }
}
prec->set_context(handle, dir, operation, stream);
}