mirror of
https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
add rocsparseCPR preconditioner
This commit is contained in:
@@ -266,10 +266,13 @@ if(USE_BDA_BRIDGE)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocalutionSolverBackend.cpp)
|
||||
endif()
|
||||
if(rocsparse_FOUND AND rocblas_FOUND)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseCPR.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseBILU0.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseWellContributions.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/hipKernels.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseMatrix.cpp)
|
||||
endif()
|
||||
if(CUDA_FOUND)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu)
|
||||
@@ -673,11 +676,14 @@ if (USE_BDA_BRIDGE)
|
||||
opm/simulators/linalg/bda/opencl/openclWellContributions.hpp
|
||||
opm/simulators/linalg/bda/Matrix.hpp
|
||||
opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
|
||||
opm/simulators/linalg/bda/rocm/hipKernels.hpp
|
||||
opm/simulators/linalg/bda/rocm/rocalutionSolverBackend.hpp
|
||||
opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp
|
||||
opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp
|
||||
opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp
|
||||
opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.hpp
|
||||
opm/simulators/linalg/bda/rocm/rocsparseWellContributions.hpp
|
||||
opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp
|
||||
opm/simulators/linalg/bda/WellContributions.hpp
|
||||
opm/simulators/linalg/ISTLSolverBda.hpp
|
||||
)
|
||||
|
||||
466
opm/simulators/linalg/bda/rocm/hipKernels.cpp
Normal file
466
opm/simulators/linalg/bda/rocm/hipKernels.cpp
Normal file
@@ -0,0 +1,466 @@
|
||||
/*
|
||||
Copyright 2024 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
#include <cmath>
|
||||
#include <sstream>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
#include <dune/common/timer.hh>
|
||||
|
||||
#include <opm/simulators/linalg/bda/rocm/hipKernels.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/Misc.hpp>
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_version.h>
|
||||
|
||||
#define HIP_CHECK(STAT) \
|
||||
do { \
|
||||
const hipError_t stat = (STAT); \
|
||||
if(stat != hipSuccess) \
|
||||
{ \
|
||||
std::ostringstream oss; \
|
||||
oss << "rocsparseSolverBackend::hip "; \
|
||||
oss << "error: " << hipGetErrorString(stat); \
|
||||
OPM_THROW(std::logic_error, oss.str()); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
|
||||
using Opm::OpmLog;
|
||||
using Dune::Timer;
|
||||
|
||||
// define static variables and kernels
|
||||
int HipKernels::verbosity;
|
||||
double* HipKernels::tmp;
|
||||
bool HipKernels::initialized = false;
|
||||
std::size_t HipKernels::preferred_workgroup_size_multiple = 0;
|
||||
|
||||
#ifdef __HIP__
|
||||
/// HIP kernel to multiply vector with another vector and a scalar, element-wise
|
||||
// add result to a third vector
|
||||
__global__ void vmul_k(
|
||||
const double alpha,
|
||||
double const *in1,
|
||||
double const *in2,
|
||||
double *out,
|
||||
const int N)
|
||||
{
|
||||
unsigned int NUM_THREADS = gridDim.x;
|
||||
int idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
|
||||
while(idx < N){
|
||||
out[idx] += alpha * in1[idx] * in2[idx];
|
||||
idx += NUM_THREADS;
|
||||
}
|
||||
}
|
||||
|
||||
/// HIP kernel to transform blocked vector to scalar vector using pressure-weights
|
||||
// every workitem handles one blockrow
|
||||
__global__ void full_to_pressure_restriction_k(
|
||||
const double *fine_y,
|
||||
const double *weights,
|
||||
double *coarse_y,
|
||||
const unsigned int Nb)
|
||||
{
|
||||
const unsigned int NUM_THREADS = gridDim.x;
|
||||
const unsigned int block_size = 3;
|
||||
unsigned int target_block_row = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
|
||||
while(target_block_row < Nb){
|
||||
double sum = 0.0;
|
||||
unsigned int idx = block_size * target_block_row;
|
||||
for (unsigned int i = 0; i < block_size; ++i) {
|
||||
sum += fine_y[idx + i] * weights[idx + i];
|
||||
}
|
||||
coarse_y[target_block_row] = sum;
|
||||
target_block_row += NUM_THREADS;
|
||||
}
|
||||
}
|
||||
|
||||
/// HIP kernel to add the coarse pressure solution back to the finer, complete solution
|
||||
// every workitem handles one blockrow
|
||||
__global__ void add_coarse_pressure_correction_k(
|
||||
const double *coarse_x,
|
||||
double *fine_x,
|
||||
const unsigned int pressure_idx,
|
||||
const unsigned int Nb)
|
||||
{
|
||||
const unsigned int NUM_THREADS = gridDim.x;
|
||||
const unsigned int block_size = 3;
|
||||
unsigned int target_block_row = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
|
||||
while(target_block_row < Nb){
|
||||
fine_x[target_block_row * block_size + pressure_idx] += coarse_x[target_block_row];
|
||||
target_block_row += NUM_THREADS;
|
||||
}
|
||||
}
|
||||
|
||||
/// HIP kernel to prolongate vector during amg cycle
|
||||
// every workitem handles one row
|
||||
__global__ void prolongate_vector_k(
|
||||
const double *in,
|
||||
double *out,
|
||||
const int *cols,
|
||||
const unsigned int N)
|
||||
{
|
||||
const unsigned int NUM_THREADS = gridDim.x;
|
||||
unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
|
||||
while(row < N){
|
||||
out[row] += in[cols[row]];
|
||||
row += NUM_THREADS;
|
||||
}
|
||||
}
|
||||
|
||||
// res = rhs - mat * x
|
||||
// algorithm based on:
|
||||
// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
|
||||
// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
|
||||
__global__ void residual_blocked_k(
|
||||
const double *vals,
|
||||
const int *cols,
|
||||
const int *rows,
|
||||
const int Nb,
|
||||
const double *x,
|
||||
const double *rhs,
|
||||
double *out,
|
||||
const unsigned int block_size
|
||||
)
|
||||
{
|
||||
extern __shared__ double tmp[];
|
||||
const unsigned int warpsize = warpSize;
|
||||
const unsigned int bsize = blockDim.x;
|
||||
const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const unsigned int idx_b = gid / bsize;
|
||||
const unsigned int idx_t = threadIdx.x;
|
||||
unsigned int idx = idx_b * bsize + idx_t;
|
||||
const unsigned int bs = block_size;
|
||||
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
|
||||
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
|
||||
const unsigned int NUM_THREADS = gridDim.x;
|
||||
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
|
||||
unsigned int target_block_row = idx / warpsize;
|
||||
const unsigned int lane = idx_t % warpsize;
|
||||
const unsigned int c = (lane / bs) % bs;
|
||||
const unsigned int r = lane % bs;
|
||||
|
||||
// for 3x3 blocks:
|
||||
// num_active_threads: 27 (CUDA) vs 63 (ROCM)
|
||||
// num_blocks_per_warp: 3 (CUDA) vs 7 (ROCM)
|
||||
int offsetTarget = warpsize == 64 ? 48 : 32;
|
||||
|
||||
while(target_block_row < Nb){
|
||||
unsigned int first_block = rows[target_block_row];
|
||||
unsigned int last_block = rows[target_block_row+1];
|
||||
unsigned int block = first_block + lane / (bs*bs);
|
||||
double local_out = 0.0;
|
||||
|
||||
if(lane < num_active_threads){
|
||||
for(; block < last_block; block += num_blocks_per_warp){
|
||||
double x_elem = x[cols[block]*bs + c];
|
||||
double A_elem = vals[block*bs*bs + c + r*bs];
|
||||
local_out += x_elem * A_elem;
|
||||
}
|
||||
}
|
||||
|
||||
// do reduction in shared mem
|
||||
tmp[lane] = local_out;
|
||||
|
||||
for(unsigned int offset = 3; offset <= offsetTarget; offset <<= 1)
|
||||
{
|
||||
if (lane + offset < warpsize)
|
||||
{
|
||||
tmp[lane] += tmp[lane + offset];
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if(lane < bs){
|
||||
unsigned int row = target_block_row*bs + lane;
|
||||
out[row] = rhs[row] - tmp[lane];
|
||||
}
|
||||
target_block_row += num_warps_in_grid;
|
||||
}
|
||||
}
|
||||
|
||||
// KERNEL residual_k
|
||||
// res = rhs - mat * x
|
||||
// algorithm based on:
|
||||
// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
|
||||
// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
|
||||
// template <unsigned shared_mem_size>
|
||||
__global__ void residual_k(
|
||||
const double *vals,
|
||||
const int *cols,
|
||||
const int *rows,
|
||||
const int N,
|
||||
const double *x,
|
||||
const double *rhs,
|
||||
double *out
|
||||
)
|
||||
{
|
||||
extern __shared__ double tmp[];
|
||||
const unsigned int bsize = blockDim.x;
|
||||
const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const unsigned int idx_b = gid / bsize;
|
||||
const unsigned int idx_t = threadIdx.x;
|
||||
const unsigned int num_workgroups = gridDim.x;
|
||||
|
||||
int row = idx_b;
|
||||
|
||||
while (row < N) {
|
||||
int rowStart = rows[row];
|
||||
int rowEnd = rows[row+1];
|
||||
int rowLength = rowEnd - rowStart;
|
||||
double local_sum = 0.0;
|
||||
for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
|
||||
int col = cols[j];
|
||||
local_sum += vals[j] * x[col];
|
||||
}
|
||||
|
||||
tmp[idx_t] = local_sum;
|
||||
__syncthreads();
|
||||
|
||||
int offset = bsize / 2;
|
||||
while(offset > 0) {
|
||||
if (idx_t < offset) {
|
||||
tmp[idx_t] += tmp[idx_t + offset];
|
||||
}
|
||||
__syncthreads();
|
||||
offset = offset / 2;
|
||||
}
|
||||
|
||||
if (idx_t == 0) {
|
||||
out[row] = rhs[row] - tmp[idx_t];
|
||||
}
|
||||
|
||||
row += num_workgroups;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void spmv_k(
|
||||
const double *vals,
|
||||
const int *cols,
|
||||
const int *rows,
|
||||
const int N,
|
||||
const double *x,
|
||||
double *out
|
||||
)
|
||||
{
|
||||
extern __shared__ double tmp[];
|
||||
const unsigned int bsize = blockDim.x;
|
||||
const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const unsigned int idx_b = gid / bsize;
|
||||
const unsigned int idx_t = threadIdx.x;
|
||||
const unsigned int num_workgroups = gridDim.x;
|
||||
|
||||
int row = idx_b;
|
||||
|
||||
while (row < N) {
|
||||
int rowStart = rows[row];
|
||||
int rowEnd = rows[row+1];
|
||||
int rowLength = rowEnd - rowStart;
|
||||
double local_sum = 0.0;
|
||||
for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
|
||||
int col = cols[j];
|
||||
local_sum += vals[j] * x[col];
|
||||
}
|
||||
|
||||
tmp[idx_t] = local_sum;
|
||||
__syncthreads();
|
||||
|
||||
int offset = bsize / 2;
|
||||
while(offset > 0) {
|
||||
if (idx_t < offset) {
|
||||
tmp[idx_t] += tmp[idx_t + offset];
|
||||
}
|
||||
__syncthreads();
|
||||
offset = offset / 2;
|
||||
}
|
||||
|
||||
if (idx_t == 0) {
|
||||
out[row] = tmp[idx_t];
|
||||
}
|
||||
|
||||
row += num_workgroups;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void HipKernels::init(int verbosity_)
|
||||
{
|
||||
if (initialized) {
|
||||
OpmLog::debug("Warning HipKernels is already initialized");
|
||||
return;
|
||||
}
|
||||
|
||||
verbosity = verbosity_;
|
||||
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
void HipKernels::vmul(const double alpha, double* in1, double* in2, double* out, int N, hipStream_t stream)
|
||||
{
|
||||
Timer t_vmul;
|
||||
#ifdef __HIP__
|
||||
unsigned blockDim = 64;
|
||||
unsigned blocks = Accelerator::ceilDivision(N, blockDim);
|
||||
unsigned gridDim = blocks * blockDim;
|
||||
|
||||
// dim3(N) will create a vector {N, 1, 1}
|
||||
vmul_k<<<dim3(gridDim), dim3(blockDim), 0, stream>>>(alpha, in1, in2, out, N);
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error vmul for rocsparse only supported when compiling with hipcc");
|
||||
#endif
|
||||
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream oss;
|
||||
oss << std::scientific << "HipKernels vmul() time: " << t_vmul.stop() << " s";
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
void HipKernels::full_to_pressure_restriction(const double* fine_y, double* weights, double* coarse_y, int Nb, hipStream_t stream)
|
||||
{
|
||||
Timer t;
|
||||
#ifdef __HIP__
|
||||
unsigned blockDim = 64;
|
||||
unsigned blocks = Accelerator::ceilDivision(Nb, blockDim);
|
||||
unsigned gridDim = blocks * blockDim;
|
||||
|
||||
// dim3(N) will create a vector {N, 1, 1}
|
||||
full_to_pressure_restriction_k<<<dim3(gridDim), dim3(blockDim), 0, stream>>>(fine_y, weights, coarse_y, Nb);
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error full_to_pressure_restriction for rocsparse only supported when compiling with hipcc");
|
||||
#endif
|
||||
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream oss;
|
||||
oss << std::scientific << "HipKernels full_to_pressure_restriction() time: " << t.stop() << " s";
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
void HipKernels::add_coarse_pressure_correction(double* coarse_x, double* fine_x, int pressure_idx, int Nb, hipStream_t stream)
|
||||
{
|
||||
Timer t;
|
||||
#ifdef __HIP__
|
||||
unsigned blockDim = 64;
|
||||
unsigned blocks = Accelerator::ceilDivision(Nb, blockDim);
|
||||
unsigned gridDim = blocks * blockDim;
|
||||
|
||||
// dim3(N) will create a vector {N, 1, 1}
|
||||
add_coarse_pressure_correction_k<<<dim3(gridDim), dim3(blockDim), 0, stream>>>(coarse_x, fine_x, pressure_idx, Nb);
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error add_coarse_pressure_correction for rocsparse only supported when compiling with hipcc");
|
||||
#endif
|
||||
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream oss;
|
||||
oss << std::scientific << "HipKernels add_coarse_pressure_correction() time: " << t.stop() << " s";
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
void HipKernels::prolongate_vector(const double* in, double* out, const int* cols, int N, hipStream_t stream)
|
||||
{
|
||||
Timer t;
|
||||
|
||||
#ifdef __HIP__
|
||||
unsigned blockDim = 64;
|
||||
unsigned blocks = Accelerator::ceilDivision(N, blockDim);
|
||||
unsigned gridDim = blocks * blockDim;
|
||||
unsigned shared_mem_size = blockDim * sizeof(double);
|
||||
|
||||
// dim3(N) will create a vector {N, 1, 1}
|
||||
prolongate_vector_k<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(in, out, cols, N);
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error prolongate_vector for rocsparse only supported when compiling with hipcc");
|
||||
#endif
|
||||
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream oss;
|
||||
oss << std::scientific << "HipKernels prolongate_vector() time: " << t.stop() << " s";
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
void HipKernels::residual(double* vals, int* cols, int* rows, double* x, const double* rhs, double* out, int Nb, unsigned int block_size, hipStream_t stream)
|
||||
{
|
||||
Timer t_residual;
|
||||
|
||||
#ifdef __HIP__
|
||||
unsigned blockDim = 64;
|
||||
const unsigned int num_work_groups = Accelerator::ceilDivision(Nb, blockDim);
|
||||
unsigned gridDim = num_work_groups * blockDim;
|
||||
unsigned shared_mem_size = blockDim * sizeof(double);
|
||||
|
||||
if (block_size > 1) {
|
||||
// dim3(N) will create a vector {N, 1, 1}
|
||||
residual_blocked_k<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(vals, cols, rows, Nb, x, rhs, out, block_size);
|
||||
} else {
|
||||
residual_k<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(vals, cols, rows, Nb, x, rhs, out);
|
||||
}
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error residual for rocsparse only supported when compiling with hipcc");
|
||||
#endif
|
||||
|
||||
if (verbosity >= 4) {
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
std::ostringstream oss;
|
||||
oss << std::scientific << "HipKernels residual() time: " << t_residual.stop() << " s";
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
void HipKernels::spmv(double* vals, int* cols, int* rows, double* x, double* y, int Nb, unsigned int block_size, hipStream_t stream)
|
||||
{//NOTE: block_size not used since I use this kernel only for block sizes 1, other uses use rocsparse!
|
||||
Timer t_spmv;
|
||||
#ifdef __HIP__
|
||||
unsigned blockDim = 64;
|
||||
const unsigned int num_work_groups = Accelerator::ceilDivision(Nb, blockDim);
|
||||
unsigned gridDim = num_work_groups * blockDim;
|
||||
unsigned shared_mem_size = blockDim * sizeof(double);
|
||||
|
||||
spmv_k<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(vals, cols, rows, Nb, x, y);
|
||||
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error spmv for rocsparse only supported when compiling with hipcc");
|
||||
#endif
|
||||
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream oss;
|
||||
oss << std::scientific << "HipKernels spmv_blocked() time: " << t_spmv.stop() << " s";
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // namespace Opm
|
||||
57
opm/simulators/linalg/bda/rocm/hipKernels.hpp
Normal file
57
opm/simulators/linalg/bda/rocm/hipKernels.hpp
Normal file
@@ -0,0 +1,57 @@
|
||||
/*
|
||||
Copyright 2024 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef HIPKERNELS_HPP
|
||||
#define HIPKERNELS_HPP
|
||||
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include <cstddef>
|
||||
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hip_version.h>
|
||||
|
||||
// #include <opm/simulators/linalg/bda/opencl/opencl.hpp>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
|
||||
class HipKernels
|
||||
{
|
||||
private:
|
||||
static int verbosity;
|
||||
static double* tmp; // used as tmp CPU buffer for dot() and norm()
|
||||
static bool initialized;
|
||||
static std::size_t preferred_workgroup_size_multiple; // stores CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
|
||||
|
||||
HipKernels(){}; // disable instantiation
|
||||
|
||||
public:
|
||||
static void init(int verbosity);
|
||||
static void full_to_pressure_restriction(const double* fine_y, double* weights, double* coarse_y, int Nb, hipStream_t stream);
|
||||
static void add_coarse_pressure_correction(double* coarse_x, double* fine_x, int pressure_idx, int Nb, hipStream_t stream);
|
||||
static void vmul(const double alpha, double* in1, double* in2, double* out, int N, hipStream_t stream);
|
||||
static void prolongate_vector(const double* in, double* out, const int* cols, int N, hipStream_t stream);
|
||||
static void residual(double* vals, int* cols, int* rows, double* x, const double* rhs, double* out, int Nb, unsigned int block_size, hipStream_t stream);
|
||||
static void spmv(double* vals, int* cols, int* rows, double* x, double* y, int Nb, unsigned int block_size, hipStream_t stream);
|
||||
};
|
||||
|
||||
} // namespace Opm
|
||||
|
||||
#endif
|
||||
@@ -301,17 +301,4 @@ apply(double& y, double& x) {
|
||||
template class rocsparseBILU0<T,6>;
|
||||
|
||||
INSTANCE_TYPE(double)
|
||||
// #define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
// template class rocsparseBILU0<n>;
|
||||
//
|
||||
// INSTANTIATE_BDA_FUNCTIONS(1);
|
||||
// INSTANTIATE_BDA_FUNCTIONS(2);
|
||||
// INSTANTIATE_BDA_FUNCTIONS(3);
|
||||
// INSTANTIATE_BDA_FUNCTIONS(4);
|
||||
// INSTANTIATE_BDA_FUNCTIONS(5);
|
||||
// INSTANTIATE_BDA_FUNCTIONS(6);
|
||||
//
|
||||
// #undef INSTANTIATE_BDA_FUNCTIONS
|
||||
//
|
||||
// } // namespace Accelerator
|
||||
} // namespace Opm
|
||||
|
||||
361
opm/simulators/linalg/bda/rocm/rocsparseCPR.cpp
Normal file
361
opm/simulators/linalg/bda/rocm/rocsparseCPR.cpp
Normal file
@@ -0,0 +1,361 @@
|
||||
/*
|
||||
Copyright 2021 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
#include <opm/common/TimingMacros.hpp>
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
#include <dune/common/timer.hh>
|
||||
|
||||
#include <dune/common/shared_ptr.hh>
|
||||
|
||||
#include <opm/simulators/linalg/PreconditionerFactory.hpp>
|
||||
#include <opm/simulators/linalg/PropertyTree.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BdaBridge.hpp>
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp>
|
||||
#include <opm/simulators/linalg/bda/rocm/hipKernels.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/Misc.hpp>
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_version.h>
|
||||
|
||||
#define HIP_CHECK(STAT) \
|
||||
do { \
|
||||
const hipError_t stat = (STAT); \
|
||||
if(stat != hipSuccess) \
|
||||
{ \
|
||||
std::ostringstream oss; \
|
||||
oss << "rocsparseSolverBackend::hip "; \
|
||||
oss << "error: " << hipGetErrorString(stat); \
|
||||
OPM_THROW(std::logic_error, oss.str()); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define ROCSPARSE_CHECK(STAT) \
|
||||
do { \
|
||||
const rocsparse_status stat = (STAT); \
|
||||
if(stat != rocsparse_status_success) \
|
||||
{ \
|
||||
std::ostringstream oss; \
|
||||
oss << "rocsparseSolverBackend::rocsparse "; \
|
||||
oss << "error: " << stat; \
|
||||
OPM_THROW(std::logic_error, oss.str()); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define ROCBLAS_CHECK(STAT) \
|
||||
do { \
|
||||
const rocblas_status stat = (STAT); \
|
||||
if(stat != rocblas_status_success) \
|
||||
{ \
|
||||
std::ostringstream oss; \
|
||||
oss << "rocsparseSolverBackend::rocblas "; \
|
||||
oss << "error: " << stat; \
|
||||
OPM_THROW(std::logic_error, oss.str()); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
using Opm::OpmLog;
|
||||
using Dune::Timer;
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
rocsparseCPR<Scalar, block_size>::rocsparseCPR(int verbosity_) :
|
||||
rocsparsePreconditioner<Scalar, block_size>(verbosity_)
|
||||
{
|
||||
bilu0 = std::make_unique<rocsparseBILU0<Scalar, block_size> >(verbosity_);
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
bool rocsparseCPR<Scalar, block_size>::
|
||||
initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix, std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix, rocsparse_int *d_Arows, rocsparse_int *d_Acols) {
|
||||
this->Nb = matrix->Nb;
|
||||
this->nnzb = matrix->nnzbs;
|
||||
this->N = Nb * block_size;
|
||||
this->nnz = nnzb * block_size * block_size;
|
||||
|
||||
bilu0->set_context(this->handle, this->dir, this->operation, this->stream);
|
||||
bilu0->setJacMat(*this->jacMat.get());
|
||||
bilu0->initialize(matrix,jacMatrix,d_Arows,d_Acols);
|
||||
return true;
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
void rocsparseCPR<Scalar, block_size>::
|
||||
copy_system_to_gpu(Scalar *b) {
|
||||
bilu0->copy_system_to_gpu(b);
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
void rocsparseCPR<Scalar, block_size>::
|
||||
update_system_on_gpu(Scalar *vals) {
|
||||
bilu0->update_system_on_gpu(vals);
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
bool rocsparseCPR<Scalar, block_size>::
|
||||
analyze_matrix(BlockedMatrix<Scalar> *mat_) {
|
||||
this->Nb = mat_->Nb;
|
||||
this->nnzb = mat_->nnzbs;
|
||||
this->N = Nb * block_size;
|
||||
this->nnz = nnzb * block_size * block_size;
|
||||
|
||||
bool success = bilu0->analyze_matrix(mat_);
|
||||
|
||||
this->mat = mat_;
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
bool rocsparseCPR<Scalar, block_size>::
|
||||
analyze_matrix(BlockedMatrix<Scalar> *mat_, BlockedMatrix<Scalar> *jacMat) {
|
||||
this->Nb = mat_->Nb;
|
||||
this->nnzb = mat_->nnzbs;
|
||||
this->N = Nb * block_size;
|
||||
this->nnz = nnzb * block_size * block_size;
|
||||
|
||||
bool success = bilu0->analyze_matrix(mat_, jacMat);
|
||||
this->mat = mat_;
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
bool rocsparseCPR<Scalar, block_size>::create_preconditioner(BlockedMatrix<Scalar> *mat_, BlockedMatrix<Scalar> *jacMat) {
|
||||
Dune::Timer t_bilu0;
|
||||
bool result = bilu0->create_preconditioner(mat_, jacMat);
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "rocsparseCPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
Dune::Timer t_amg;
|
||||
this->create_preconditioner_amg(this->mat);
|
||||
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "rocsparseCPR create_preconditioner_amg(): " << t_amg.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
auto init_func = std::bind(&rocsparseCPR::init_rocm_buffers, this);
|
||||
std::call_once(rocm_buffers_allocated, init_func);
|
||||
|
||||
// upload matrices and vectors to GPU
|
||||
rocm_upload();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
bool rocsparseCPR<Scalar, block_size>::
|
||||
create_preconditioner(BlockedMatrix<Scalar> *mat_) {
|
||||
Dune::Timer t_bilu0;
|
||||
bool result = bilu0->create_preconditioner(mat_);
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "rocsparseCPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
Dune::Timer t_amg;
|
||||
this->create_preconditioner_amg(this->mat);
|
||||
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "rocsparseCPR create_preconditioner_amg(): " << t_amg.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
auto init_func = std::bind(&rocsparseCPR::init_rocm_buffers, this);
|
||||
std::call_once(rocm_buffers_allocated, init_func);
|
||||
|
||||
// upload matrices and vectors to GPU
|
||||
rocm_upload();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
void rocsparseCPR<Scalar, block_size>::
|
||||
init_rocm_buffers() {
|
||||
d_Amatrices.reserve(this->num_levels);
|
||||
d_Rmatrices.reserve(this->num_levels - 1);
|
||||
d_invDiags.reserve(this->num_levels - 1);
|
||||
for (Matrix<Scalar>& m : this->Amatrices) {
|
||||
d_Amatrices.emplace_back(m.N, m.M, m.nnzs, 1);//TOCHECK: this memory might not be contigous!!
|
||||
}
|
||||
|
||||
for (Matrix<Scalar>& m : this->Rmatrices) {
|
||||
d_Rmatrices.emplace_back(m.N, m.M, m.nnzs, 1);
|
||||
d_f.emplace_back(m.N);
|
||||
d_u.emplace_back(m.N);
|
||||
d_PcolIndices.emplace_back(m.M);
|
||||
d_invDiags.emplace_back(m.M);
|
||||
d_t.emplace_back(m.M);
|
||||
}
|
||||
|
||||
d_weights.emplace_back(this->N);
|
||||
d_rs.emplace_back(this->N);
|
||||
d_mat = std::make_unique<RocmMatrix<Scalar>>(this->Nb, this->Nb, this->nnzb, block_size);
|
||||
|
||||
d_coarse_y.emplace_back(this->Nb);
|
||||
d_coarse_x.emplace_back(this->Nb);
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
void rocsparseCPR<Scalar, block_size>::
|
||||
rocm_upload() {
|
||||
d_mat->upload(this->mat, this->stream);
|
||||
|
||||
HIP_CHECK(hipMemcpyAsync(d_weights.data()->nnzValues, this->weights.data(), sizeof(Scalar) * this->N, hipMemcpyHostToDevice, this->stream));
|
||||
|
||||
for (unsigned int i = 0; i < this->Rmatrices.size(); ++i) {
|
||||
d_Amatrices[i].upload(&this->Amatrices[i], this->stream);
|
||||
|
||||
HIP_CHECK(hipMemcpyAsync(d_invDiags[i].nnzValues, this->invDiags[i].data(), sizeof(Scalar) * this->Amatrices[i].N, hipMemcpyHostToDevice, this->stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_PcolIndices[i].nnzValues, this->PcolIndices[i].data(), sizeof(int) * this->Amatrices[i].N, hipMemcpyHostToDevice, this->stream));
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < this->Rmatrices.size(); ++i) {
|
||||
d_Rmatrices[i].upload(&this->Rmatrices[i], this->stream);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
void rocsparseCPR<Scalar, block_size>::
|
||||
amg_cycle_gpu(const int level, Scalar &y, Scalar &x) {
|
||||
RocmMatrix<Scalar> *A = &d_Amatrices[level];
|
||||
RocmMatrix<Scalar> *R = &d_Rmatrices[level];
|
||||
int Ncur = A->Nb;
|
||||
Scalar zero = 0.0;
|
||||
Scalar one = 1.0;
|
||||
|
||||
rocsparse_mat_info spmv_info;
|
||||
rocsparse_mat_descr descr_R;
|
||||
ROCSPARSE_CHECK(rocsparse_create_mat_info(&spmv_info));
|
||||
ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_R));
|
||||
|
||||
if (level == this->num_levels - 1) {
|
||||
// solve coarsest level
|
||||
std::vector<Scalar> h_y(Ncur), h_x(Ncur, 0);
|
||||
|
||||
HIP_CHECK(hipMemcpyAsync(h_y.data(), &y, sizeof(Scalar) * Ncur, hipMemcpyDeviceToHost, this->stream));
|
||||
|
||||
// solve coarsest level using umfpack
|
||||
this->umfpack.apply(h_x.data(), h_y.data());
|
||||
|
||||
HIP_CHECK(hipMemcpyAsync(&x, h_x.data(), sizeof(Scalar) * Ncur, hipMemcpyHostToDevice, this->stream));
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
int Nnext = d_Amatrices[level+1].Nb;
|
||||
|
||||
RocmVector<Scalar>& t = d_t[level];
|
||||
RocmVector<Scalar>& f = d_f[level];
|
||||
RocmVector<Scalar>& u = d_u[level]; // u was 0-initialized earlier
|
||||
|
||||
// presmooth
|
||||
Scalar jacobi_damping = 0.65; // default value in amgcl: 0.72
|
||||
for (unsigned i = 0; i < this->num_pre_smooth_steps; ++i){
|
||||
HipKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
|
||||
HipKernels::vmul(jacobi_damping, d_invDiags[level].nnzValues, t.nnzValues, &x, Ncur, this->stream);
|
||||
}
|
||||
|
||||
// move to coarser level
|
||||
HipKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
|
||||
|
||||
// TODO: understand why rocsparse spmv library function does not here.
|
||||
// ROCSPARSE_CHECK(rocsparse_dbsrmv(this->handle, this->dir, this->operation,
|
||||
// R->Nb, R->Mb, R->nnzbs, &one, descr_R,
|
||||
// R->nnzValues, R->rowPointers, R->colIndices, 1,
|
||||
// t.nnzValues, &zero, f.nnzValues));
|
||||
HipKernels::spmv(R->nnzValues, R->colIndices, R->rowPointers, t.nnzValues, f.nnzValues, Nnext, 1, this->stream);
|
||||
|
||||
amg_cycle_gpu(level + 1, *f.nnzValues, *u.nnzValues);
|
||||
HipKernels::prolongate_vector(u.nnzValues, &x, d_PcolIndices[level].nnzValues, Ncur, this->stream);
|
||||
|
||||
// postsmooth
|
||||
for (unsigned i = 0; i < this->num_post_smooth_steps; ++i){
|
||||
HipKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
|
||||
HipKernels::vmul(jacobi_damping, d_invDiags[level].nnzValues, t.nnzValues, &x, Ncur, this->stream);
|
||||
}
|
||||
}
|
||||
|
||||
// x = prec(y)
|
||||
template <class Scalar, unsigned int block_size>
|
||||
void rocsparseCPR<Scalar, block_size>::
|
||||
apply_amg(const Scalar& y, Scalar& x) {
|
||||
HIP_CHECK(hipMemsetAsync(d_coarse_x.data()->nnzValues, 0, sizeof(Scalar) * this->Nb, this->stream));
|
||||
|
||||
for (unsigned int i = 0; i < d_u.size(); ++i) {
|
||||
d_u[i].upload(this->Rmatrices[i].nnzValues.data(), this->stream);
|
||||
}
|
||||
|
||||
HipKernels::residual(d_mat->nnzValues, d_mat->colIndices, d_mat->rowPointers, &x, &y, d_rs.data()->nnzValues, this->Nb, block_size, this->stream);
|
||||
|
||||
HipKernels::full_to_pressure_restriction(d_rs.data()->nnzValues, d_weights.data()->nnzValues, d_coarse_y.data()->nnzValues, Nb, this->stream);
|
||||
|
||||
amg_cycle_gpu(0, *(d_coarse_y.data()->nnzValues), *(d_coarse_x.data()->nnzValues));
|
||||
|
||||
HipKernels::add_coarse_pressure_correction(d_coarse_x.data()->nnzValues, &x, this->pressure_idx, Nb, this->stream);
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
void rocsparseCPR<Scalar, block_size>::
|
||||
apply(Scalar& y, Scalar& x) {
|
||||
Dune::Timer t_bilu0;
|
||||
|
||||
bilu0->apply(y, x);
|
||||
|
||||
if (verbosity >= 3) {
|
||||
HIP_CHECK(hipStreamSynchronize(this->stream));
|
||||
std::ostringstream out;
|
||||
out << "rocsparseCPR apply bilu0(): " << t_bilu0.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
Dune::Timer t_amg;
|
||||
apply_amg(y, x);
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "rocsparseCPR apply amg(): " << t_amg.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
#define INSTANCE_TYPE(T) \
|
||||
template class rocsparseCPR<T,1>; \
|
||||
template class rocsparseCPR<T,2>; \
|
||||
template class rocsparseCPR<T,3>; \
|
||||
template class rocsparseCPR<T,4>; \
|
||||
template class rocsparseCPR<T,5>; \
|
||||
template class rocsparseCPR<T,6>;
|
||||
|
||||
INSTANCE_TYPE(double)
|
||||
} // namespace Opm
|
||||
|
||||
|
||||
112
opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp
Normal file
112
opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp
Normal file
@@ -0,0 +1,112 @@
|
||||
/*
|
||||
Copyright 2024 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef OPM_ROCSPARSECPR_HPP
|
||||
#define OPM_ROCSPARSECPR_HPP
|
||||
|
||||
#include <mutex>
|
||||
|
||||
#include <opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp>
|
||||
#include <opm/simulators/linalg/bda/Matrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/CprCreation.hpp>
|
||||
#include <opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.hpp>
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
template<class Scalar> class BlockedMatrix;
|
||||
|
||||
/// This class implements a Constrained Pressure Residual (CPR) preconditioner
|
||||
template <class Scalar, unsigned int block_size>
|
||||
class rocsparseCPR : public rocsparsePreconditioner<Scalar, block_size>, public CprCreation<Scalar, block_size>
|
||||
{
|
||||
typedef rocsparsePreconditioner<Scalar, block_size> Base;
|
||||
|
||||
using Base::N;
|
||||
using Base::Nb;
|
||||
using Base::nnz;
|
||||
using Base::nnzb;
|
||||
using Base::verbosity;
|
||||
|
||||
private:
|
||||
std::vector<RocmMatrix<Scalar>> d_Amatrices, d_Rmatrices; // scalar matrices that represent the AMG hierarchy
|
||||
|
||||
std::vector<RocmVector<int>> d_PcolIndices; // prolongation does not need a full matrix, only store colIndices
|
||||
std::vector<RocmVector<Scalar>> d_invDiags; // inverse of diagonal of Amatrices
|
||||
std::vector<RocmVector<Scalar>> d_t, d_f; // intermediate vectors used during amg cycle
|
||||
std::vector<RocmVector<Scalar>> d_u; // intermediate vectors used during amg cycle
|
||||
std::vector<RocmVector<Scalar>> d_rs; // use before extracting the pressure
|
||||
std::vector<RocmVector<Scalar>> d_weights; // the quasiimpes weights, used to extract pressure
|
||||
std::unique_ptr<RocmMatrix<Scalar>> d_mat; // stores blocked matrix
|
||||
std::vector<RocmVector<Scalar>> d_coarse_y, d_coarse_x; // stores the scalar vectors
|
||||
std::once_flag rocm_buffers_allocated; // only allocate OpenCL Buffers once
|
||||
|
||||
std::unique_ptr<rocsparseBILU0<Scalar, block_size> > bilu0; // Blocked ILU0 preconditioner
|
||||
|
||||
std::unique_ptr<rocsparseSolverBackend<Scalar, 1> > coarse_solver; // coarse solver is scalar
|
||||
|
||||
// Initialize and allocate matrices and vectors
|
||||
void init_rocm_buffers();//TODO: rename to rocm/c and update in code
|
||||
|
||||
// Copy matrices and vectors to GPU
|
||||
void rocm_upload();//TODO: rename to rocm/c and update in code
|
||||
|
||||
// apply pressure correction to vector
|
||||
void apply_amg(const Scalar& y, Scalar& x);
|
||||
|
||||
void amg_cycle_gpu(const int level, Scalar &y, Scalar &x);
|
||||
|
||||
public:
|
||||
|
||||
rocsparseCPR(int verbosity);
|
||||
|
||||
bool initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix, std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix, rocsparse_int *d_Arows, rocsparse_int *d_Acols);
|
||||
|
||||
void copy_system_to_gpu(Scalar *b);
|
||||
|
||||
/// Reassign pointers, in case the addresses of the Dune variables have changed
|
||||
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||
/// \param[in] b input vector b, contains N values
|
||||
// void update_system(Scalar *vals, Scalar *b);
|
||||
|
||||
/// Update linear system to GPU
|
||||
/// \param[in] b input vector, contains N values
|
||||
void update_system_on_gpu(Scalar *b);
|
||||
|
||||
bool analyze_matrix(BlockedMatrix<Scalar> *mat);
|
||||
bool analyze_matrix(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat);
|
||||
|
||||
bool create_preconditioner(BlockedMatrix<Scalar> *mat);
|
||||
bool create_preconditioner(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat);
|
||||
|
||||
#if HAVE_OPENCL
|
||||
// apply preconditioner, x = prec(y)
|
||||
void apply(const cl::Buffer& y, cl::Buffer& x) {}
|
||||
#endif
|
||||
// applies blocked ilu0
|
||||
// also applies amg for pressure component
|
||||
void apply(Scalar& y, Scalar& x);
|
||||
};
|
||||
|
||||
} // namespace Opm
|
||||
|
||||
#endif
|
||||
|
||||
113
opm/simulators/linalg/bda/rocm/rocsparseMatrix.cpp
Normal file
113
opm/simulators/linalg/bda/rocm/rocsparseMatrix.cpp
Normal file
@@ -0,0 +1,113 @@
|
||||
/*
|
||||
Copyright 2024 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/Matrix.hpp>
|
||||
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
|
||||
// #include <opm/simulators/linalg/bda/opencl/opencl.hpp>
|
||||
|
||||
#define HIP_CHECK(STAT) \
|
||||
do { \
|
||||
const hipError_t stat = (STAT); \
|
||||
if(stat != hipSuccess) \
|
||||
{ \
|
||||
std::ostringstream oss; \
|
||||
oss << "rocsparseSolverBackend::hip "; \
|
||||
oss << "error: " << hipGetErrorString(stat); \
|
||||
OPM_THROW(std::logic_error, oss.str()); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
template<class Scalar>
|
||||
RocmMatrix<Scalar>::
|
||||
RocmMatrix(int Nb_, int Mb_, int nnzbs_, unsigned int block_size_)
|
||||
: Nb(Nb_),
|
||||
Mb(Mb_),
|
||||
nnzbs(nnzbs_),
|
||||
block_size(block_size_)
|
||||
{
|
||||
HIP_CHECK(hipMalloc((void**)&nnzValues, sizeof(double) * block_size * block_size * nnzbs));
|
||||
|
||||
HIP_CHECK(hipMalloc((void**)&colIndices, sizeof(int) * nnzbs));
|
||||
|
||||
HIP_CHECK(hipMalloc((void**)&rowPointers, sizeof(int) * (Nb + 1)));
|
||||
}
|
||||
|
||||
template <class Scalar>
|
||||
void RocmMatrix<Scalar>::
|
||||
upload(Scalar *vals, int *cols, int *rows, hipStream_t stream) {
|
||||
HIP_CHECK(hipMemcpyAsync(nnzValues, vals, sizeof(Scalar) * block_size * block_size * nnzbs, hipMemcpyHostToDevice, stream));
|
||||
|
||||
HIP_CHECK(hipMemcpyAsync(colIndices, cols, sizeof(int) * nnzbs, hipMemcpyHostToDevice, stream));
|
||||
|
||||
HIP_CHECK(hipMemcpyAsync(rowPointers, rows, sizeof(int) * (Nb + 1), hipMemcpyHostToDevice, stream));
|
||||
}
|
||||
|
||||
template <class Scalar>
|
||||
void RocmMatrix<Scalar>::
|
||||
upload(Matrix<Scalar> *matrix, hipStream_t stream) {
|
||||
if (block_size != 1) {
|
||||
OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to RocmMatrix with different block_size");
|
||||
}
|
||||
|
||||
upload(matrix->nnzValues.data(), matrix->colIndices.data(), matrix->rowPointers.data(), stream);
|
||||
}
|
||||
|
||||
template <class Scalar>
|
||||
void RocmMatrix<Scalar>::
|
||||
upload(BlockedMatrix<Scalar> *matrix, hipStream_t stream) {
|
||||
if (matrix->block_size != block_size) {
|
||||
OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to RocmMatrix with different block_size");
|
||||
}
|
||||
|
||||
upload(matrix->nnzValues, matrix->colIndices, matrix->rowPointers, stream);
|
||||
}
|
||||
|
||||
template <class Scalar>
|
||||
RocmVector<Scalar>::RocmVector(int N)
|
||||
: size(N)
|
||||
{
|
||||
HIP_CHECK(hipMalloc((void**)&nnzValues, sizeof(Scalar) * N));
|
||||
}
|
||||
|
||||
template <class Scalar>
|
||||
void RocmVector<Scalar>::
|
||||
upload(Scalar *vals, hipStream_t stream) {
|
||||
HIP_CHECK(hipMemcpyAsync(nnzValues, vals, sizeof(Scalar) * size, hipMemcpyHostToDevice, stream));
|
||||
}
|
||||
|
||||
#define INSTANCE_TYPE(T) \
|
||||
template class RocmVector<T>;\
|
||||
template class RocmMatrix<T>;
|
||||
|
||||
INSTANCE_TYPE(int);
|
||||
INSTANCE_TYPE(double);
|
||||
|
||||
} // namespace Opm
|
||||
65
opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp
Normal file
65
opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp
Normal file
@@ -0,0 +1,65 @@
|
||||
/*
|
||||
Copyright 2024 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef OPM_ROCMMATRIX_HEADER_INCLUDED
|
||||
#define OPM_ROCMMATRIX_HEADER_INCLUDED
|
||||
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hip_version.h>
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
template<class Scalar> class Matrix;
|
||||
template<class Scalar> class BlockedMatrix;
|
||||
|
||||
/// This struct resembles a csr matrix, only doubles are supported
|
||||
/// The matrix data is stored in OpenCL Buffers
|
||||
template<class Scalar>
|
||||
class RocmMatrix {
|
||||
public:
|
||||
|
||||
RocmMatrix(int Nb_, int Mb_, int nnzbs_, unsigned int block_size_);
|
||||
|
||||
void upload(Scalar *vals, int *cols, int *rows, hipStream_t stream);
|
||||
void upload(Matrix<Scalar> *matrix, hipStream_t stream);
|
||||
void upload(BlockedMatrix<Scalar> *matrix, hipStream_t stream);
|
||||
|
||||
Scalar* nnzValues;
|
||||
int* colIndices;
|
||||
int* rowPointers;
|
||||
int Nb, Mb;
|
||||
int nnzbs;
|
||||
unsigned int block_size;
|
||||
};
|
||||
|
||||
template <typename Scalar>
|
||||
class RocmVector {
|
||||
public:
|
||||
|
||||
RocmVector(int N);
|
||||
|
||||
void upload(Scalar *vals, hipStream_t stream);
|
||||
void upload(Matrix<Scalar> *matrix, hipStream_t stream);
|
||||
|
||||
Scalar* nnzValues;
|
||||
int size;
|
||||
};
|
||||
} // namespace Opm
|
||||
|
||||
#endif
|
||||
@@ -138,11 +138,11 @@ rocsparseSolverBackend(int verbosity_, int maxit_, Scalar tolerance_,
|
||||
ROCBLAS_CHECK(rocblas_set_stream(blas_handle, stream));
|
||||
|
||||
using PreconditionerType = typename Opm::Accelerator::PreconditionerType;
|
||||
// if (use_cpr) {
|
||||
// prec = rocsparsePreconditioner<block_size>::create(PreconditionerType::CPR, verbosity);
|
||||
// } else {
|
||||
if (use_cpr) {
|
||||
prec = rocsparsePreconditioner<Scalar, block_size>::create(PreconditionerType::CPR, verbosity);
|
||||
} else {
|
||||
prec = rocsparsePreconditioner<Scalar, block_size>::create(PreconditionerType::BILU0, verbosity);
|
||||
// }
|
||||
}
|
||||
|
||||
prec->set_context(handle, dir, operation, stream);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user