From 7a307fafa05d0a6bacafd5fbf16996059484a4ab Mon Sep 17 00:00:00 2001 From: Razvan Nane Date: Tue, 4 Jun 2024 17:13:13 +0200 Subject: [PATCH] add rocsparseCPR preconditioner --- CMakeLists_files.cmake | 6 + opm/simulators/linalg/bda/rocm/hipKernels.cpp | 466 ++++++++++++++++++ opm/simulators/linalg/bda/rocm/hipKernels.hpp | 57 +++ .../linalg/bda/rocm/rocsparseBILU0.cpp | 13 - .../linalg/bda/rocm/rocsparseCPR.cpp | 361 ++++++++++++++ .../linalg/bda/rocm/rocsparseCPR.hpp | 112 +++++ .../linalg/bda/rocm/rocsparseMatrix.cpp | 113 +++++ .../linalg/bda/rocm/rocsparseMatrix.hpp | 65 +++ .../bda/rocm/rocsparseSolverBackend.cpp | 8 +- 9 files changed, 1184 insertions(+), 17 deletions(-) create mode 100644 opm/simulators/linalg/bda/rocm/hipKernels.cpp create mode 100644 opm/simulators/linalg/bda/rocm/hipKernels.hpp create mode 100644 opm/simulators/linalg/bda/rocm/rocsparseCPR.cpp create mode 100644 opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp create mode 100644 opm/simulators/linalg/bda/rocm/rocsparseMatrix.cpp create mode 100644 opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake index 0b4194b1f..8d7aa7cee 100644 --- a/CMakeLists_files.cmake +++ b/CMakeLists_files.cmake @@ -266,10 +266,13 @@ if(USE_BDA_BRIDGE) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocalutionSolverBackend.cpp) endif() if(rocsparse_FOUND AND rocblas_FOUND) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseCPR.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseBILU0.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseWellContributions.cpp) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/hipKernels.cpp) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseMatrix.cpp) endif() if(CUDA_FOUND) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu) @@ -673,11 +676,14 @@ if (USE_BDA_BRIDGE) opm/simulators/linalg/bda/opencl/openclWellContributions.hpp opm/simulators/linalg/bda/Matrix.hpp opm/simulators/linalg/bda/MultisegmentWellContribution.hpp + opm/simulators/linalg/bda/rocm/hipKernels.hpp opm/simulators/linalg/bda/rocm/rocalutionSolverBackend.hpp opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp + opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.hpp opm/simulators/linalg/bda/rocm/rocsparseWellContributions.hpp + opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp opm/simulators/linalg/bda/WellContributions.hpp opm/simulators/linalg/ISTLSolverBda.hpp ) diff --git a/opm/simulators/linalg/bda/rocm/hipKernels.cpp b/opm/simulators/linalg/bda/rocm/hipKernels.cpp new file mode 100644 index 000000000..c4a0ade80 --- /dev/null +++ b/opm/simulators/linalg/bda/rocm/hipKernels.cpp @@ -0,0 +1,466 @@ +/* + Copyright 2024 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#include +#include +#include + +#include +#include +#include + +#include + +#include + +#include +#include + +#define HIP_CHECK(STAT) \ + do { \ + const hipError_t stat = (STAT); \ + if(stat != hipSuccess) \ + { \ + std::ostringstream oss; \ + oss << "rocsparseSolverBackend::hip "; \ + oss << "error: " << hipGetErrorString(stat); \ + OPM_THROW(std::logic_error, oss.str()); \ + } \ + } while(0) + +namespace Opm +{ + +using Opm::OpmLog; +using Dune::Timer; + +// define static variables and kernels +int HipKernels::verbosity; +double* HipKernels::tmp; +bool HipKernels::initialized = false; +std::size_t HipKernels::preferred_workgroup_size_multiple = 0; + +#ifdef __HIP__ +/// HIP kernel to multiply vector with another vector and a scalar, element-wise +// add result to a third vector +__global__ void vmul_k( + const double alpha, + double const *in1, + double const *in2, + double *out, + const int N) +{ + unsigned int NUM_THREADS = gridDim.x; + int idx = blockDim.x * blockIdx.x + threadIdx.x; + + while(idx < N){ + out[idx] += alpha * in1[idx] * in2[idx]; + idx += NUM_THREADS; + } +} + +/// HIP kernel to transform blocked vector to scalar vector using pressure-weights +// every workitem handles one blockrow +__global__ void full_to_pressure_restriction_k( + const double *fine_y, + const double *weights, + double *coarse_y, + const unsigned int Nb) +{ + const unsigned int NUM_THREADS = gridDim.x; + const unsigned int block_size = 3; + unsigned int target_block_row = blockDim.x * blockIdx.x + threadIdx.x; + + while(target_block_row < Nb){ + double sum = 0.0; + unsigned int idx = block_size * target_block_row; + for (unsigned int i = 0; i < block_size; ++i) { + sum += fine_y[idx + i] * weights[idx + i]; + } + coarse_y[target_block_row] = sum; + target_block_row += NUM_THREADS; + } +} + +/// HIP kernel to add the coarse pressure solution back to the finer, complete solution +// every workitem handles one blockrow +__global__ void add_coarse_pressure_correction_k( + const double *coarse_x, + double *fine_x, + const unsigned int pressure_idx, + const unsigned int Nb) +{ + const unsigned int NUM_THREADS = gridDim.x; + const unsigned int block_size = 3; + unsigned int target_block_row = blockDim.x * blockIdx.x + threadIdx.x; + + while(target_block_row < Nb){ + fine_x[target_block_row * block_size + pressure_idx] += coarse_x[target_block_row]; + target_block_row += NUM_THREADS; + } +} + +/// HIP kernel to prolongate vector during amg cycle +// every workitem handles one row +__global__ void prolongate_vector_k( + const double *in, + double *out, + const int *cols, + const unsigned int N) +{ + const unsigned int NUM_THREADS = gridDim.x; + unsigned int row = blockDim.x * blockIdx.x + threadIdx.x; + + while(row < N){ + out[row] += in[cols[row]]; + row += NUM_THREADS; + } +} + +// res = rhs - mat * x +// algorithm based on: +// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures, +// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42 +__global__ void residual_blocked_k( + const double *vals, + const int *cols, + const int *rows, + const int Nb, + const double *x, + const double *rhs, + double *out, + const unsigned int block_size + ) +{ + extern __shared__ double tmp[]; + const unsigned int warpsize = warpSize; + const unsigned int bsize = blockDim.x; + const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x; + const unsigned int idx_b = gid / bsize; + const unsigned int idx_t = threadIdx.x; + unsigned int idx = idx_b * bsize + idx_t; + const unsigned int bs = block_size; + const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; + const unsigned int num_blocks_per_warp = warpsize/bs/bs; + const unsigned int NUM_THREADS = gridDim.x; + const unsigned int num_warps_in_grid = NUM_THREADS / warpsize; + unsigned int target_block_row = idx / warpsize; + const unsigned int lane = idx_t % warpsize; + const unsigned int c = (lane / bs) % bs; + const unsigned int r = lane % bs; + + // for 3x3 blocks: + // num_active_threads: 27 (CUDA) vs 63 (ROCM) + // num_blocks_per_warp: 3 (CUDA) vs 7 (ROCM) + int offsetTarget = warpsize == 64 ? 48 : 32; + + while(target_block_row < Nb){ + unsigned int first_block = rows[target_block_row]; + unsigned int last_block = rows[target_block_row+1]; + unsigned int block = first_block + lane / (bs*bs); + double local_out = 0.0; + + if(lane < num_active_threads){ + for(; block < last_block; block += num_blocks_per_warp){ + double x_elem = x[cols[block]*bs + c]; + double A_elem = vals[block*bs*bs + c + r*bs]; + local_out += x_elem * A_elem; + } + } + + // do reduction in shared mem + tmp[lane] = local_out; + + for(unsigned int offset = 3; offset <= offsetTarget; offset <<= 1) + { + if (lane + offset < warpsize) + { + tmp[lane] += tmp[lane + offset]; + } + __syncthreads(); + } + + if(lane < bs){ + unsigned int row = target_block_row*bs + lane; + out[row] = rhs[row] - tmp[lane]; + } + target_block_row += num_warps_in_grid; + } +} + +// KERNEL residual_k +// res = rhs - mat * x +// algorithm based on: +// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures, +// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42 +// template +__global__ void residual_k( + const double *vals, + const int *cols, + const int *rows, + const int N, + const double *x, + const double *rhs, + double *out + ) +{ + extern __shared__ double tmp[]; + const unsigned int bsize = blockDim.x; + const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x; + const unsigned int idx_b = gid / bsize; + const unsigned int idx_t = threadIdx.x; + const unsigned int num_workgroups = gridDim.x; + + int row = idx_b; + + while (row < N) { + int rowStart = rows[row]; + int rowEnd = rows[row+1]; + int rowLength = rowEnd - rowStart; + double local_sum = 0.0; + for (int j = rowStart + idx_t; j < rowEnd; j += bsize) { + int col = cols[j]; + local_sum += vals[j] * x[col]; + } + + tmp[idx_t] = local_sum; + __syncthreads(); + + int offset = bsize / 2; + while(offset > 0) { + if (idx_t < offset) { + tmp[idx_t] += tmp[idx_t + offset]; + } + __syncthreads(); + offset = offset / 2; + } + + if (idx_t == 0) { + out[row] = rhs[row] - tmp[idx_t]; + } + + row += num_workgroups; + } +} + +__global__ void spmv_k( + const double *vals, + const int *cols, + const int *rows, + const int N, + const double *x, + double *out + ) +{ + extern __shared__ double tmp[]; + const unsigned int bsize = blockDim.x; + const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x; + const unsigned int idx_b = gid / bsize; + const unsigned int idx_t = threadIdx.x; + const unsigned int num_workgroups = gridDim.x; + + int row = idx_b; + + while (row < N) { + int rowStart = rows[row]; + int rowEnd = rows[row+1]; + int rowLength = rowEnd - rowStart; + double local_sum = 0.0; + for (int j = rowStart + idx_t; j < rowEnd; j += bsize) { + int col = cols[j]; + local_sum += vals[j] * x[col]; + } + + tmp[idx_t] = local_sum; + __syncthreads(); + + int offset = bsize / 2; + while(offset > 0) { + if (idx_t < offset) { + tmp[idx_t] += tmp[idx_t + offset]; + } + __syncthreads(); + offset = offset / 2; + } + + if (idx_t == 0) { + out[row] = tmp[idx_t]; + } + + row += num_workgroups; + } +} +#endif + +void HipKernels::init(int verbosity_) +{ + if (initialized) { + OpmLog::debug("Warning HipKernels is already initialized"); + return; + } + + verbosity = verbosity_; + + initialized = true; +} + +void HipKernels::vmul(const double alpha, double* in1, double* in2, double* out, int N, hipStream_t stream) +{ + Timer t_vmul; +#ifdef __HIP__ + unsigned blockDim = 64; + unsigned blocks = Accelerator::ceilDivision(N, blockDim); + unsigned gridDim = blocks * blockDim; + + // dim3(N) will create a vector {N, 1, 1} + vmul_k<<>>(alpha, in1, in2, out, N); + HIP_CHECK(hipStreamSynchronize(stream)); +#else + OPM_THROW(std::logic_error, "Error vmul for rocsparse only supported when compiling with hipcc"); +#endif + + if (verbosity >= 4) { + std::ostringstream oss; + oss << std::scientific << "HipKernels vmul() time: " << t_vmul.stop() << " s"; + OpmLog::info(oss.str()); + } +} + +void HipKernels::full_to_pressure_restriction(const double* fine_y, double* weights, double* coarse_y, int Nb, hipStream_t stream) +{ + Timer t; +#ifdef __HIP__ + unsigned blockDim = 64; + unsigned blocks = Accelerator::ceilDivision(Nb, blockDim); + unsigned gridDim = blocks * blockDim; + + // dim3(N) will create a vector {N, 1, 1} + full_to_pressure_restriction_k<<>>(fine_y, weights, coarse_y, Nb); + HIP_CHECK(hipStreamSynchronize(stream)); +#else + OPM_THROW(std::logic_error, "Error full_to_pressure_restriction for rocsparse only supported when compiling with hipcc"); +#endif + + if (verbosity >= 4) { + std::ostringstream oss; + oss << std::scientific << "HipKernels full_to_pressure_restriction() time: " << t.stop() << " s"; + OpmLog::info(oss.str()); + } +} + +void HipKernels::add_coarse_pressure_correction(double* coarse_x, double* fine_x, int pressure_idx, int Nb, hipStream_t stream) +{ + Timer t; +#ifdef __HIP__ + unsigned blockDim = 64; + unsigned blocks = Accelerator::ceilDivision(Nb, blockDim); + unsigned gridDim = blocks * blockDim; + + // dim3(N) will create a vector {N, 1, 1} + add_coarse_pressure_correction_k<<>>(coarse_x, fine_x, pressure_idx, Nb); + HIP_CHECK(hipStreamSynchronize(stream)); +#else + OPM_THROW(std::logic_error, "Error add_coarse_pressure_correction for rocsparse only supported when compiling with hipcc"); +#endif + + if (verbosity >= 4) { + std::ostringstream oss; + oss << std::scientific << "HipKernels add_coarse_pressure_correction() time: " << t.stop() << " s"; + OpmLog::info(oss.str()); + } +} + +void HipKernels::prolongate_vector(const double* in, double* out, const int* cols, int N, hipStream_t stream) +{ + Timer t; + +#ifdef __HIP__ + unsigned blockDim = 64; + unsigned blocks = Accelerator::ceilDivision(N, blockDim); + unsigned gridDim = blocks * blockDim; + unsigned shared_mem_size = blockDim * sizeof(double); + + // dim3(N) will create a vector {N, 1, 1} + prolongate_vector_k<<>>(in, out, cols, N); + HIP_CHECK(hipStreamSynchronize(stream)); +#else + OPM_THROW(std::logic_error, "Error prolongate_vector for rocsparse only supported when compiling with hipcc"); +#endif + + if (verbosity >= 4) { + std::ostringstream oss; + oss << std::scientific << "HipKernels prolongate_vector() time: " << t.stop() << " s"; + OpmLog::info(oss.str()); + } +} + +void HipKernels::residual(double* vals, int* cols, int* rows, double* x, const double* rhs, double* out, int Nb, unsigned int block_size, hipStream_t stream) +{ + Timer t_residual; + +#ifdef __HIP__ + unsigned blockDim = 64; + const unsigned int num_work_groups = Accelerator::ceilDivision(Nb, blockDim); + unsigned gridDim = num_work_groups * blockDim; + unsigned shared_mem_size = blockDim * sizeof(double); + + if (block_size > 1) { + // dim3(N) will create a vector {N, 1, 1} + residual_blocked_k<<>>(vals, cols, rows, Nb, x, rhs, out, block_size); + } else { + residual_k<<>>(vals, cols, rows, Nb, x, rhs, out); + } + HIP_CHECK(hipStreamSynchronize(stream)); +#else + OPM_THROW(std::logic_error, "Error residual for rocsparse only supported when compiling with hipcc"); +#endif + + if (verbosity >= 4) { + HIP_CHECK(hipStreamSynchronize(stream)); + std::ostringstream oss; + oss << std::scientific << "HipKernels residual() time: " << t_residual.stop() << " s"; + OpmLog::info(oss.str()); + } +} + +void HipKernels::spmv(double* vals, int* cols, int* rows, double* x, double* y, int Nb, unsigned int block_size, hipStream_t stream) +{//NOTE: block_size not used since I use this kernel only for block sizes 1, other uses use rocsparse! + Timer t_spmv; +#ifdef __HIP__ + unsigned blockDim = 64; + const unsigned int num_work_groups = Accelerator::ceilDivision(Nb, blockDim); + unsigned gridDim = num_work_groups * blockDim; + unsigned shared_mem_size = blockDim * sizeof(double); + + spmv_k<<>>(vals, cols, rows, Nb, x, y); + + HIP_CHECK(hipStreamSynchronize(stream)); +#else + OPM_THROW(std::logic_error, "Error spmv for rocsparse only supported when compiling with hipcc"); +#endif + + if (verbosity >= 4) { + std::ostringstream oss; + oss << std::scientific << "HipKernels spmv_blocked() time: " << t_spmv.stop() << " s"; + OpmLog::info(oss.str()); + } +} + + +} // namespace Opm diff --git a/opm/simulators/linalg/bda/rocm/hipKernels.hpp b/opm/simulators/linalg/bda/rocm/hipKernels.hpp new file mode 100644 index 000000000..a70733679 --- /dev/null +++ b/opm/simulators/linalg/bda/rocm/hipKernels.hpp @@ -0,0 +1,57 @@ +/* + Copyright 2024 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#ifndef HIPKERNELS_HPP +#define HIPKERNELS_HPP + +#include +#include +#include + +#include +#include + +// #include + +namespace Opm +{ + +class HipKernels +{ +private: + static int verbosity; + static double* tmp; // used as tmp CPU buffer for dot() and norm() + static bool initialized; + static std::size_t preferred_workgroup_size_multiple; // stores CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE + + HipKernels(){}; // disable instantiation + +public: + static void init(int verbosity); + static void full_to_pressure_restriction(const double* fine_y, double* weights, double* coarse_y, int Nb, hipStream_t stream); + static void add_coarse_pressure_correction(double* coarse_x, double* fine_x, int pressure_idx, int Nb, hipStream_t stream); + static void vmul(const double alpha, double* in1, double* in2, double* out, int N, hipStream_t stream); + static void prolongate_vector(const double* in, double* out, const int* cols, int N, hipStream_t stream); + static void residual(double* vals, int* cols, int* rows, double* x, const double* rhs, double* out, int Nb, unsigned int block_size, hipStream_t stream); + static void spmv(double* vals, int* cols, int* rows, double* x, double* y, int Nb, unsigned int block_size, hipStream_t stream); +}; + +} // namespace Opm + +#endif diff --git a/opm/simulators/linalg/bda/rocm/rocsparseBILU0.cpp b/opm/simulators/linalg/bda/rocm/rocsparseBILU0.cpp index 5092b747f..986ab3bba 100644 --- a/opm/simulators/linalg/bda/rocm/rocsparseBILU0.cpp +++ b/opm/simulators/linalg/bda/rocm/rocsparseBILU0.cpp @@ -301,17 +301,4 @@ apply(double& y, double& x) { template class rocsparseBILU0; INSTANCE_TYPE(double) -// #define INSTANTIATE_BDA_FUNCTIONS(n) \ -// template class rocsparseBILU0; -// -// INSTANTIATE_BDA_FUNCTIONS(1); -// INSTANTIATE_BDA_FUNCTIONS(2); -// INSTANTIATE_BDA_FUNCTIONS(3); -// INSTANTIATE_BDA_FUNCTIONS(4); -// INSTANTIATE_BDA_FUNCTIONS(5); -// INSTANTIATE_BDA_FUNCTIONS(6); -// -// #undef INSTANTIATE_BDA_FUNCTIONS -// -// } // namespace Accelerator } // namespace Opm diff --git a/opm/simulators/linalg/bda/rocm/rocsparseCPR.cpp b/opm/simulators/linalg/bda/rocm/rocsparseCPR.cpp new file mode 100644 index 000000000..133847967 --- /dev/null +++ b/opm/simulators/linalg/bda/rocm/rocsparseCPR.cpp @@ -0,0 +1,361 @@ +/* + Copyright 2021 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include + +#define HIP_CHECK(STAT) \ + do { \ + const hipError_t stat = (STAT); \ + if(stat != hipSuccess) \ + { \ + std::ostringstream oss; \ + oss << "rocsparseSolverBackend::hip "; \ + oss << "error: " << hipGetErrorString(stat); \ + OPM_THROW(std::logic_error, oss.str()); \ + } \ + } while(0) + +#define ROCSPARSE_CHECK(STAT) \ + do { \ + const rocsparse_status stat = (STAT); \ + if(stat != rocsparse_status_success) \ + { \ + std::ostringstream oss; \ + oss << "rocsparseSolverBackend::rocsparse "; \ + oss << "error: " << stat; \ + OPM_THROW(std::logic_error, oss.str()); \ + } \ + } while(0) + +#define ROCBLAS_CHECK(STAT) \ + do { \ + const rocblas_status stat = (STAT); \ + if(stat != rocblas_status_success) \ + { \ + std::ostringstream oss; \ + oss << "rocsparseSolverBackend::rocblas "; \ + oss << "error: " << stat; \ + OPM_THROW(std::logic_error, oss.str()); \ + } \ + } while(0) + +namespace Opm::Accelerator { + +using Opm::OpmLog; +using Dune::Timer; + +template +rocsparseCPR::rocsparseCPR(int verbosity_) : + rocsparsePreconditioner(verbosity_) +{ + bilu0 = std::make_unique >(verbosity_); +} + +template +bool rocsparseCPR:: +initialize(std::shared_ptr> matrix, std::shared_ptr> jacMatrix, rocsparse_int *d_Arows, rocsparse_int *d_Acols) { + this->Nb = matrix->Nb; + this->nnzb = matrix->nnzbs; + this->N = Nb * block_size; + this->nnz = nnzb * block_size * block_size; + + bilu0->set_context(this->handle, this->dir, this->operation, this->stream); + bilu0->setJacMat(*this->jacMat.get()); + bilu0->initialize(matrix,jacMatrix,d_Arows,d_Acols); + return true; +} + +template +void rocsparseCPR:: +copy_system_to_gpu(Scalar *b) { + bilu0->copy_system_to_gpu(b); +} + +template +void rocsparseCPR:: +update_system_on_gpu(Scalar *vals) { + bilu0->update_system_on_gpu(vals); +} + +template +bool rocsparseCPR:: +analyze_matrix(BlockedMatrix *mat_) { + this->Nb = mat_->Nb; + this->nnzb = mat_->nnzbs; + this->N = Nb * block_size; + this->nnz = nnzb * block_size * block_size; + + bool success = bilu0->analyze_matrix(mat_); + + this->mat = mat_; + + return success; +} + +template +bool rocsparseCPR:: +analyze_matrix(BlockedMatrix *mat_, BlockedMatrix *jacMat) { + this->Nb = mat_->Nb; + this->nnzb = mat_->nnzbs; + this->N = Nb * block_size; + this->nnz = nnzb * block_size * block_size; + + bool success = bilu0->analyze_matrix(mat_, jacMat); + this->mat = mat_; + + return success; +} + +template +bool rocsparseCPR::create_preconditioner(BlockedMatrix *mat_, BlockedMatrix *jacMat) { + Dune::Timer t_bilu0; + bool result = bilu0->create_preconditioner(mat_, jacMat); + if (verbosity >= 3) { + std::ostringstream out; + out << "rocsparseCPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s"; + OpmLog::info(out.str()); + } + + Dune::Timer t_amg; + this->create_preconditioner_amg(this->mat); + + if (verbosity >= 3) { + std::ostringstream out; + out << "rocsparseCPR create_preconditioner_amg(): " << t_amg.stop() << " s"; + OpmLog::info(out.str()); + } + + auto init_func = std::bind(&rocsparseCPR::init_rocm_buffers, this); + std::call_once(rocm_buffers_allocated, init_func); + + // upload matrices and vectors to GPU + rocm_upload(); + + return result; +} + +template +bool rocsparseCPR:: +create_preconditioner(BlockedMatrix *mat_) { + Dune::Timer t_bilu0; + bool result = bilu0->create_preconditioner(mat_); + if (verbosity >= 3) { + std::ostringstream out; + out << "rocsparseCPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s"; + OpmLog::info(out.str()); + } + + Dune::Timer t_amg; + this->create_preconditioner_amg(this->mat); + + if (verbosity >= 3) { + std::ostringstream out; + out << "rocsparseCPR create_preconditioner_amg(): " << t_amg.stop() << " s"; + OpmLog::info(out.str()); + } + + auto init_func = std::bind(&rocsparseCPR::init_rocm_buffers, this); + std::call_once(rocm_buffers_allocated, init_func); + + // upload matrices and vectors to GPU + rocm_upload(); + + return result; +} + +template +void rocsparseCPR:: +init_rocm_buffers() { + d_Amatrices.reserve(this->num_levels); + d_Rmatrices.reserve(this->num_levels - 1); + d_invDiags.reserve(this->num_levels - 1); + for (Matrix& m : this->Amatrices) { + d_Amatrices.emplace_back(m.N, m.M, m.nnzs, 1);//TOCHECK: this memory might not be contigous!! + } + + for (Matrix& m : this->Rmatrices) { + d_Rmatrices.emplace_back(m.N, m.M, m.nnzs, 1); + d_f.emplace_back(m.N); + d_u.emplace_back(m.N); + d_PcolIndices.emplace_back(m.M); + d_invDiags.emplace_back(m.M); + d_t.emplace_back(m.M); + } + + d_weights.emplace_back(this->N); + d_rs.emplace_back(this->N); + d_mat = std::make_unique>(this->Nb, this->Nb, this->nnzb, block_size); + + d_coarse_y.emplace_back(this->Nb); + d_coarse_x.emplace_back(this->Nb); +} + +template +void rocsparseCPR:: +rocm_upload() { + d_mat->upload(this->mat, this->stream); + + HIP_CHECK(hipMemcpyAsync(d_weights.data()->nnzValues, this->weights.data(), sizeof(Scalar) * this->N, hipMemcpyHostToDevice, this->stream)); + + for (unsigned int i = 0; i < this->Rmatrices.size(); ++i) { + d_Amatrices[i].upload(&this->Amatrices[i], this->stream); + + HIP_CHECK(hipMemcpyAsync(d_invDiags[i].nnzValues, this->invDiags[i].data(), sizeof(Scalar) * this->Amatrices[i].N, hipMemcpyHostToDevice, this->stream)); + HIP_CHECK(hipMemcpyAsync(d_PcolIndices[i].nnzValues, this->PcolIndices[i].data(), sizeof(int) * this->Amatrices[i].N, hipMemcpyHostToDevice, this->stream)); + } + + for (unsigned int i = 0; i < this->Rmatrices.size(); ++i) { + d_Rmatrices[i].upload(&this->Rmatrices[i], this->stream); + } +} + +template +void rocsparseCPR:: +amg_cycle_gpu(const int level, Scalar &y, Scalar &x) { + RocmMatrix *A = &d_Amatrices[level]; + RocmMatrix *R = &d_Rmatrices[level]; + int Ncur = A->Nb; + Scalar zero = 0.0; + Scalar one = 1.0; + + rocsparse_mat_info spmv_info; + rocsparse_mat_descr descr_R; + ROCSPARSE_CHECK(rocsparse_create_mat_info(&spmv_info)); + ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_R)); + + if (level == this->num_levels - 1) { + // solve coarsest level + std::vector h_y(Ncur), h_x(Ncur, 0); + + HIP_CHECK(hipMemcpyAsync(h_y.data(), &y, sizeof(Scalar) * Ncur, hipMemcpyDeviceToHost, this->stream)); + + // solve coarsest level using umfpack + this->umfpack.apply(h_x.data(), h_y.data()); + + HIP_CHECK(hipMemcpyAsync(&x, h_x.data(), sizeof(Scalar) * Ncur, hipMemcpyHostToDevice, this->stream)); + + return; + } + + int Nnext = d_Amatrices[level+1].Nb; + + RocmVector& t = d_t[level]; + RocmVector& f = d_f[level]; + RocmVector& u = d_u[level]; // u was 0-initialized earlier + + // presmooth + Scalar jacobi_damping = 0.65; // default value in amgcl: 0.72 + for (unsigned i = 0; i < this->num_pre_smooth_steps; ++i){ + HipKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream); + HipKernels::vmul(jacobi_damping, d_invDiags[level].nnzValues, t.nnzValues, &x, Ncur, this->stream); + } + + // move to coarser level + HipKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream); + +// TODO: understand why rocsparse spmv library function does not here. +// ROCSPARSE_CHECK(rocsparse_dbsrmv(this->handle, this->dir, this->operation, +// R->Nb, R->Mb, R->nnzbs, &one, descr_R, +// R->nnzValues, R->rowPointers, R->colIndices, 1, +// t.nnzValues, &zero, f.nnzValues)); + HipKernels::spmv(R->nnzValues, R->colIndices, R->rowPointers, t.nnzValues, f.nnzValues, Nnext, 1, this->stream); + + amg_cycle_gpu(level + 1, *f.nnzValues, *u.nnzValues); + HipKernels::prolongate_vector(u.nnzValues, &x, d_PcolIndices[level].nnzValues, Ncur, this->stream); + + // postsmooth + for (unsigned i = 0; i < this->num_post_smooth_steps; ++i){ + HipKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream); + HipKernels::vmul(jacobi_damping, d_invDiags[level].nnzValues, t.nnzValues, &x, Ncur, this->stream); + } +} + +// x = prec(y) +template +void rocsparseCPR:: +apply_amg(const Scalar& y, Scalar& x) { + HIP_CHECK(hipMemsetAsync(d_coarse_x.data()->nnzValues, 0, sizeof(Scalar) * this->Nb, this->stream)); + + for (unsigned int i = 0; i < d_u.size(); ++i) { + d_u[i].upload(this->Rmatrices[i].nnzValues.data(), this->stream); + } + + HipKernels::residual(d_mat->nnzValues, d_mat->colIndices, d_mat->rowPointers, &x, &y, d_rs.data()->nnzValues, this->Nb, block_size, this->stream); + + HipKernels::full_to_pressure_restriction(d_rs.data()->nnzValues, d_weights.data()->nnzValues, d_coarse_y.data()->nnzValues, Nb, this->stream); + + amg_cycle_gpu(0, *(d_coarse_y.data()->nnzValues), *(d_coarse_x.data()->nnzValues)); + + HipKernels::add_coarse_pressure_correction(d_coarse_x.data()->nnzValues, &x, this->pressure_idx, Nb, this->stream); +} + +template +void rocsparseCPR:: +apply(Scalar& y, Scalar& x) { + Dune::Timer t_bilu0; + + bilu0->apply(y, x); + + if (verbosity >= 3) { + HIP_CHECK(hipStreamSynchronize(this->stream)); + std::ostringstream out; + out << "rocsparseCPR apply bilu0(): " << t_bilu0.stop() << " s"; + OpmLog::info(out.str()); + } + + Dune::Timer t_amg; + apply_amg(y, x); + if (verbosity >= 3) { + std::ostringstream out; + out << "rocsparseCPR apply amg(): " << t_amg.stop() << " s"; + OpmLog::info(out.str()); + } +} + +#define INSTANCE_TYPE(T) \ + template class rocsparseCPR; \ + template class rocsparseCPR; \ + template class rocsparseCPR; \ + template class rocsparseCPR; \ + template class rocsparseCPR; \ + template class rocsparseCPR; + +INSTANCE_TYPE(double) +} // namespace Opm + + diff --git a/opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp b/opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp new file mode 100644 index 000000000..9dad1671a --- /dev/null +++ b/opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp @@ -0,0 +1,112 @@ +/* + Copyright 2024 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#ifndef OPM_ROCSPARSECPR_HPP +#define OPM_ROCSPARSECPR_HPP + +#include + +#include +#include +#include +#include +#include + +#include + +namespace Opm::Accelerator { + +template class BlockedMatrix; + +/// This class implements a Constrained Pressure Residual (CPR) preconditioner +template +class rocsparseCPR : public rocsparsePreconditioner, public CprCreation +{ + typedef rocsparsePreconditioner Base; + + using Base::N; + using Base::Nb; + using Base::nnz; + using Base::nnzb; + using Base::verbosity; + +private: + std::vector> d_Amatrices, d_Rmatrices; // scalar matrices that represent the AMG hierarchy + + std::vector> d_PcolIndices; // prolongation does not need a full matrix, only store colIndices + std::vector> d_invDiags; // inverse of diagonal of Amatrices + std::vector> d_t, d_f; // intermediate vectors used during amg cycle + std::vector> d_u; // intermediate vectors used during amg cycle + std::vector> d_rs; // use before extracting the pressure + std::vector> d_weights; // the quasiimpes weights, used to extract pressure + std::unique_ptr> d_mat; // stores blocked matrix + std::vector> d_coarse_y, d_coarse_x; // stores the scalar vectors + std::once_flag rocm_buffers_allocated; // only allocate OpenCL Buffers once + + std::unique_ptr > bilu0; // Blocked ILU0 preconditioner + + std::unique_ptr > coarse_solver; // coarse solver is scalar + + // Initialize and allocate matrices and vectors + void init_rocm_buffers();//TODO: rename to rocm/c and update in code + + // Copy matrices and vectors to GPU + void rocm_upload();//TODO: rename to rocm/c and update in code + + // apply pressure correction to vector + void apply_amg(const Scalar& y, Scalar& x); + + void amg_cycle_gpu(const int level, Scalar &y, Scalar &x); + +public: + + rocsparseCPR(int verbosity); + + bool initialize(std::shared_ptr> matrix, std::shared_ptr> jacMatrix, rocsparse_int *d_Arows, rocsparse_int *d_Acols); + + void copy_system_to_gpu(Scalar *b); + + /// Reassign pointers, in case the addresses of the Dune variables have changed + /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values + /// \param[in] b input vector b, contains N values +// void update_system(Scalar *vals, Scalar *b); + + /// Update linear system to GPU + /// \param[in] b input vector, contains N values + void update_system_on_gpu(Scalar *b); + + bool analyze_matrix(BlockedMatrix *mat); + bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat); + + bool create_preconditioner(BlockedMatrix *mat); + bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat); + +#if HAVE_OPENCL + // apply preconditioner, x = prec(y) + void apply(const cl::Buffer& y, cl::Buffer& x) {} +#endif + // applies blocked ilu0 + // also applies amg for pressure component + void apply(Scalar& y, Scalar& x); +}; + +} // namespace Opm + +#endif + diff --git a/opm/simulators/linalg/bda/rocm/rocsparseMatrix.cpp b/opm/simulators/linalg/bda/rocm/rocsparseMatrix.cpp new file mode 100644 index 000000000..95966b422 --- /dev/null +++ b/opm/simulators/linalg/bda/rocm/rocsparseMatrix.cpp @@ -0,0 +1,113 @@ +/* + Copyright 2024 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#include + +#include +#include + +#include +#include +#include + +#include +#include + +// #include + +#define HIP_CHECK(STAT) \ + do { \ + const hipError_t stat = (STAT); \ + if(stat != hipSuccess) \ + { \ + std::ostringstream oss; \ + oss << "rocsparseSolverBackend::hip "; \ + oss << "error: " << hipGetErrorString(stat); \ + OPM_THROW(std::logic_error, oss.str()); \ + } \ + } while(0) + +namespace Opm::Accelerator { + +template +RocmMatrix:: +RocmMatrix(int Nb_, int Mb_, int nnzbs_, unsigned int block_size_) + : Nb(Nb_), + Mb(Mb_), + nnzbs(nnzbs_), + block_size(block_size_) +{ + HIP_CHECK(hipMalloc((void**)&nnzValues, sizeof(double) * block_size * block_size * nnzbs)); + + HIP_CHECK(hipMalloc((void**)&colIndices, sizeof(int) * nnzbs)); + + HIP_CHECK(hipMalloc((void**)&rowPointers, sizeof(int) * (Nb + 1))); +} + +template +void RocmMatrix:: +upload(Scalar *vals, int *cols, int *rows, hipStream_t stream) { + HIP_CHECK(hipMemcpyAsync(nnzValues, vals, sizeof(Scalar) * block_size * block_size * nnzbs, hipMemcpyHostToDevice, stream)); + + HIP_CHECK(hipMemcpyAsync(colIndices, cols, sizeof(int) * nnzbs, hipMemcpyHostToDevice, stream)); + + HIP_CHECK(hipMemcpyAsync(rowPointers, rows, sizeof(int) * (Nb + 1), hipMemcpyHostToDevice, stream)); +} + +template +void RocmMatrix:: +upload(Matrix *matrix, hipStream_t stream) { + if (block_size != 1) { + OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to RocmMatrix with different block_size"); + } + + upload(matrix->nnzValues.data(), matrix->colIndices.data(), matrix->rowPointers.data(), stream); +} + +template +void RocmMatrix:: +upload(BlockedMatrix *matrix, hipStream_t stream) { + if (matrix->block_size != block_size) { + OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to RocmMatrix with different block_size"); + } + + upload(matrix->nnzValues, matrix->colIndices, matrix->rowPointers, stream); +} + +template +RocmVector::RocmVector(int N) + : size(N) +{ + HIP_CHECK(hipMalloc((void**)&nnzValues, sizeof(Scalar) * N)); +} + +template +void RocmVector:: +upload(Scalar *vals, hipStream_t stream) { + HIP_CHECK(hipMemcpyAsync(nnzValues, vals, sizeof(Scalar) * size, hipMemcpyHostToDevice, stream)); +} + +#define INSTANCE_TYPE(T) \ +template class RocmVector;\ +template class RocmMatrix; + +INSTANCE_TYPE(int); +INSTANCE_TYPE(double); + +} // namespace Opm diff --git a/opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp b/opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp new file mode 100644 index 000000000..53feb1fea --- /dev/null +++ b/opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp @@ -0,0 +1,65 @@ +/* + Copyright 2024 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#ifndef OPM_ROCMMATRIX_HEADER_INCLUDED +#define OPM_ROCMMATRIX_HEADER_INCLUDED + +#include +#include + +namespace Opm::Accelerator { + +template class Matrix; +template class BlockedMatrix; + +/// This struct resembles a csr matrix, only doubles are supported +/// The matrix data is stored in OpenCL Buffers +template +class RocmMatrix { +public: + + RocmMatrix(int Nb_, int Mb_, int nnzbs_, unsigned int block_size_); + + void upload(Scalar *vals, int *cols, int *rows, hipStream_t stream); + void upload(Matrix *matrix, hipStream_t stream); + void upload(BlockedMatrix *matrix, hipStream_t stream); + + Scalar* nnzValues; + int* colIndices; + int* rowPointers; + int Nb, Mb; + int nnzbs; + unsigned int block_size; +}; + +template +class RocmVector { +public: + + RocmVector(int N); + + void upload(Scalar *vals, hipStream_t stream); + void upload(Matrix *matrix, hipStream_t stream); + + Scalar* nnzValues; + int size; +}; +} // namespace Opm + +#endif diff --git a/opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.cpp b/opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.cpp index 29bf0dc81..937e70e76 100644 --- a/opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.cpp +++ b/opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.cpp @@ -138,11 +138,11 @@ rocsparseSolverBackend(int verbosity_, int maxit_, Scalar tolerance_, ROCBLAS_CHECK(rocblas_set_stream(blas_handle, stream)); using PreconditionerType = typename Opm::Accelerator::PreconditionerType; -// if (use_cpr) { -// prec = rocsparsePreconditioner::create(PreconditionerType::CPR, verbosity); -// } else { + if (use_cpr) { + prec = rocsparsePreconditioner::create(PreconditionerType::CPR, verbosity); + } else { prec = rocsparsePreconditioner::create(PreconditionerType::BILU0, verbosity); -// } + } prec->set_context(handle, dir, operation, stream); }