From 7a307fafa05d0a6bacafd5fbf16996059484a4ab Mon Sep 17 00:00:00 2001
From: Razvan Nane <razvan.nane@bigdataccelerate.com>
Date: Tue, 4 Jun 2024 17:13:13 +0200
Subject: [PATCH] add rocsparseCPR preconditioner

---
 CMakeLists_files.cmake                        |   6 +
 opm/simulators/linalg/bda/rocm/hipKernels.cpp | 466 ++++++++++++++++++
 opm/simulators/linalg/bda/rocm/hipKernels.hpp |  57 +++
 .../linalg/bda/rocm/rocsparseBILU0.cpp        |  13 -
 .../linalg/bda/rocm/rocsparseCPR.cpp          | 361 ++++++++++++++
 .../linalg/bda/rocm/rocsparseCPR.hpp          | 112 +++++
 .../linalg/bda/rocm/rocsparseMatrix.cpp       | 113 +++++
 .../linalg/bda/rocm/rocsparseMatrix.hpp       |  65 +++
 .../bda/rocm/rocsparseSolverBackend.cpp       |   8 +-
 9 files changed, 1184 insertions(+), 17 deletions(-)
 create mode 100644 opm/simulators/linalg/bda/rocm/hipKernels.cpp
 create mode 100644 opm/simulators/linalg/bda/rocm/hipKernels.hpp
 create mode 100644 opm/simulators/linalg/bda/rocm/rocsparseCPR.cpp
 create mode 100644 opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp
 create mode 100644 opm/simulators/linalg/bda/rocm/rocsparseMatrix.cpp
 create mode 100644 opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp

diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake
index 0b4194b1f..8d7aa7cee 100644
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@@ -266,10 +266,13 @@ if(USE_BDA_BRIDGE)
     list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocalutionSolverBackend.cpp)
   endif()
   if(rocsparse_FOUND AND rocblas_FOUND)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseCPR.cpp)
     list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseBILU0.cpp)
     list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.cpp)
     list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.cpp)
     list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseWellContributions.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/hipKernels.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseMatrix.cpp)
   endif()
   if(CUDA_FOUND)
     list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu)
@@ -673,11 +676,14 @@ if (USE_BDA_BRIDGE)
     opm/simulators/linalg/bda/opencl/openclWellContributions.hpp
     opm/simulators/linalg/bda/Matrix.hpp
     opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
+    opm/simulators/linalg/bda/rocm/hipKernels.hpp
     opm/simulators/linalg/bda/rocm/rocalutionSolverBackend.hpp
     opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp
+    opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp
     opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp
     opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.hpp
     opm/simulators/linalg/bda/rocm/rocsparseWellContributions.hpp
+    opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp
     opm/simulators/linalg/bda/WellContributions.hpp
     opm/simulators/linalg/ISTLSolverBda.hpp
   )
diff --git a/opm/simulators/linalg/bda/rocm/hipKernels.cpp b/opm/simulators/linalg/bda/rocm/hipKernels.cpp
new file mode 100644
index 000000000..c4a0ade80
--- /dev/null
+++ b/opm/simulators/linalg/bda/rocm/hipKernels.cpp
@@ -0,0 +1,466 @@
+/*
+  Copyright 2024 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+#include <cmath>
+#include <sstream>
+
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+#include <dune/common/timer.hh>
+
+#include <opm/simulators/linalg/bda/rocm/hipKernels.hpp>
+
+#include <opm/simulators/linalg/bda/Misc.hpp>
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_version.h>
+
+#define HIP_CHECK(STAT)                                  \
+    do {                                                 \
+        const hipError_t stat = (STAT);                  \
+        if(stat != hipSuccess)                           \
+        {                                                \
+            std::ostringstream oss;                      \
+            oss << "rocsparseSolverBackend::hip ";       \
+            oss << "error: " << hipGetErrorString(stat); \
+            OPM_THROW(std::logic_error, oss.str());      \
+        }                                                \
+    } while(0)
+
+namespace Opm
+{
+
+using Opm::OpmLog;
+using Dune::Timer;
+
+// define static variables and kernels
+int HipKernels::verbosity;
+double* HipKernels::tmp;
+bool HipKernels::initialized = false;
+std::size_t HipKernels::preferred_workgroup_size_multiple = 0;
+
+#ifdef __HIP__
+/// HIP kernel to multiply vector with another vector and a scalar, element-wise
+// add result to a third vector
+__global__ void vmul_k(
+    const double alpha,
+    double const *in1,
+    double const *in2,
+    double *out,
+    const int N)
+{
+    unsigned int NUM_THREADS = gridDim.x;
+    int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    while(idx < N){
+        out[idx] += alpha * in1[idx] * in2[idx];
+        idx += NUM_THREADS;
+    }
+}
+
+/// HIP kernel to transform blocked vector to scalar vector using pressure-weights
+// every workitem handles one blockrow
+__global__ void full_to_pressure_restriction_k(
+    const double *fine_y,
+    const double *weights,
+    double *coarse_y,
+    const unsigned int Nb)
+{
+    const unsigned int NUM_THREADS = gridDim.x;
+    const unsigned int block_size = 3;
+    unsigned int target_block_row = blockDim.x * blockIdx.x + threadIdx.x;
+
+    while(target_block_row < Nb){
+        double sum = 0.0;
+        unsigned int idx = block_size * target_block_row;
+        for (unsigned int i = 0; i < block_size; ++i) {
+            sum += fine_y[idx + i] * weights[idx + i];
+        }
+        coarse_y[target_block_row] = sum;
+        target_block_row += NUM_THREADS;
+    }
+}
+
+/// HIP kernel to add the coarse pressure solution back to the finer, complete solution
+// every workitem handles one blockrow
+__global__ void add_coarse_pressure_correction_k(
+    const double *coarse_x,
+    double *fine_x,
+    const unsigned int pressure_idx,
+    const unsigned int Nb)
+{
+    const unsigned int NUM_THREADS = gridDim.x;
+    const unsigned int block_size = 3;
+    unsigned int target_block_row = blockDim.x * blockIdx.x + threadIdx.x;
+
+    while(target_block_row < Nb){
+        fine_x[target_block_row * block_size + pressure_idx] += coarse_x[target_block_row];
+        target_block_row += NUM_THREADS;
+    }
+}
+
+/// HIP kernel to prolongate vector during amg cycle
+// every workitem handles one row
+__global__ void prolongate_vector_k(
+     const double *in,
+     double *out,
+     const int *cols,
+     const unsigned int N)
+{
+    const unsigned int NUM_THREADS = gridDim.x;
+    unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
+
+    while(row < N){
+        out[row] += in[cols[row]];
+        row += NUM_THREADS;
+    }
+}
+
+// res = rhs - mat * x
+// algorithm based on:
+// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
+// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
+__global__ void residual_blocked_k(
+    const double *vals,
+    const int *cols,
+    const int *rows,
+    const int Nb,
+    const double *x,
+    const double *rhs,
+    double *out,
+    const unsigned int block_size
+    )
+{
+    extern __shared__ double tmp[];
+    const unsigned int warpsize = warpSize;
+    const unsigned int bsize = blockDim.x;
+    const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x;
+    const unsigned int idx_b = gid / bsize;
+    const unsigned int idx_t = threadIdx.x;
+    unsigned int idx = idx_b * bsize + idx_t;
+    const unsigned int bs = block_size;
+    const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
+    const unsigned int num_blocks_per_warp = warpsize/bs/bs;
+    const unsigned int NUM_THREADS = gridDim.x;
+    const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
+    unsigned int target_block_row = idx / warpsize;
+    const unsigned int lane = idx_t % warpsize;
+    const unsigned int c = (lane / bs) % bs;
+    const unsigned int r = lane % bs;
+    
+    // for 3x3 blocks:
+    // num_active_threads: 27 (CUDA) vs 63 (ROCM)
+    // num_blocks_per_warp: 3 (CUDA) vs  7 (ROCM)
+    int offsetTarget = warpsize == 64 ? 48 : 32; 
+    
+    while(target_block_row < Nb){
+        unsigned int first_block = rows[target_block_row];
+        unsigned int last_block = rows[target_block_row+1];
+        unsigned int block = first_block + lane / (bs*bs);
+        double local_out = 0.0;
+
+        if(lane < num_active_threads){
+            for(; block < last_block; block += num_blocks_per_warp){
+                double x_elem = x[cols[block]*bs + c];
+                double A_elem = vals[block*bs*bs + c + r*bs];
+                local_out += x_elem * A_elem;
+            }
+        }
+
+        // do reduction in shared mem
+        tmp[lane] = local_out; 
+        
+        for(unsigned int offset = 3; offset <= offsetTarget; offset <<= 1)
+        {
+            if (lane + offset < warpsize)
+            {
+                tmp[lane] += tmp[lane + offset];
+            }
+            __syncthreads();
+        }
+        
+        if(lane < bs){
+            unsigned int row = target_block_row*bs + lane;
+            out[row] = rhs[row] - tmp[lane];
+        }
+        target_block_row += num_warps_in_grid;
+    }
+}
+
+// KERNEL residual_k    
+// res = rhs - mat * x
+// algorithm based on:
+// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
+// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
+// template <unsigned shared_mem_size>
+__global__ void residual_k(
+    const double *vals,
+    const int *cols,
+    const int *rows,
+    const int N,
+    const double *x,
+    const double *rhs,
+    double *out
+    )
+{
+    extern __shared__ double tmp[];
+    const unsigned int bsize = blockDim.x;
+    const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x;
+    const unsigned int idx_b = gid / bsize;
+    const unsigned int idx_t = threadIdx.x;
+    const unsigned int num_workgroups = gridDim.x;
+
+    int row = idx_b;
+
+    while (row < N) {
+        int rowStart = rows[row];
+        int rowEnd = rows[row+1];
+        int rowLength = rowEnd - rowStart;
+        double local_sum = 0.0;
+        for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
+            int col = cols[j];
+            local_sum += vals[j] * x[col];
+        }
+
+        tmp[idx_t] = local_sum;
+        __syncthreads();
+
+        int offset = bsize / 2;
+        while(offset > 0) {
+            if (idx_t < offset) {
+                tmp[idx_t] += tmp[idx_t + offset];
+            }
+            __syncthreads();
+            offset = offset / 2;
+        }
+
+        if (idx_t == 0) {
+            out[row] = rhs[row] - tmp[idx_t];
+        }
+
+        row += num_workgroups;
+    }
+}
+
+__global__ void spmv_k(
+    const double *vals,
+    const int *cols,
+    const int *rows,
+    const int N,
+    const double *x,
+    double *out
+    )
+{
+    extern __shared__ double tmp[];
+    const unsigned int bsize = blockDim.x;
+    const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x;
+    const unsigned int idx_b = gid / bsize; 
+    const unsigned int idx_t = threadIdx.x;
+    const unsigned int num_workgroups = gridDim.x;
+
+    int row = idx_b;
+
+    while (row < N) {
+        int rowStart = rows[row];
+        int rowEnd = rows[row+1];
+        int rowLength = rowEnd - rowStart;
+        double local_sum = 0.0;
+        for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
+            int col = cols[j];
+            local_sum += vals[j] * x[col];
+        }
+
+        tmp[idx_t] = local_sum;
+        __syncthreads();
+
+        int offset = bsize / 2;
+        while(offset > 0) {
+            if (idx_t < offset) {
+                tmp[idx_t] += tmp[idx_t + offset];
+            }
+            __syncthreads();
+            offset = offset / 2;
+        }
+
+        if (idx_t == 0) {
+            out[row] = tmp[idx_t];
+        }
+
+        row += num_workgroups;
+    }
+}
+#endif
+
+void HipKernels::init(int verbosity_)
+{
+    if (initialized) {
+        OpmLog::debug("Warning HipKernels is already initialized");
+        return;
+    }
+
+    verbosity = verbosity_;
+
+    initialized = true;
+}
+
+void HipKernels::vmul(const double alpha, double* in1, double* in2, double* out, int N, hipStream_t stream)
+{
+    Timer t_vmul;
+#ifdef __HIP__
+    unsigned blockDim = 64;
+    unsigned blocks = Accelerator::ceilDivision(N, blockDim);
+    unsigned gridDim = blocks * blockDim;
+
+    // dim3(N) will create a vector {N, 1, 1}
+    vmul_k<<<dim3(gridDim), dim3(blockDim), 0, stream>>>(alpha, in1, in2, out, N);
+    HIP_CHECK(hipStreamSynchronize(stream));
+#else
+    OPM_THROW(std::logic_error, "Error vmul for rocsparse only supported when compiling with hipcc");
+#endif
+
+    if (verbosity >= 4) {
+        std::ostringstream oss;
+        oss << std::scientific << "HipKernels vmul() time: " << t_vmul.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
+void HipKernels::full_to_pressure_restriction(const double* fine_y, double* weights, double* coarse_y, int Nb, hipStream_t stream)
+{
+    Timer t;
+#ifdef __HIP__
+    unsigned blockDim = 64;
+    unsigned blocks = Accelerator::ceilDivision(Nb, blockDim);
+    unsigned gridDim = blocks * blockDim;
+
+    // dim3(N) will create a vector {N, 1, 1}
+    full_to_pressure_restriction_k<<<dim3(gridDim), dim3(blockDim), 0, stream>>>(fine_y, weights, coarse_y, Nb);
+    HIP_CHECK(hipStreamSynchronize(stream));
+#else
+    OPM_THROW(std::logic_error, "Error full_to_pressure_restriction for rocsparse only supported when compiling with hipcc");
+#endif
+    
+    if (verbosity >= 4) {
+        std::ostringstream oss;
+        oss << std::scientific << "HipKernels full_to_pressure_restriction() time: " << t.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
+void HipKernels::add_coarse_pressure_correction(double* coarse_x, double* fine_x, int pressure_idx, int Nb, hipStream_t stream)
+{
+    Timer t;
+#ifdef __HIP__
+    unsigned blockDim = 64;
+    unsigned blocks = Accelerator::ceilDivision(Nb, blockDim);
+    unsigned gridDim = blocks * blockDim;
+
+    // dim3(N) will create a vector {N, 1, 1}
+    add_coarse_pressure_correction_k<<<dim3(gridDim), dim3(blockDim), 0, stream>>>(coarse_x, fine_x, pressure_idx, Nb);
+    HIP_CHECK(hipStreamSynchronize(stream));
+#else
+    OPM_THROW(std::logic_error, "Error add_coarse_pressure_correction for rocsparse only supported when compiling with hipcc");
+#endif
+
+    if (verbosity >= 4) {
+        std::ostringstream oss;
+        oss << std::scientific << "HipKernels add_coarse_pressure_correction() time: " << t.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
+void HipKernels::prolongate_vector(const double* in, double* out, const int* cols, int N, hipStream_t stream)
+{
+    Timer t;
+    
+#ifdef __HIP__
+    unsigned blockDim = 64;
+    unsigned blocks = Accelerator::ceilDivision(N, blockDim);
+    unsigned gridDim = blocks * blockDim;
+    unsigned shared_mem_size = blockDim * sizeof(double); 
+
+    // dim3(N) will create a vector {N, 1, 1}
+    prolongate_vector_k<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(in, out, cols, N);
+    HIP_CHECK(hipStreamSynchronize(stream));
+#else
+    OPM_THROW(std::logic_error, "Error prolongate_vector for rocsparse only supported when compiling with hipcc");
+#endif
+
+    if (verbosity >= 4) {
+        std::ostringstream oss;
+        oss << std::scientific << "HipKernels prolongate_vector() time: " << t.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
+void HipKernels::residual(double* vals, int* cols, int* rows, double* x, const double* rhs, double* out, int Nb, unsigned int block_size, hipStream_t stream)
+{
+    Timer t_residual;
+
+#ifdef __HIP__
+    unsigned blockDim = 64;
+    const unsigned int num_work_groups = Accelerator::ceilDivision(Nb, blockDim);
+    unsigned gridDim = num_work_groups * blockDim;
+    unsigned shared_mem_size = blockDim * sizeof(double); 
+
+    if (block_size > 1) {
+        // dim3(N) will create a vector {N, 1, 1}
+        residual_blocked_k<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(vals, cols, rows, Nb, x, rhs, out, block_size);
+    } else {
+        residual_k<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(vals, cols, rows, Nb, x, rhs, out);
+    }
+    HIP_CHECK(hipStreamSynchronize(stream));
+#else
+    OPM_THROW(std::logic_error, "Error residual for rocsparse only supported when compiling with hipcc");
+#endif
+    
+    if (verbosity >= 4) {
+        HIP_CHECK(hipStreamSynchronize(stream));
+        std::ostringstream oss;
+        oss << std::scientific << "HipKernels residual() time: " << t_residual.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
+void HipKernels::spmv(double* vals, int* cols, int* rows, double* x, double* y, int Nb, unsigned int block_size, hipStream_t stream)
+{//NOTE: block_size not used since I use this kernel only for block sizes 1, other uses use rocsparse!
+    Timer t_spmv;
+#ifdef __HIP__
+    unsigned blockDim = 64;
+    const unsigned int num_work_groups = Accelerator::ceilDivision(Nb, blockDim);
+    unsigned gridDim = num_work_groups * blockDim;
+    unsigned shared_mem_size = blockDim * sizeof(double); 
+
+   spmv_k<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(vals, cols, rows, Nb, x, y);
+    
+    HIP_CHECK(hipStreamSynchronize(stream));
+#else
+    OPM_THROW(std::logic_error, "Error spmv for rocsparse only supported when compiling with hipcc");
+#endif
+    
+    if (verbosity >= 4) {
+        std::ostringstream oss;
+        oss << std::scientific << "HipKernels spmv_blocked() time: " << t_spmv.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
+
+} // namespace Opm
diff --git a/opm/simulators/linalg/bda/rocm/hipKernels.hpp b/opm/simulators/linalg/bda/rocm/hipKernels.hpp
new file mode 100644
index 000000000..a70733679
--- /dev/null
+++ b/opm/simulators/linalg/bda/rocm/hipKernels.hpp
@@ -0,0 +1,57 @@
+/*
+  Copyright 2024 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef HIPKERNELS_HPP
+#define HIPKERNELS_HPP
+
+#include <string>
+#include <memory>
+#include <cstddef>
+
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_version.h>
+
+// #include <opm/simulators/linalg/bda/opencl/opencl.hpp>
+
+namespace Opm
+{
+
+class HipKernels
+{
+private:
+    static int verbosity;
+    static double* tmp;     // used as tmp CPU buffer for dot() and norm()
+    static bool initialized;
+    static std::size_t preferred_workgroup_size_multiple; // stores CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
+    
+    HipKernels(){}; // disable instantiation
+    
+public:
+    static void init(int verbosity);
+    static void full_to_pressure_restriction(const double* fine_y, double* weights, double* coarse_y, int Nb, hipStream_t stream);
+    static void add_coarse_pressure_correction(double* coarse_x, double* fine_x, int pressure_idx, int Nb, hipStream_t stream);
+    static void vmul(const double alpha, double* in1, double* in2, double* out, int N, hipStream_t stream);
+    static void prolongate_vector(const double* in, double* out, const int* cols, int N, hipStream_t stream);
+    static void residual(double* vals, int* cols, int* rows, double* x, const double* rhs, double* out, int Nb, unsigned int block_size, hipStream_t stream);
+    static void spmv(double* vals, int* cols, int* rows, double* x, double* y, int Nb, unsigned int block_size, hipStream_t stream);
+};
+
+} // namespace Opm
+
+#endif
diff --git a/opm/simulators/linalg/bda/rocm/rocsparseBILU0.cpp b/opm/simulators/linalg/bda/rocm/rocsparseBILU0.cpp
index 5092b747f..986ab3bba 100644
--- a/opm/simulators/linalg/bda/rocm/rocsparseBILU0.cpp
+++ b/opm/simulators/linalg/bda/rocm/rocsparseBILU0.cpp
@@ -301,17 +301,4 @@ apply(double& y, double& x) {
     template class rocsparseBILU0<T,6>;
 
 INSTANCE_TYPE(double)
-// #define INSTANTIATE_BDA_FUNCTIONS(n) \
-// template class rocsparseBILU0<n>;
-// 
-// INSTANTIATE_BDA_FUNCTIONS(1);
-// INSTANTIATE_BDA_FUNCTIONS(2);
-// INSTANTIATE_BDA_FUNCTIONS(3);
-// INSTANTIATE_BDA_FUNCTIONS(4);
-// INSTANTIATE_BDA_FUNCTIONS(5);
-// INSTANTIATE_BDA_FUNCTIONS(6);
-// 
-// #undef INSTANTIATE_BDA_FUNCTIONS
-// 
-// } // namespace Accelerator
 } // namespace Opm
diff --git a/opm/simulators/linalg/bda/rocm/rocsparseCPR.cpp b/opm/simulators/linalg/bda/rocm/rocsparseCPR.cpp
new file mode 100644
index 000000000..133847967
--- /dev/null
+++ b/opm/simulators/linalg/bda/rocm/rocsparseCPR.cpp
@@ -0,0 +1,361 @@
+/*
+  Copyright 2021 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+#include <opm/common/TimingMacros.hpp>
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+#include <dune/common/timer.hh>
+
+#include <dune/common/shared_ptr.hh>
+
+#include <opm/simulators/linalg/PreconditionerFactory.hpp>
+#include <opm/simulators/linalg/PropertyTree.hpp>
+
+#include <opm/simulators/linalg/bda/BdaBridge.hpp>
+#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
+#include <opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp>
+#include <opm/simulators/linalg/bda/rocm/hipKernels.hpp>
+
+#include <opm/simulators/linalg/bda/Misc.hpp>
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_version.h>
+
+#define HIP_CHECK(STAT)                                  \
+    do {                                                 \
+        const hipError_t stat = (STAT);                  \
+        if(stat != hipSuccess)                           \
+        {                                                \
+            std::ostringstream oss;                      \
+            oss << "rocsparseSolverBackend::hip ";       \
+            oss << "error: " << hipGetErrorString(stat); \
+            OPM_THROW(std::logic_error, oss.str());      \
+        }                                                \
+    } while(0)
+
+#define ROCSPARSE_CHECK(STAT)                            \
+    do {                                                 \
+        const rocsparse_status stat = (STAT);            \
+        if(stat != rocsparse_status_success)             \
+        {                                                \
+            std::ostringstream oss;                      \
+            oss << "rocsparseSolverBackend::rocsparse "; \
+            oss << "error: " << stat;                    \
+            OPM_THROW(std::logic_error, oss.str());      \
+        }                                                \
+    } while(0)
+
+#define ROCBLAS_CHECK(STAT)                              \
+    do {                                                 \
+        const rocblas_status stat = (STAT);              \
+        if(stat != rocblas_status_success)               \
+        {                                                \
+            std::ostringstream oss;                      \
+            oss << "rocsparseSolverBackend::rocblas ";   \
+            oss << "error: " << stat;                    \
+            OPM_THROW(std::logic_error, oss.str());      \
+        }                                                \
+    } while(0)
+
+namespace Opm::Accelerator {
+
+using Opm::OpmLog;
+using Dune::Timer;
+
+template <class Scalar, unsigned int block_size>
+rocsparseCPR<Scalar, block_size>::rocsparseCPR(int verbosity_) :
+    rocsparsePreconditioner<Scalar, block_size>(verbosity_)
+{
+    bilu0 = std::make_unique<rocsparseBILU0<Scalar, block_size> >(verbosity_);
+}
+
+template <class Scalar, unsigned int block_size>
+bool rocsparseCPR<Scalar, block_size>::
+initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix, std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix, rocsparse_int *d_Arows, rocsparse_int *d_Acols) {
+    this->Nb = matrix->Nb;
+    this->nnzb = matrix->nnzbs;
+    this->N = Nb * block_size;
+    this->nnz = nnzb * block_size * block_size;
+
+    bilu0->set_context(this->handle, this->dir, this->operation, this->stream);
+    bilu0->setJacMat(*this->jacMat.get());
+    bilu0->initialize(matrix,jacMatrix,d_Arows,d_Acols);
+    return true;
+}
+
+template <class Scalar, unsigned int block_size>
+void rocsparseCPR<Scalar, block_size>::
+copy_system_to_gpu(Scalar *b) {
+    bilu0->copy_system_to_gpu(b);
+}
+
+template <class Scalar, unsigned int block_size>
+void rocsparseCPR<Scalar, block_size>::
+update_system_on_gpu(Scalar *vals) {
+    bilu0->update_system_on_gpu(vals);
+}
+
+template <class Scalar, unsigned int block_size>
+bool rocsparseCPR<Scalar, block_size>::
+analyze_matrix(BlockedMatrix<Scalar> *mat_) {
+    this->Nb = mat_->Nb;
+    this->nnzb = mat_->nnzbs;
+    this->N = Nb * block_size;
+    this->nnz = nnzb * block_size * block_size;
+    
+    bool success = bilu0->analyze_matrix(mat_);
+    
+    this->mat = mat_;  
+    
+    return success;
+}
+
+template <class Scalar, unsigned int block_size>
+bool rocsparseCPR<Scalar, block_size>::
+analyze_matrix(BlockedMatrix<Scalar> *mat_, BlockedMatrix<Scalar> *jacMat) {
+    this->Nb = mat_->Nb;
+    this->nnzb = mat_->nnzbs;
+    this->N = Nb * block_size;
+    this->nnz = nnzb * block_size * block_size;
+
+    bool success = bilu0->analyze_matrix(mat_, jacMat);
+    this->mat = mat_;
+
+    return success;
+}
+
+template <class Scalar, unsigned int block_size>
+bool rocsparseCPR<Scalar, block_size>::create_preconditioner(BlockedMatrix<Scalar> *mat_, BlockedMatrix<Scalar> *jacMat) {
+    Dune::Timer t_bilu0;
+    bool result = bilu0->create_preconditioner(mat_, jacMat);
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "rocsparseCPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
+        OpmLog::info(out.str());
+    }
+    
+    Dune::Timer t_amg;
+    this->create_preconditioner_amg(this->mat);
+
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "rocsparseCPR create_preconditioner_amg(): " << t_amg.stop() << " s";
+        OpmLog::info(out.str());
+    }
+    
+    auto init_func = std::bind(&rocsparseCPR::init_rocm_buffers, this);
+    std::call_once(rocm_buffers_allocated, init_func);
+
+    // upload matrices and vectors to GPU
+    rocm_upload();
+
+    return result;
+}
+
+template <class Scalar, unsigned int block_size>
+bool rocsparseCPR<Scalar, block_size>::
+create_preconditioner(BlockedMatrix<Scalar> *mat_) {
+    Dune::Timer t_bilu0;
+    bool result = bilu0->create_preconditioner(mat_);
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "rocsparseCPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
+        OpmLog::info(out.str());
+    }
+
+    Dune::Timer t_amg;
+    this->create_preconditioner_amg(this->mat); 
+    
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "rocsparseCPR create_preconditioner_amg(): " << t_amg.stop() << " s";
+        OpmLog::info(out.str());
+    }
+    
+    auto init_func = std::bind(&rocsparseCPR::init_rocm_buffers, this);
+    std::call_once(rocm_buffers_allocated, init_func);
+
+    // upload matrices and vectors to GPU
+    rocm_upload();
+    
+    return result;
+}
+
+template <class Scalar, unsigned int block_size>
+void rocsparseCPR<Scalar, block_size>::
+init_rocm_buffers() {
+    d_Amatrices.reserve(this->num_levels);
+    d_Rmatrices.reserve(this->num_levels - 1);
+    d_invDiags.reserve(this->num_levels - 1);
+    for (Matrix<Scalar>& m : this->Amatrices) {
+        d_Amatrices.emplace_back(m.N, m.M, m.nnzs, 1);//TOCHECK: this memory might not be contigous!!
+    }
+    
+    for (Matrix<Scalar>& m : this->Rmatrices) {
+        d_Rmatrices.emplace_back(m.N, m.M, m.nnzs, 1);
+        d_f.emplace_back(m.N);
+        d_u.emplace_back(m.N);
+        d_PcolIndices.emplace_back(m.M);
+        d_invDiags.emplace_back(m.M);
+        d_t.emplace_back(m.M);
+    }
+    
+    d_weights.emplace_back(this->N);
+    d_rs.emplace_back(this->N);
+    d_mat = std::make_unique<RocmMatrix<Scalar>>(this->Nb, this->Nb, this->nnzb, block_size);
+    
+    d_coarse_y.emplace_back(this->Nb);
+    d_coarse_x.emplace_back(this->Nb);
+}
+
+template <class Scalar, unsigned int block_size>
+void rocsparseCPR<Scalar, block_size>::
+rocm_upload() {
+     d_mat->upload(this->mat, this->stream);
+     
+     HIP_CHECK(hipMemcpyAsync(d_weights.data()->nnzValues, this->weights.data(), sizeof(Scalar) * this->N, hipMemcpyHostToDevice, this->stream));
+    
+    for (unsigned int i = 0; i < this->Rmatrices.size(); ++i) {
+        d_Amatrices[i].upload(&this->Amatrices[i], this->stream);
+        
+        HIP_CHECK(hipMemcpyAsync(d_invDiags[i].nnzValues, this->invDiags[i].data(), sizeof(Scalar) * this->Amatrices[i].N, hipMemcpyHostToDevice, this->stream));
+        HIP_CHECK(hipMemcpyAsync(d_PcolIndices[i].nnzValues, this->PcolIndices[i].data(), sizeof(int) * this->Amatrices[i].N, hipMemcpyHostToDevice, this->stream));
+    }
+    
+    for (unsigned int i = 0; i < this->Rmatrices.size(); ++i) {
+        d_Rmatrices[i].upload(&this->Rmatrices[i], this->stream);
+    }
+}
+
+template <class Scalar, unsigned int block_size>
+void rocsparseCPR<Scalar, block_size>::
+amg_cycle_gpu(const int level, Scalar &y, Scalar &x) {
+    RocmMatrix<Scalar> *A = &d_Amatrices[level];
+    RocmMatrix<Scalar> *R = &d_Rmatrices[level];
+    int Ncur = A->Nb;
+    Scalar zero = 0.0;
+    Scalar one = 1.0;
+    
+    rocsparse_mat_info spmv_info;
+    rocsparse_mat_descr descr_R;
+    ROCSPARSE_CHECK(rocsparse_create_mat_info(&spmv_info));
+    ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_R));
+
+    if (level == this->num_levels - 1) {
+        // solve coarsest level
+        std::vector<Scalar> h_y(Ncur), h_x(Ncur, 0);
+
+        HIP_CHECK(hipMemcpyAsync(h_y.data(), &y, sizeof(Scalar) * Ncur, hipMemcpyDeviceToHost, this->stream));
+        
+        // solve coarsest level using umfpack
+        this->umfpack.apply(h_x.data(), h_y.data());
+
+        HIP_CHECK(hipMemcpyAsync(&x, h_x.data(), sizeof(Scalar) * Ncur, hipMemcpyHostToDevice, this->stream));
+        
+        return;
+    }
+    
+    int Nnext = d_Amatrices[level+1].Nb;
+
+    RocmVector<Scalar>& t = d_t[level];
+    RocmVector<Scalar>& f = d_f[level];
+    RocmVector<Scalar>& u = d_u[level]; // u was 0-initialized earlier
+
+    // presmooth
+    Scalar jacobi_damping = 0.65; // default value in amgcl: 0.72
+    for (unsigned i = 0; i < this->num_pre_smooth_steps; ++i){
+        HipKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
+        HipKernels::vmul(jacobi_damping, d_invDiags[level].nnzValues, t.nnzValues, &x, Ncur, this->stream);
+    }
+
+    // move to coarser level
+    HipKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
+
+// TODO: understand why rocsparse spmv library function does not here. 
+//     ROCSPARSE_CHECK(rocsparse_dbsrmv(this->handle, this->dir, this->operation,
+//                                          R->Nb, R->Mb, R->nnzbs, &one, descr_R,
+//                                          R->nnzValues, R->rowPointers, R->colIndices, 1,
+//                                          t.nnzValues, &zero, f.nnzValues));
+    HipKernels::spmv(R->nnzValues, R->colIndices, R->rowPointers, t.nnzValues, f.nnzValues, Nnext, 1, this->stream);
+
+    amg_cycle_gpu(level + 1, *f.nnzValues, *u.nnzValues);
+    HipKernels::prolongate_vector(u.nnzValues, &x, d_PcolIndices[level].nnzValues, Ncur, this->stream);
+
+    // postsmooth
+    for (unsigned i = 0; i < this->num_post_smooth_steps; ++i){
+        HipKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
+        HipKernels::vmul(jacobi_damping, d_invDiags[level].nnzValues, t.nnzValues, &x, Ncur, this->stream);
+    }
+}
+
+// x = prec(y)
+template <class Scalar, unsigned int block_size>
+void rocsparseCPR<Scalar, block_size>::
+apply_amg(const Scalar& y, Scalar& x) {
+    HIP_CHECK(hipMemsetAsync(d_coarse_x.data()->nnzValues, 0, sizeof(Scalar) * this->Nb, this->stream));
+    
+    for (unsigned int i = 0; i < d_u.size(); ++i) {
+        d_u[i].upload(this->Rmatrices[i].nnzValues.data(), this->stream);
+    }
+    
+    HipKernels::residual(d_mat->nnzValues, d_mat->colIndices, d_mat->rowPointers, &x, &y, d_rs.data()->nnzValues, this->Nb, block_size, this->stream);
+    
+    HipKernels::full_to_pressure_restriction(d_rs.data()->nnzValues, d_weights.data()->nnzValues, d_coarse_y.data()->nnzValues, Nb, this->stream);
+    
+    amg_cycle_gpu(0, *(d_coarse_y.data()->nnzValues), *(d_coarse_x.data()->nnzValues));
+
+    HipKernels::add_coarse_pressure_correction(d_coarse_x.data()->nnzValues, &x, this->pressure_idx, Nb, this->stream);
+}
+
+template <class Scalar, unsigned int block_size>
+void rocsparseCPR<Scalar, block_size>::
+apply(Scalar& y, Scalar& x) {
+    Dune::Timer t_bilu0;
+
+    bilu0->apply(y, x);
+
+    if (verbosity >= 3) {
+        HIP_CHECK(hipStreamSynchronize(this->stream));
+        std::ostringstream out;
+        out << "rocsparseCPR apply bilu0(): " << t_bilu0.stop() << " s";
+        OpmLog::info(out.str());
+    }
+
+    Dune::Timer t_amg;
+    apply_amg(y, x);
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "rocsparseCPR apply amg(): " << t_amg.stop() << " s";
+        OpmLog::info(out.str());
+    }
+}
+
+#define INSTANCE_TYPE(T)                \
+    template class rocsparseCPR<T,1>; \
+    template class rocsparseCPR<T,2>; \
+    template class rocsparseCPR<T,3>; \
+    template class rocsparseCPR<T,4>; \
+    template class rocsparseCPR<T,5>; \
+    template class rocsparseCPR<T,6>;
+
+INSTANCE_TYPE(double)
+} // namespace Opm
+
+
diff --git a/opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp b/opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp
new file mode 100644
index 000000000..9dad1671a
--- /dev/null
+++ b/opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp
@@ -0,0 +1,112 @@
+/*
+  Copyright 2024 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef OPM_ROCSPARSECPR_HPP
+#define OPM_ROCSPARSECPR_HPP
+
+#include <mutex>
+
+#include <opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp>
+#include <opm/simulators/linalg/bda/Matrix.hpp>
+#include <opm/simulators/linalg/bda/CprCreation.hpp>
+#include <opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp>
+#include <opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp>
+
+#include <opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.hpp>
+
+namespace Opm::Accelerator {
+
+template<class Scalar> class BlockedMatrix;
+
+/// This class implements a Constrained Pressure Residual (CPR) preconditioner
+template <class Scalar, unsigned int block_size>
+class rocsparseCPR : public rocsparsePreconditioner<Scalar, block_size>, public CprCreation<Scalar, block_size>
+{
+    typedef rocsparsePreconditioner<Scalar, block_size> Base;
+
+    using Base::N;
+    using Base::Nb;
+    using Base::nnz;
+    using Base::nnzb;
+    using Base::verbosity;
+
+private:
+    std::vector<RocmMatrix<Scalar>> d_Amatrices, d_Rmatrices; // scalar matrices that represent the AMG hierarchy
+    
+    std::vector<RocmVector<int>> d_PcolIndices; // prolongation does not need a full matrix, only store colIndices
+    std::vector<RocmVector<Scalar>> d_invDiags; // inverse of diagonal of Amatrices
+    std::vector<RocmVector<Scalar>> d_t, d_f; // intermediate vectors used during amg cycle
+    std::vector<RocmVector<Scalar>> d_u; // intermediate vectors used during amg cycle
+    std::vector<RocmVector<Scalar>> d_rs;      // use before extracting the pressure
+    std::vector<RocmVector<Scalar>> d_weights; // the quasiimpes weights, used to extract pressure
+    std::unique_ptr<RocmMatrix<Scalar>> d_mat;   // stores blocked matrix
+    std::vector<RocmVector<Scalar>> d_coarse_y, d_coarse_x; // stores the scalar vectors
+    std::once_flag rocm_buffers_allocated;  // only allocate OpenCL Buffers once
+
+    std::unique_ptr<rocsparseBILU0<Scalar, block_size> > bilu0;                    // Blocked ILU0 preconditioner
+
+    std::unique_ptr<rocsparseSolverBackend<Scalar, 1> > coarse_solver; // coarse solver is scalar
+
+    // Initialize and allocate matrices and vectors
+    void init_rocm_buffers();//TODO: rename to rocm/c and update in code
+
+    // Copy matrices and vectors to GPU
+    void rocm_upload();//TODO: rename to rocm/c and update in code
+
+    // apply pressure correction to vector
+    void apply_amg(const Scalar& y, Scalar& x);
+
+    void amg_cycle_gpu(const int level, Scalar &y, Scalar &x);
+    
+public:
+
+    rocsparseCPR(int verbosity);
+
+    bool initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix, std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix, rocsparse_int *d_Arows, rocsparse_int *d_Acols);
+    
+    void copy_system_to_gpu(Scalar *b);
+
+    /// Reassign pointers, in case the addresses of the Dune variables have changed
+    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
+    /// \param[in] b              input vector b, contains N values
+//     void update_system(Scalar *vals, Scalar *b);
+
+    /// Update linear system to GPU
+    /// \param[in] b              input vector, contains N values
+    void update_system_on_gpu(Scalar *b);
+
+    bool analyze_matrix(BlockedMatrix<Scalar> *mat);
+    bool analyze_matrix(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat);
+
+    bool create_preconditioner(BlockedMatrix<Scalar> *mat);
+    bool create_preconditioner(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat);
+    
+#if HAVE_OPENCL
+    // apply preconditioner, x = prec(y)
+    void apply(const cl::Buffer& y, cl::Buffer& x) {}
+#endif
+    // applies blocked ilu0
+    // also applies amg for pressure component
+    void apply(Scalar& y, Scalar& x);
+};
+
+} // namespace Opm
+
+#endif
+
diff --git a/opm/simulators/linalg/bda/rocm/rocsparseMatrix.cpp b/opm/simulators/linalg/bda/rocm/rocsparseMatrix.cpp
new file mode 100644
index 000000000..95966b422
--- /dev/null
+++ b/opm/simulators/linalg/bda/rocm/rocsparseMatrix.cpp
@@ -0,0 +1,113 @@
+/*
+  Copyright 2024 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+
+#include <opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp>
+#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
+#include <opm/simulators/linalg/bda/Matrix.hpp>
+
+#include <sstream>
+#include <iostream>
+
+// #include <opm/simulators/linalg/bda/opencl/opencl.hpp>
+
+#define HIP_CHECK(STAT)                                  \
+    do {                                                 \
+        const hipError_t stat = (STAT);                  \
+        if(stat != hipSuccess)                           \
+        {                                                \
+            std::ostringstream oss;                      \
+            oss << "rocsparseSolverBackend::hip ";       \
+            oss << "error: " << hipGetErrorString(stat); \
+            OPM_THROW(std::logic_error, oss.str());      \
+        }                                                \
+    } while(0)
+
+namespace Opm::Accelerator {
+
+template<class Scalar>
+RocmMatrix<Scalar>::
+RocmMatrix(int Nb_, int Mb_, int nnzbs_, unsigned int block_size_)
+    : Nb(Nb_),
+      Mb(Mb_),
+      nnzbs(nnzbs_),
+      block_size(block_size_)
+{
+    HIP_CHECK(hipMalloc((void**)&nnzValues, sizeof(double) * block_size * block_size * nnzbs));
+        
+    HIP_CHECK(hipMalloc((void**)&colIndices, sizeof(int) * nnzbs));
+
+    HIP_CHECK(hipMalloc((void**)&rowPointers, sizeof(int) * (Nb + 1)));
+}
+
+template <class Scalar>
+void RocmMatrix<Scalar>::
+upload(Scalar *vals, int *cols, int *rows, hipStream_t stream) {
+    HIP_CHECK(hipMemcpyAsync(nnzValues, vals, sizeof(Scalar) * block_size * block_size * nnzbs, hipMemcpyHostToDevice, stream));
+    
+    HIP_CHECK(hipMemcpyAsync(colIndices, cols, sizeof(int) * nnzbs, hipMemcpyHostToDevice, stream));
+    
+    HIP_CHECK(hipMemcpyAsync(rowPointers, rows, sizeof(int) * (Nb + 1), hipMemcpyHostToDevice, stream));
+}
+
+template <class Scalar>
+void RocmMatrix<Scalar>::
+upload(Matrix<Scalar> *matrix, hipStream_t stream) {
+    if (block_size != 1) {
+        OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to RocmMatrix with different block_size");
+    }
+
+    upload(matrix->nnzValues.data(), matrix->colIndices.data(), matrix->rowPointers.data(), stream);
+}
+
+template <class Scalar>
+void RocmMatrix<Scalar>::
+upload(BlockedMatrix<Scalar> *matrix, hipStream_t stream) {
+    if (matrix->block_size != block_size) {
+        OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to RocmMatrix with different block_size");
+    }
+
+    upload(matrix->nnzValues, matrix->colIndices, matrix->rowPointers, stream);
+}
+
+template <class Scalar>
+RocmVector<Scalar>::RocmVector(int N)
+    : size(N)
+{
+    HIP_CHECK(hipMalloc((void**)&nnzValues, sizeof(Scalar) * N));
+}
+
+template <class Scalar>
+void RocmVector<Scalar>::
+upload(Scalar *vals, hipStream_t stream) {
+    HIP_CHECK(hipMemcpyAsync(nnzValues, vals, sizeof(Scalar) * size, hipMemcpyHostToDevice, stream));    
+}
+
+#define INSTANCE_TYPE(T)  \
+template class RocmVector<T>;\
+template class RocmMatrix<T>;
+
+INSTANCE_TYPE(int);
+INSTANCE_TYPE(double);
+
+} // namespace Opm
diff --git a/opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp b/opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp
new file mode 100644
index 000000000..53feb1fea
--- /dev/null
+++ b/opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp
@@ -0,0 +1,65 @@
+/*
+  Copyright 2024 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef OPM_ROCMMATRIX_HEADER_INCLUDED
+#define OPM_ROCMMATRIX_HEADER_INCLUDED
+
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_version.h>
+
+namespace Opm::Accelerator {
+
+template<class Scalar> class Matrix;
+template<class Scalar> class BlockedMatrix;
+
+/// This struct resembles a csr matrix, only doubles are supported
+/// The matrix data is stored in OpenCL Buffers
+template<class Scalar>
+class RocmMatrix {
+public:
+
+    RocmMatrix(int Nb_, int Mb_, int nnzbs_, unsigned int block_size_);
+
+    void upload(Scalar *vals, int *cols, int *rows, hipStream_t stream);
+    void upload(Matrix<Scalar> *matrix, hipStream_t stream);
+    void upload(BlockedMatrix<Scalar> *matrix, hipStream_t stream);
+
+    Scalar* nnzValues;
+    int* colIndices;
+    int* rowPointers;
+    int Nb, Mb;
+    int nnzbs;
+    unsigned int block_size;
+};
+
+template <typename Scalar>
+class RocmVector {
+public:
+
+    RocmVector(int N);
+
+    void upload(Scalar *vals, hipStream_t stream);
+    void upload(Matrix<Scalar> *matrix, hipStream_t stream);
+
+    Scalar* nnzValues;
+    int size;
+};
+} // namespace Opm
+
+#endif
diff --git a/opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.cpp b/opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.cpp
index 29bf0dc81..937e70e76 100644
--- a/opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.cpp
@@ -138,11 +138,11 @@ rocsparseSolverBackend(int verbosity_, int maxit_, Scalar tolerance_,
     ROCBLAS_CHECK(rocblas_set_stream(blas_handle, stream));
     
     using PreconditionerType = typename Opm::Accelerator::PreconditionerType;
-//     if (use_cpr) {
-//         prec = rocsparsePreconditioner<block_size>::create(PreconditionerType::CPR, verbosity);
-//     } else {
+    if (use_cpr) {
+        prec = rocsparsePreconditioner<Scalar, block_size>::create(PreconditionerType::CPR, verbosity);
+    } else {
         prec = rocsparsePreconditioner<Scalar, block_size>::create(PreconditionerType::BILU0, verbosity);
-//     }
+    }
     
     prec->set_context(handle, dir, operation, stream);
 }