cleanup and run fix

2025-02-25 18:55:30 -06:00 · 2024-06-05 15:00:47 +02:00 · 2024-06-05 15:00:47 +02:00 · dcbd9be46a
commit dcbd9be46a
parent 7a307fafa0
15 changed files with 490 additions and 341 deletions
--- a/opm/simulators/linalg/bda/CprCreation.cpp
+++ b/opm/simulators/linalg/bda/CprCreation.cpp
@ -47,7 +47,8 @@ CprCreation<Scalar, block_size>::CprCreation()

 template <class Scalar, unsigned int block_size>
 void CprCreation<Scalar, block_size>::
-create_preconditioner_amg(BlockedMatrix<Scalar> *mat_) {
+create_preconditioner_amg(BlockedMatrix<Scalar> *mat_)
+{
    mat = mat_;
    cprNb = mat_->Nb;
    cprnnzb = mat_->nnzbs;
@ -183,7 +184,9 @@ create_preconditioner_amg(BlockedMatrix<Scalar> *mat_) {
 }

 template <class Scalar, unsigned int block_size>
-void CprCreation<Scalar, block_size>::analyzeHierarchy() {
+void CprCreation<Scalar, block_size>::
+analyzeHierarchy()
+{
    const typename DuneAmg::ParallelMatrixHierarchy& matrixHierarchy = dune_amg->matrices();

    // store coarsest AMG level in umfpack format, also performs LU decomposition
@ -237,8 +240,9 @@ void CprCreation<Scalar, block_size>::analyzeHierarchy() {


 template <class Scalar, unsigned int block_size>
-void CprCreation<Scalar, block_size>::analyzeAggregateMaps() {
-
+void CprCreation<Scalar, block_size>::
+analyzeAggregateMaps() 
+{
    PcolIndices.resize(num_levels - 1);
    Rmatrices.clear();

--- a/opm/simulators/linalg/bda/Misc.cpp
+++ b/opm/simulators/linalg/bda/Misc.cpp
@ -6,14 +6,16 @@
 namespace Opm::Accelerator {

 // divide A by B, and round up: return (int)ceil(A/B)
-unsigned int ceilDivision(const unsigned int A, const unsigned int B)
+unsigned int ceilDivision(const unsigned int A,
+                          const unsigned int B)
 {
    return A / B + (A % B > 0);
 }

 // return the absolute value of the N elements for which the absolute value is highest
 template<class Scalar>
-Scalar get_absmax(const Scalar* data, const int N)
+Scalar get_absmax(const Scalar* data,
+                  const int N)
 {
    return std::abs(*std::max_element(data, data + N,
                                      [](Scalar a, Scalar b)
@ -22,7 +24,9 @@ Scalar get_absmax(const Scalar* data, const int N)

 // solve A^T * x = b
 template<class Scalar>
-void solve_transposed_3x3(const Scalar* A, const Scalar* b, Scalar* x)
+void solve_transposed_3x3(const Scalar* A, 
+                          const Scalar* b,
+                          Scalar* x)
 {
    const int B = 3;
    // from dune-common/densematrix.hh, but transposed, so replace [r*B+c] with [r+c*B]
--- a/opm/simulators/linalg/bda/Misc.hpp
+++ b/opm/simulators/linalg/bda/Misc.hpp
@ -1,15 +1,58 @@
 #ifndef OPM_MISC_HPP
 #define OPM_MISC_HPP

+#include <hip/hip_runtime_api.h>
+#include <hip/hip_version.h>
+
+#define HIP_CHECK(STAT)                                  \
+    do {                                                 \
+        const hipError_t stat = (STAT);                  \
+        if(stat != hipSuccess)                           \
+        {                                                \
+            std::ostringstream oss;                      \
+            oss << "rocsparseSolverBackend::hip ";       \
+            oss << "error: " << hipGetErrorString(stat); \
+            OPM_THROW(std::logic_error, oss.str());      \
+        }                                                \
+    } while(0)
+
+#define ROCSPARSE_CHECK(STAT)                            \
+    do {                                                 \
+        const rocsparse_status stat = (STAT);            \
+        if(stat != rocsparse_status_success)             \
+        {                                                \
+            std::ostringstream oss;                      \
+            oss << "rocsparseSolverBackend::rocsparse "; \
+            oss << "error: " << stat;                    \
+            OPM_THROW(std::logic_error, oss.str());      \
+        }                                                \
+    } while(0)
+
+#define ROCBLAS_CHECK(STAT)                              \
+    do {                                                 \
+        const rocblas_status stat = (STAT);              \
+        if(stat != rocblas_status_success)               \
+        {                                                \
+            std::ostringstream oss;                      \
+            oss << "rocsparseSolverBackend::rocblas ";   \
+            oss << "error: " << stat;                    \
+            OPM_THROW(std::logic_error, oss.str());      \
+        }                                                \
+    } while(0)
+
 namespace Opm::Accelerator {

-unsigned int ceilDivision(const unsigned int A, const unsigned int B);
+unsigned int ceilDivision(const unsigned int A,
+                          const unsigned int B);

 template<class Scalar>
-Scalar get_absmax(const Scalar *data, const int N);
+Scalar get_absmax(const Scalar *data,
+                  const int N);

 template<class Scalar>
-void solve_transposed_3x3(const Scalar *A, const Scalar *b, Scalar *x);
+void solve_transposed_3x3(const Scalar *A,
+                          const Scalar *b,
+                          Scalar *x);

 }

--- a/opm/simulators/linalg/bda/rocm/hipKernels.cpp
+++ b/opm/simulators/linalg/bda/rocm/hipKernels.cpp
@ -28,43 +28,26 @@
 #include <opm/simulators/linalg/bda/rocm/hipKernels.hpp>

 #include <opm/simulators/linalg/bda/Misc.hpp>
-
 #include <hip/hip_runtime.h>
-#include <hip/hip_version.h>

-#define HIP_CHECK(STAT)                                  \
-    do {                                                 \
-        const hipError_t stat = (STAT);                  \
-        if(stat != hipSuccess)                           \
-        {                                                \
-            std::ostringstream oss;                      \
-            oss << "rocsparseSolverBackend::hip ";       \
-            oss << "error: " << hipGetErrorString(stat); \
-            OPM_THROW(std::logic_error, oss.str());      \
-        }                                                \
-    } while(0)
-
-namespace Opm
-{
+namespace Opm {

 using Opm::OpmLog;
 using Dune::Timer;

 // define static variables and kernels
-int HipKernels::verbosity;
-double* HipKernels::tmp;
-bool HipKernels::initialized = false;
-std::size_t HipKernels::preferred_workgroup_size_multiple = 0;
+template<class Scalar> int  HipKernels<Scalar>::verbosity = 0;
+template<class Scalar> bool HipKernels<Scalar>::initialized = false;

 #ifdef __HIP__
 /// HIP kernel to multiply vector with another vector and a scalar, element-wise
 // add result to a third vector
-__global__ void vmul_k(
-    const double alpha,
-    double const *in1,
-    double const *in2,
-    double *out,
-    const int N)
+template<class Scalar>
+__global__ void vmul_k(const Scalar alpha,
+                       Scalar const *in1,
+                       Scalar const *in2,
+                       Scalar *out,
+                       const int N)
 {
    unsigned int NUM_THREADS = gridDim.x;
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
@ -77,18 +60,18 @@ __global__ void vmul_k(

 /// HIP kernel to transform blocked vector to scalar vector using pressure-weights
 // every workitem handles one blockrow
-__global__ void full_to_pressure_restriction_k(
-    const double *fine_y,
-    const double *weights,
-    double *coarse_y,
-    const unsigned int Nb)
+template<class Scalar>
+__global__ void full_to_pressure_restriction_k(const Scalar *fine_y,
+                                               const Scalar *weights,
+                                               Scalar *coarse_y,
+                                               const unsigned int Nb)
 {
    const unsigned int NUM_THREADS = gridDim.x;
    const unsigned int block_size = 3;
    unsigned int target_block_row = blockDim.x * blockIdx.x + threadIdx.x;

    while(target_block_row < Nb){
-        double sum = 0.0;
+        Scalar sum = 0.0;
        unsigned int idx = block_size * target_block_row;
        for (unsigned int i = 0; i < block_size; ++i) {
            sum += fine_y[idx + i] * weights[idx + i];
@ -100,11 +83,11 @@ __global__ void full_to_pressure_restriction_k(

 /// HIP kernel to add the coarse pressure solution back to the finer, complete solution
 // every workitem handles one blockrow
-__global__ void add_coarse_pressure_correction_k(
-    const double *coarse_x,
-    double *fine_x,
-    const unsigned int pressure_idx,
-    const unsigned int Nb)
+template<class Scalar>
+__global__ void add_coarse_pressure_correction_k(const Scalar *coarse_x,
+                                                 Scalar *fine_x,
+                                                 const unsigned int pressure_idx,
+                                                 const unsigned int Nb)
 {
    const unsigned int NUM_THREADS = gridDim.x;
    const unsigned int block_size = 3;
@ -118,11 +101,11 @@ __global__ void add_coarse_pressure_correction_k(

 /// HIP kernel to prolongate vector during amg cycle
 // every workitem handles one row
-__global__ void prolongate_vector_k(
-     const double *in,
-     double *out,
-     const int *cols,
-     const unsigned int N)
+template<class Scalar>
+__global__ void prolongate_vector_k(const Scalar *in,
+                                    Scalar *out,
+                                    const int *cols,
+                                    const unsigned int N)
 {
    const unsigned int NUM_THREADS = gridDim.x;
    unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
@ -137,18 +120,17 @@ __global__ void prolongate_vector_k(
 // algorithm based on:
 // Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
 // Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
-__global__ void residual_blocked_k(
-    const double *vals,
-    const int *cols,
-    const int *rows,
-    const int Nb,
-    const double *x,
-    const double *rhs,
-    double *out,
-    const unsigned int block_size
-    )
+template<class Scalar>
+__global__ void residual_blocked_k(const Scalar *vals,
+                                   const int *cols,
+                                   const int *rows,
+                                   const int Nb,
+                                   const Scalar *x,
+                                   const Scalar *rhs,
+                                   Scalar *out,
+                                   const unsigned int block_size)
 {
-    extern __shared__ double tmp[];
+    extern __shared__ Scalar tmp[];
    const unsigned int warpsize = warpSize;
    const unsigned int bsize = blockDim.x;
    const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x;
@ -174,12 +156,12 @@ __global__ void residual_blocked_k(
        unsigned int first_block = rows[target_block_row];
        unsigned int last_block = rows[target_block_row+1];
        unsigned int block = first_block + lane / (bs*bs);
-        double local_out = 0.0;
+        Scalar local_out = 0.0;

        if(lane < num_active_threads){
            for(; block < last_block; block += num_blocks_per_warp){
-                double x_elem = x[cols[block]*bs + c];
-                double A_elem = vals[block*bs*bs + c + r*bs];
+                Scalar x_elem = x[cols[block]*bs + c];
+                Scalar A_elem = vals[block*bs*bs + c + r*bs];
                local_out += x_elem * A_elem;
            }
        }
@ -210,17 +192,16 @@ __global__ void residual_blocked_k(
 // Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
 // Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
 // template <unsigned shared_mem_size>
-__global__ void residual_k(
-    const double *vals,
-    const int *cols,
-    const int *rows,
-    const int N,
-    const double *x,
-    const double *rhs,
-    double *out
-    )
+template<class Scalar>
+__global__ void residual_k(const Scalar *vals,
+                           const int *cols,
+                           const int *rows,
+                           const int N,
+                           const Scalar *x,
+                           const Scalar *rhs,
+                           Scalar *out)
 {
-    extern __shared__ double tmp[];
+    extern __shared__ Scalar tmp[];
    const unsigned int bsize = blockDim.x;
    const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int idx_b = gid / bsize;
@ -233,7 +214,7 @@ __global__ void residual_k(
        int rowStart = rows[row];
        int rowEnd = rows[row+1];
        int rowLength = rowEnd - rowStart;
-        double local_sum = 0.0;
+        Scalar local_sum = 0.0;
        for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
            int col = cols[j];
            local_sum += vals[j] * x[col];
@ -259,16 +240,15 @@ __global__ void residual_k(
    }
 }

-__global__ void spmv_k(
-    const double *vals,
-    const int *cols,
-    const int *rows,
-    const int N,
-    const double *x,
-    double *out
-    )
+template<class Scalar>
+__global__ void spmv_k(const Scalar *vals,
+                       const int *cols,
+                       const int *rows,
+                       const int N,
+                       const Scalar *x,
+                       Scalar *out)
 {
-    extern __shared__ double tmp[];
+    extern __shared__ Scalar tmp[];
    const unsigned int bsize = blockDim.x;
    const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int idx_b = gid / bsize; 
@ -281,7 +261,7 @@ __global__ void spmv_k(
        int rowStart = rows[row];
        int rowEnd = rows[row+1];
        int rowLength = rowEnd - rowStart;
-        double local_sum = 0.0;
+        Scalar local_sum = 0.0;
        for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
            int col = cols[j];
            local_sum += vals[j] * x[col];
@ -308,7 +288,9 @@ __global__ void spmv_k(
 }
 #endif

-void HipKernels::init(int verbosity_)
+template<class Scalar>
+void HipKernels<Scalar>::
+init(int verbosity_)
 {
    if (initialized) {
        OpmLog::debug("Warning HipKernels is already initialized");
@ -320,7 +302,14 @@ void HipKernels::init(int verbosity_)
    initialized = true;
 }

-void HipKernels::vmul(const double alpha, double* in1, double* in2, double* out, int N, hipStream_t stream)
+template<class Scalar>
+void HipKernels<Scalar>::
+vmul(const Scalar alpha,
+     Scalar* in1,
+     Scalar* in2,
+     Scalar* out,
+     int N,
+     hipStream_t stream)
 {
    Timer t_vmul;
 #ifdef __HIP__
@ -342,7 +331,13 @@ void HipKernels::vmul(const double alpha, double* in1, double* in2, double* out,
    }
 }

-void HipKernels::full_to_pressure_restriction(const double* fine_y, double* weights, double* coarse_y, int Nb, hipStream_t stream)
+template<class Scalar>
+void HipKernels<Scalar>::
+full_to_pressure_restriction(const Scalar* fine_y,
+                             Scalar* weights,
+                             Scalar* coarse_y,
+                             int Nb,
+                             hipStream_t stream)
 {
    Timer t;
 #ifdef __HIP__
@ -364,7 +359,13 @@ void HipKernels::full_to_pressure_restriction(const double* fine_y, double* weig
    }
 }

-void HipKernels::add_coarse_pressure_correction(double* coarse_x, double* fine_x, int pressure_idx, int Nb, hipStream_t stream)
+template<class Scalar>
+void HipKernels<Scalar>::
+add_coarse_pressure_correction(Scalar* coarse_x,
+                               Scalar* fine_x,
+                               int pressure_idx,
+                               int Nb,
+                               hipStream_t stream)
 {
    Timer t;
 #ifdef __HIP__
@ -386,7 +387,13 @@ void HipKernels::add_coarse_pressure_correction(double* coarse_x, double* fine_x
    }
 }

-void HipKernels::prolongate_vector(const double* in, double* out, const int* cols, int N, hipStream_t stream)
+template<class Scalar>
+void HipKernels<Scalar>::
+prolongate_vector(const Scalar* in,
+                  Scalar* out,
+                  const int* cols,
+                  int N,
+                  hipStream_t stream)
 {
    Timer t;
    
@ -394,7 +401,7 @@ void HipKernels::prolongate_vector(const double* in, double* out, const int* col
    unsigned blockDim = 64;
    unsigned blocks = Accelerator::ceilDivision(N, blockDim);
    unsigned gridDim = blocks * blockDim;
-    unsigned shared_mem_size = blockDim * sizeof(double); 
+    unsigned shared_mem_size = blockDim * sizeof(Scalar); 

    // dim3(N) will create a vector {N, 1, 1}
    prolongate_vector_k<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(in, out, cols, N);
@ -410,7 +417,17 @@ void HipKernels::prolongate_vector(const double* in, double* out, const int* col
    }
 }

-void HipKernels::residual(double* vals, int* cols, int* rows, double* x, const double* rhs, double* out, int Nb, unsigned int block_size, hipStream_t stream)
+template<class Scalar>
+void HipKernels<Scalar>::
+residual(Scalar* vals,
+         int* cols,
+         int* rows,
+         Scalar* x,
+         const Scalar* rhs,
+         Scalar* out,
+         int Nb,
+         unsigned int block_size,
+         hipStream_t stream)
 {
    Timer t_residual;

@ -418,7 +435,7 @@ void HipKernels::residual(double* vals, int* cols, int* rows, double* x, const d
    unsigned blockDim = 64;
    const unsigned int num_work_groups = Accelerator::ceilDivision(Nb, blockDim);
    unsigned gridDim = num_work_groups * blockDim;
-    unsigned shared_mem_size = blockDim * sizeof(double); 
+    unsigned shared_mem_size = blockDim * sizeof(Scalar); 

    if (block_size > 1) {
        // dim3(N) will create a vector {N, 1, 1}
@ -439,14 +456,23 @@ void HipKernels::residual(double* vals, int* cols, int* rows, double* x, const d
    }
 }

-void HipKernels::spmv(double* vals, int* cols, int* rows, double* x, double* y, int Nb, unsigned int block_size, hipStream_t stream)
+template<class Scalar>
+void HipKernels<Scalar>::
+spmv(Scalar* vals,
+     int* cols,
+     int* rows,
+     Scalar* x,
+     Scalar* y,
+     int Nb,
+     unsigned int block_size,
+     hipStream_t stream)
 {//NOTE: block_size not used since I use this kernel only for block sizes 1, other uses use rocsparse!
    Timer t_spmv;
 #ifdef __HIP__
    unsigned blockDim = 64;
    const unsigned int num_work_groups = Accelerator::ceilDivision(Nb, blockDim);
    unsigned gridDim = num_work_groups * blockDim;
-    unsigned shared_mem_size = blockDim * sizeof(double); 
+    unsigned shared_mem_size = blockDim * sizeof(Scalar); 

   spmv_k<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(vals, cols, rows, Nb, x, y);
    
@ -462,5 +488,6 @@ void HipKernels::spmv(double* vals, int* cols, int* rows, double* x, double* y,
    }
 }

+template class HipKernels<double>;

 } // namespace Opm
--- a/opm/simulators/linalg/bda/rocm/hipKernels.hpp
+++ b/opm/simulators/linalg/bda/rocm/hipKernels.hpp
@ -17,8 +17,8 @@
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef HIPKERNELS_HPP
-#define HIPKERNELS_HPP
+#ifndef OPM_HIPKERNELS_HPP
+#define OPM_HIPKERNELS_HPP

 #include <string>
 #include <memory>
@ -27,29 +27,110 @@
 #include <hip/hip_runtime_api.h>
 #include <hip/hip_version.h>

-// #include <opm/simulators/linalg/bda/opencl/opencl.hpp>
-
-namespace Opm
-{
+namespace Opm {

+template<class Scalar>
 class HipKernels
 {
 private:
-    static int verbosity;
-    static double* tmp;     // used as tmp CPU buffer for dot() and norm()
+    static int  verbosity;
    static bool initialized;
-    static std::size_t preferred_workgroup_size_multiple; // stores CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
    
-    HipKernels(){}; // disable instantiation
+    HipKernels();
    
 public:
+    /// Initialize verbosity level for the HIP kernels
+    /// \param[in] verbosity   verbosity level
    static void init(int verbosity);
-    static void full_to_pressure_restriction(const double* fine_y, double* weights, double* coarse_y, int Nb, hipStream_t stream);
-    static void add_coarse_pressure_correction(double* coarse_x, double* fine_x, int pressure_idx, int Nb, hipStream_t stream);
-    static void vmul(const double alpha, double* in1, double* in2, double* out, int N, hipStream_t stream);
-    static void prolongate_vector(const double* in, double* out, const int* cols, int N, hipStream_t stream);
-    static void residual(double* vals, int* cols, int* rows, double* x, const double* rhs, double* out, int Nb, unsigned int block_size, hipStream_t stream);
-    static void spmv(double* vals, int* cols, int* rows, double* x, double* y, int Nb, unsigned int block_size, hipStream_t stream);
+    
+    /// Transform blocked vector to scalar vector using pressure-weights, where every workitem handles one blockrow
+    /// \param[in]  fine_y     Input y vector
+    /// \param[in]  weights    Weights used to combine cells
+    /// \param[out] course_y   Output y vector 
+    /// \param[in]  Nb         Number of blocks in the original matrix
+    /// \param[in]  stream     Hip stream to use for the computations
+    static void full_to_pressure_restriction(const Scalar* fine_y,
+                                             Scalar* weights,
+                                             Scalar* coarse_y,
+                                             int Nb,
+                                             hipStream_t stream);
+
+    /// Add the coarse pressure solution back to the finer, complete solution; every workitem handles one blockrow
+    /// \param[in]  coarse_x     Input scalar x vector
+    /// \param[out] fine_x       Output blocked x vector 
+    /// \param[in]  pressure_idx Pressure index
+    /// \param[in]  Nb           Number of blocks in the original matrix
+    /// \param[in]  stream       Hip stream to use for the computations
+    static void add_coarse_pressure_correction(Scalar* coarse_x,
+                                               Scalar* fine_x,
+                                               int pressure_idx,
+                                               int Nb,
+                                               hipStream_t stream);
+
+
+    /// Function to multiply vector with another vector and a scalar, element-wise and add the result to a third vector (out = alpha * in1 + in2) 
+    /// \param[in]  alpha   Input scalar 
+    /// \param[in]  in1     First input vector 
+    /// \param[in]  in2     Second input vector
+    /// \param[out] out     Output vector
+    /// \param[in]  N       Size of the vector
+    /// \param[in]  stream  Hip stream to use for the computations
+    static void vmul(const Scalar alpha,
+                     Scalar* in1,
+                     Scalar* in2,
+                     Scalar* out,
+                     int N,
+                     hipStream_t stream);
+    
+    /// Function to prolongate vector during amg cycle, every workitem handles one row
+    /// \param[in]  in      Input fine-grained vector
+    /// \param[out] out     Output course-graned vector 
+    /// \param[in]  cols    Column indexes
+    /// \param[in]  N       Size of the vector
+    /// \param[in]  stream  Hip stream to use for the computations
+    static void prolongate_vector(const Scalar* in,
+                                  Scalar* out,
+                                  const int* cols,
+                                  int N,
+                                  hipStream_t stream);
+
+    /// Function to perform res = rhs - mat * x
+    /// \param[in]  vals       Matrix values
+    /// \param[in]  cols       Column indexes
+    /// \param[in]  rows       Row pointers
+    /// \param[in]  x          X vector
+    /// \param[in]  rhs        Rhs vector
+    /// \param[out] out        Output res vector
+    /// \param[in]  Nb         Number of non-zero blocks in the original matrix
+    /// \param[in]  block_size Block size
+    /// \param[in]  stream     Hip stream to use for the computations
+    static void residual(Scalar* vals,
+                         int* cols,
+                         int* rows,
+                         Scalar* x,
+                         const Scalar* rhs,
+                         Scalar* out,
+                         int Nb,
+                         unsigned int block_size,
+                         hipStream_t stream);
+
+    /// Function to perform sparse matrix vector multipliation
+    /// \param[in]  vals       Matrix values
+    /// \param[in]  cols       Column indexes
+    /// \param[in]  rows       Row pointers
+    /// \param[in]  x          Input x vector
+    /// \param[out] y          Output y vector
+    /// \param[in]  Nb         Number of non-zero blocks in the original matrix
+    /// \param[in]  block_size Block size
+    /// \param[in]  stream     Hip stream to use for the computations
+    static void spmv(Scalar* vals,
+                     int* cols,
+                     int* rows,
+                     Scalar* x,
+                     Scalar* y,
+                     int Nb,
+                     unsigned int block_size,
+                     hipStream_t stream);
 };

 } // namespace Opm
--- a/opm/simulators/linalg/bda/rocm/rocsparseBILU0.cpp
+++ b/opm/simulators/linalg/bda/rocm/rocsparseBILU0.cpp
@ -26,48 +26,11 @@

 #include <opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp> 
 #include <opm/simulators/linalg/bda/Reorder.hpp>
+#include <opm/simulators/linalg/bda/Misc.hpp>

 #include <sstream>

-#include <iostream> //Razvan
-#include <hip/hip_runtime_api.h>
-
-#define HIP_CHECK(STAT)                                  \
-    do {                                                 \
-        const hipError_t stat = (STAT);                  \
-        if(stat != hipSuccess)                           \
-        {                                                \
-            std::ostringstream oss;                      \
-            oss << "rocsparseBILU0::hip ";       \
-            oss << "error: " << hipGetErrorString(stat); \
-            OPM_THROW(std::logic_error, oss.str());      \
-        }                                                \
-    } while(0)
-
-#define ROCSPARSE_CHECK(STAT)                            \
-    do {                                                 \
-        const rocsparse_status stat = (STAT);            \
-        if(stat != rocsparse_status_success)             \
-        {                                                \
-            std::ostringstream oss;                      \
-            oss << "rocsparseBILU0::rocsparse "; \
-            oss << "error: " << stat;                    \
-            OPM_THROW(std::logic_error, oss.str());      \
-        }                                                \
-    } while(0)
-
-#define ROCBLAS_CHECK(STAT)                              \
-    do {                                                 \
-        const rocblas_status stat = (STAT);              \
-        if(stat != rocblas_status_success)               \
-        {                                                \
-            std::ostringstream oss;                      \
-            oss << "rocsparseBILU0::rocblas ";   \
-            oss << "error: " << stat;                    \
-            OPM_THROW(std::logic_error, oss.str());      \
-        }                                                \
-    } while(0)
-    
+   
 #if HAVE_OPENMP
 #include <thread>
 #include <omp.h>
@ -88,7 +51,11 @@ rocsparseBILU0(int verbosity_) :

 template <class Scalar, unsigned int block_size>
 bool rocsparseBILU0<Scalar, block_size>::
-initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix, std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix, rocsparse_int *d_Arows, rocsparse_int *d_Acols) { 
+initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+           std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
+           rocsparse_int *d_Arows,
+           rocsparse_int *d_Acols) 
+{ 
    this->Nb = matrix->Nb;
    this->N = Nb * block_size;
    this->nnzb = matrix->nnzbs;
@ -101,14 +68,14 @@ initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix, std::shared_ptr<Blocke
        this->jacMat = jacMatrix;
    }

-     HIP_CHECK(hipMalloc((void**)&d_t, sizeof(double) * this->N));
+    HIP_CHECK(hipMalloc((void**)&d_t, sizeof(Scalar) * this->N));

    if (this->useJacMatrix) {
        HIP_CHECK(hipMalloc((void**)&d_Mrows, sizeof(rocsparse_int) * (Nb + 1)));
        HIP_CHECK(hipMalloc((void**)&d_Mcols, sizeof(rocsparse_int) * this->nnzbs_prec));
-        HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(double) * this->nnzbs_prec * block_size * block_size));
+        HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(Scalar) * this->nnzbs_prec * block_size * block_size));
    } else { // preconditioner matrix is same
-        HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(double) * this->nnzbs_prec * block_size * block_size));
+        HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(Scalar) * this->nnzbs_prec * block_size * block_size));
        d_Mcols = d_Acols;
        d_Mrows = d_Arows;
    }
@ -125,10 +92,13 @@ analyze_matrix(BlockedMatrix<Scalar> *mat) {

 template <class Scalar, unsigned int block_size>
 bool rocsparseBILU0<Scalar, block_size>::
-analyze_matrix(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat) {
+analyze_matrix(BlockedMatrix<Scalar> *mat, 
+               BlockedMatrix<Scalar> *jacMat) 
+{
    std::size_t d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
    Timer t;
    
+    ROCSPARSE_CHECK(rocsparse_create_mat_info(&ilu_info));
 #if HIP_VERSION >= 50400000
    ROCSPARSE_CHECK(rocsparse_create_mat_info(&spmv_info));
 #endif
@ -193,7 +163,9 @@ create_preconditioner(BlockedMatrix<Scalar> *mat) {

 template <class Scalar, unsigned int block_size>
 bool rocsparseBILU0<Scalar, block_size>::
-create_preconditioner(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat) {
+create_preconditioner(BlockedMatrix<Scalar> *mat, 
+                      BlockedMatrix<Scalar> *jacMat) 
+{
    Timer t;
    bool result = true;
    
@ -219,7 +191,7 @@ create_preconditioner(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat)

 template <class Scalar, unsigned int block_size>
 void rocsparseBILU0<Scalar, block_size>::
-copy_system_to_gpu(double *d_Avals) {
+copy_system_to_gpu(Scalar *d_Avals) {
    Timer t;
    
    if (this->useJacMatrix) {
@ -230,9 +202,9 @@ copy_system_to_gpu(double *d_Avals) {
 #endif
        HIP_CHECK(hipMemcpyAsync(d_Mrows, this->jacMat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, this->stream));
        HIP_CHECK(hipMemcpyAsync(d_Mcols, this->jacMat->colIndices, sizeof(rocsparse_int) * this->nnzbs_prec, hipMemcpyHostToDevice, this->stream));
-        HIP_CHECK(hipMemcpyAsync(d_Mvals, this->jacMat->nnzValues, sizeof(double) * this->nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, this->stream));
+        HIP_CHECK(hipMemcpyAsync(d_Mvals, this->jacMat->nnzValues, sizeof(Scalar) * this->nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, this->stream));
    } else {
-        HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(double) * nnz, hipMemcpyDeviceToDevice, this->stream));
+        HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(Scalar) * nnz, hipMemcpyDeviceToDevice, this->stream));
    }

    if (verbosity >= 3) {
@ -246,7 +218,7 @@ copy_system_to_gpu(double *d_Avals) {
 // don't copy rowpointers and colindices, they stay the same
 template <class Scalar, unsigned int block_size>
 void rocsparseBILU0<Scalar, block_size>::
-update_system_on_gpu(double *d_Avals) {
+update_system_on_gpu(Scalar *d_Avals) {
    Timer t;

    if (this->useJacMatrix) {
@ -255,9 +227,9 @@ update_system_on_gpu(double *d_Avals) {
           copyThread->join();
        }
 #endif
-        HIP_CHECK(hipMemcpyAsync(d_Mvals, this->jacMat->nnzValues, sizeof(double) * this->nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, this->stream));
+        HIP_CHECK(hipMemcpyAsync(d_Mvals, this->jacMat->nnzValues, sizeof(Scalar) * this->nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, this->stream));
    } else {
-        HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(double) * nnz, hipMemcpyDeviceToDevice, this->stream));
+        HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(Scalar) * nnz, hipMemcpyDeviceToDevice, this->stream));
    }

    if (verbosity >= 3) {
@ -270,9 +242,9 @@ update_system_on_gpu(double *d_Avals) {

 template <class Scalar, unsigned int block_size>
 void rocsparseBILU0<Scalar, block_size>::
-apply(double& y, double& x) {
-    double zero = 0.0;
-    double one  = 1.0;
+apply(Scalar& y, Scalar& x) {
+    Scalar zero = 0.0;
+    Scalar one  = 1.0;

    Timer t_apply;

--- a/opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp
+++ b/opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp
@ -57,7 +57,7 @@ private:
 #endif

    rocsparse_int *d_Mrows, *d_Mcols; 
-    double *d_Mvals, *d_t;
+    Scalar *d_Mvals, *d_t;
    void *d_buffer; // buffer space, used by rocsparse ilu0 analysis
    
 public:
@ -67,29 +67,57 @@ public:
    /// Initialize GPU and allocate memory
    /// \param[in] matrix     matrix A
    /// \param[in] jacMatrix  matrix for preconditioner
-    bool initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix, std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix, rocsparse_int *d_Arows, rocsparse_int *d_Acols);
+    bool initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                    std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
+                    rocsparse_int *d_Arows,
+                    rocsparse_int *d_Acols);

-    // analysis, extract parallelism if specified
+    /// Analysis, extract parallelism if specified
+    /// \param[in] mat     matrix A
    bool analyze_matrix(BlockedMatrix<Scalar> *mat);
-    bool analyze_matrix(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat);
+    
+    /// Analysis, extract parallelism if specified
+    /// \param[in] mat     matrix A
+    /// \param[in] jacMat  matrix for preconditioner, analyze this as well
+    bool analyze_matrix(BlockedMatrix<Scalar> *mat,
+                        BlockedMatrix<Scalar> *jacMat);

-    // ilu_decomposition
+    /// ILU decomposition
+    /// \param[in] mat     matrix A to decompose
    bool create_preconditioner(BlockedMatrix<Scalar> *mat) override;
-    bool create_preconditioner(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat) override;
+    
+    /// ILU decomposition
+    /// \param[in] mat     matrix A
+    /// \param[in] jacMat  matrix for preconditioner, decompose this one if used
+    bool create_preconditioner(BlockedMatrix<Scalar> *mat,
+                               BlockedMatrix<Scalar> *jacMat) override;

-    // apply preconditioner, x = prec(y)
-    // via Lz = y
-    // and Ux = z
-    void apply(double& y, double& x) override;
+    /// Apply preconditioner, x = prec(y)
+    /// via Lz = y
+    /// and Ux = z
+    /// \param[in]  y  Input y vector
+    /// \param[out] x  Output x vector
+    void apply(Scalar& y,
+               Scalar& x) override;

 #if HAVE_OPENCL
    // apply preconditioner, x = prec(y)
-    void apply(const cl::Buffer& y, cl::Buffer& x) {}
+    void apply(const cl::Buffer& y,
+               cl::Buffer& x) {}
 #endif

-    void copy_system_to_gpu(double *mVals);
-    void update_system(double *vals, double *b);
-    void update_system_on_gpu(double *b);
+    /// Copy matrix A values to GPU
+    /// \param[in]  mVals  Input values
+    void copy_system_to_gpu(Scalar *mVals);
+
+    /// Reassign pointers, in case the addresses of the Dune variables have changed --> TODO: check when/if we need this method
+//     /// \param[in]  vals  New values
+//     /// \param[in] b     New b vector
+//     void update_system(Scalar *vals, Scalar *b);
+    
+    /// Update GPU values after a new assembly is done
+    /// \param[in] b     New b vector
+    void update_system_on_gpu(Scalar *b);
 };
 } // namespace Opm

--- a/opm/simulators/linalg/bda/rocm/rocsparseCPR.cpp
+++ b/opm/simulators/linalg/bda/rocm/rocsparseCPR.cpp
@ -35,45 +35,6 @@

 #include <opm/simulators/linalg/bda/Misc.hpp>

-#include <hip/hip_runtime.h>
-#include <hip/hip_version.h>
-
-#define HIP_CHECK(STAT)                                  \
-    do {                                                 \
-        const hipError_t stat = (STAT);                  \
-        if(stat != hipSuccess)                           \
-        {                                                \
-            std::ostringstream oss;                      \
-            oss << "rocsparseSolverBackend::hip ";       \
-            oss << "error: " << hipGetErrorString(stat); \
-            OPM_THROW(std::logic_error, oss.str());      \
-        }                                                \
-    } while(0)
-
-#define ROCSPARSE_CHECK(STAT)                            \
-    do {                                                 \
-        const rocsparse_status stat = (STAT);            \
-        if(stat != rocsparse_status_success)             \
-        {                                                \
-            std::ostringstream oss;                      \
-            oss << "rocsparseSolverBackend::rocsparse "; \
-            oss << "error: " << stat;                    \
-            OPM_THROW(std::logic_error, oss.str());      \
-        }                                                \
-    } while(0)
-
-#define ROCBLAS_CHECK(STAT)                              \
-    do {                                                 \
-        const rocblas_status stat = (STAT);              \
-        if(stat != rocblas_status_success)               \
-        {                                                \
-            std::ostringstream oss;                      \
-            oss << "rocsparseSolverBackend::rocblas ";   \
-            oss << "error: " << stat;                    \
-            OPM_THROW(std::logic_error, oss.str());      \
-        }                                                \
-    } while(0)
-
 namespace Opm::Accelerator {

 using Opm::OpmLog;
@ -88,7 +49,11 @@ rocsparseCPR<Scalar, block_size>::rocsparseCPR(int verbosity_) :

 template <class Scalar, unsigned int block_size>
 bool rocsparseCPR<Scalar, block_size>::
-initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix, std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix, rocsparse_int *d_Arows, rocsparse_int *d_Acols) {
+initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+           std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
+           rocsparse_int *d_Arows,
+           rocsparse_int *d_Acols)
+{
    this->Nb = matrix->Nb;
    this->nnzb = matrix->nnzbs;
    this->N = Nb * block_size;
@ -129,7 +94,9 @@ analyze_matrix(BlockedMatrix<Scalar> *mat_) {

 template <class Scalar, unsigned int block_size>
 bool rocsparseCPR<Scalar, block_size>::
-analyze_matrix(BlockedMatrix<Scalar> *mat_, BlockedMatrix<Scalar> *jacMat) {
+analyze_matrix(BlockedMatrix<Scalar> *mat_,
+               BlockedMatrix<Scalar> *jacMat)
+{
    this->Nb = mat_->Nb;
    this->nnzb = mat_->nnzbs;
    this->N = Nb * block_size;
@ -142,7 +109,10 @@ analyze_matrix(BlockedMatrix<Scalar> *mat_, BlockedMatrix<Scalar> *jacMat) {
 }

 template <class Scalar, unsigned int block_size>
-bool rocsparseCPR<Scalar, block_size>::create_preconditioner(BlockedMatrix<Scalar> *mat_, BlockedMatrix<Scalar> *jacMat) {
+bool rocsparseCPR<Scalar, block_size>::
+create_preconditioner(BlockedMatrix<Scalar> *mat_,
+                      BlockedMatrix<Scalar> *jacMat)
+{
    Dune::Timer t_bilu0;
    bool result = bilu0->create_preconditioner(mat_, jacMat);
    if (verbosity >= 3) {
@ -205,7 +175,7 @@ init_rocm_buffers() {
    d_Rmatrices.reserve(this->num_levels - 1);
    d_invDiags.reserve(this->num_levels - 1);
    for (Matrix<Scalar>& m : this->Amatrices) {
-        d_Amatrices.emplace_back(m.N, m.M, m.nnzs, 1);//TOCHECK: this memory might not be contigous!!
+        d_Amatrices.emplace_back(m.N, m.M, m.nnzs, 1);
    }
    
    for (Matrix<Scalar>& m : this->Rmatrices) {
@ -246,7 +216,10 @@ rocm_upload() {

 template <class Scalar, unsigned int block_size>
 void rocsparseCPR<Scalar, block_size>::
-amg_cycle_gpu(const int level, Scalar &y, Scalar &x) {
+amg_cycle_gpu(const int level,
+              Scalar &y,
+              Scalar &x)
+{
    RocmMatrix<Scalar> *A = &d_Amatrices[level];
    RocmMatrix<Scalar> *R = &d_Rmatrices[level];
    int Ncur = A->Nb;
@ -281,52 +254,56 @@ amg_cycle_gpu(const int level, Scalar &y, Scalar &x) {
    // presmooth
    Scalar jacobi_damping = 0.65; // default value in amgcl: 0.72
    for (unsigned i = 0; i < this->num_pre_smooth_steps; ++i){
-        HipKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
-        HipKernels::vmul(jacobi_damping, d_invDiags[level].nnzValues, t.nnzValues, &x, Ncur, this->stream);
+        HipKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
+        HipKernels<Scalar>::vmul(jacobi_damping, d_invDiags[level].nnzValues, t.nnzValues, &x, Ncur, this->stream);
    }

    // move to coarser level
-    HipKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
+    HipKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);

 // TODO: understand why rocsparse spmv library function does not here. 
 //     ROCSPARSE_CHECK(rocsparse_dbsrmv(this->handle, this->dir, this->operation,
 //                                          R->Nb, R->Mb, R->nnzbs, &one, descr_R,
 //                                          R->nnzValues, R->rowPointers, R->colIndices, 1,
 //                                          t.nnzValues, &zero, f.nnzValues));
-    HipKernels::spmv(R->nnzValues, R->colIndices, R->rowPointers, t.nnzValues, f.nnzValues, Nnext, 1, this->stream);
+    HipKernels<Scalar>::spmv(R->nnzValues, R->colIndices, R->rowPointers, t.nnzValues, f.nnzValues, Nnext, 1, this->stream);

    amg_cycle_gpu(level + 1, *f.nnzValues, *u.nnzValues);
-    HipKernels::prolongate_vector(u.nnzValues, &x, d_PcolIndices[level].nnzValues, Ncur, this->stream);
+    HipKernels<Scalar>::prolongate_vector(u.nnzValues, &x, d_PcolIndices[level].nnzValues, Ncur, this->stream);

    // postsmooth
    for (unsigned i = 0; i < this->num_post_smooth_steps; ++i){
-        HipKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
-        HipKernels::vmul(jacobi_damping, d_invDiags[level].nnzValues, t.nnzValues, &x, Ncur, this->stream);
+        HipKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
+        HipKernels<Scalar>::vmul(jacobi_damping, d_invDiags[level].nnzValues, t.nnzValues, &x, Ncur, this->stream);
    }
 }

 // x = prec(y)
 template <class Scalar, unsigned int block_size>
 void rocsparseCPR<Scalar, block_size>::
-apply_amg(const Scalar& y, Scalar& x) {
+apply_amg(const Scalar& y, 
+          Scalar& x)
+{
    HIP_CHECK(hipMemsetAsync(d_coarse_x.data()->nnzValues, 0, sizeof(Scalar) * this->Nb, this->stream));
    
    for (unsigned int i = 0; i < d_u.size(); ++i) {
        d_u[i].upload(this->Rmatrices[i].nnzValues.data(), this->stream);
    }
    
-    HipKernels::residual(d_mat->nnzValues, d_mat->colIndices, d_mat->rowPointers, &x, &y, d_rs.data()->nnzValues, this->Nb, block_size, this->stream);
+    HipKernels<Scalar>::residual(d_mat->nnzValues, d_mat->colIndices, d_mat->rowPointers, &x, &y, d_rs.data()->nnzValues, this->Nb, block_size, this->stream);
    
-    HipKernels::full_to_pressure_restriction(d_rs.data()->nnzValues, d_weights.data()->nnzValues, d_coarse_y.data()->nnzValues, Nb, this->stream);
+    HipKernels<Scalar>::full_to_pressure_restriction(d_rs.data()->nnzValues, d_weights.data()->nnzValues, d_coarse_y.data()->nnzValues, Nb, this->stream);
    
    amg_cycle_gpu(0, *(d_coarse_y.data()->nnzValues), *(d_coarse_x.data()->nnzValues));

-    HipKernels::add_coarse_pressure_correction(d_coarse_x.data()->nnzValues, &x, this->pressure_idx, Nb, this->stream);
+    HipKernels<Scalar>::add_coarse_pressure_correction(d_coarse_x.data()->nnzValues, &x, this->pressure_idx, Nb, this->stream);
 }

 template <class Scalar, unsigned int block_size>
 void rocsparseCPR<Scalar, block_size>::
-apply(Scalar& y, Scalar& x) {
+apply(Scalar& y,
+      Scalar& x)
+{
    Dune::Timer t_bilu0;

    bilu0->apply(y, x);
--- a/opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp
+++ b/opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp
@ -64,25 +64,69 @@ private:
    std::unique_ptr<rocsparseSolverBackend<Scalar, 1> > coarse_solver; // coarse solver is scalar

    // Initialize and allocate matrices and vectors
-    void init_rocm_buffers();//TODO: rename to rocm/c and update in code
+    void init_rocm_buffers();

    // Copy matrices and vectors to GPU
-    void rocm_upload();//TODO: rename to rocm/c and update in code
+    void rocm_upload();

    // apply pressure correction to vector
    void apply_amg(const Scalar& y, Scalar& x);

+    // Apply the AMG preconditioner
    void amg_cycle_gpu(const int level, Scalar &y, Scalar &x);
    
 public:

    rocsparseCPR(int verbosity);

-    bool initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix, std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix, rocsparse_int *d_Arows, rocsparse_int *d_Acols);
+    /// Initialize GPU and allocate memory
+    /// \param[in] matrix     matrix A
+    /// \param[in] jacMatrix  matrix for preconditioner
+    bool initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                    std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
+                    rocsparse_int *d_Arows,
+                    rocsparse_int *d_Acols);
    
+
+    /// Analysis, extract parallelism if specified
+    /// \param[in] mat     matrix A
+    bool analyze_matrix(BlockedMatrix<Scalar> *mat);
+    
+    /// Analysis, extract parallelism if specified
+    /// \param[in] mat     matrix A
+    /// \param[in] jacMat  matrix for preconditioner, analyze this as well
+    bool analyze_matrix(BlockedMatrix<Scalar> *mat,
+                        BlockedMatrix<Scalar> *jacMat);
+
+    /// Create AMG preconditioner and perform ILU decomposition
+    /// \param[in] mat     matrix A
+    bool create_preconditioner(BlockedMatrix<Scalar> *mat);
+    
+    /// Create AMG preconditioner and perform ILU decomposition
+    /// \param[in] mat     matrix A
+    /// \param[in] jacMat  matrix for preconditioner, decompose this one if used
+    bool create_preconditioner(BlockedMatrix<Scalar> *mat,
+                               BlockedMatrix<Scalar> *jacMat);
+    
+    /// Apply preconditioner, x = prec(y)
+    /// applies blocked ilu0
+    /// also applies amg for pressure component
+    /// \param[in]  y  Input y vector
+    /// \param[out] x  Output x vector
+    void apply(Scalar& y,
+               Scalar& x) override;
+    
+#if HAVE_OPENCL
+    // apply preconditioner, x = prec(y)
+    void apply(const cl::Buffer& y,
+               cl::Buffer& x) {}
+#endif
+    
+    /// Copy matrix A values to GPU
+    /// \param[in]  mVals  Input values
    void copy_system_to_gpu(Scalar *b);

-    /// Reassign pointers, in case the addresses of the Dune variables have changed
+    /// Reassign pointers, in case the addresses of the Dune variables have changed --> TODO: check when/if we need this method
    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
    /// \param[in] b              input vector b, contains N values
 //     void update_system(Scalar *vals, Scalar *b);
@ -90,20 +134,6 @@ public:
    /// Update linear system to GPU
    /// \param[in] b              input vector, contains N values
    void update_system_on_gpu(Scalar *b);
-
-    bool analyze_matrix(BlockedMatrix<Scalar> *mat);
-    bool analyze_matrix(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat);
-
-    bool create_preconditioner(BlockedMatrix<Scalar> *mat);
-    bool create_preconditioner(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat);
-    
-#if HAVE_OPENCL
-    // apply preconditioner, x = prec(y)
-    void apply(const cl::Buffer& y, cl::Buffer& x) {}
-#endif
-    // applies blocked ilu0
-    // also applies amg for pressure component
-    void apply(Scalar& y, Scalar& x);
 };

 } // namespace Opm
--- a/opm/simulators/linalg/bda/rocm/rocsparseMatrix.cpp
+++ b/opm/simulators/linalg/bda/rocm/rocsparseMatrix.cpp
@ -25,29 +25,19 @@
 #include <opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp>
 #include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
 #include <opm/simulators/linalg/bda/Matrix.hpp>
+#include <opm/simulators/linalg/bda/Misc.hpp>

 #include <sstream>
 #include <iostream>

-// #include <opm/simulators/linalg/bda/opencl/opencl.hpp>
-
-#define HIP_CHECK(STAT)                                  \
-    do {                                                 \
-        const hipError_t stat = (STAT);                  \
-        if(stat != hipSuccess)                           \
-        {                                                \
-            std::ostringstream oss;                      \
-            oss << "rocsparseSolverBackend::hip ";       \
-            oss << "error: " << hipGetErrorString(stat); \
-            OPM_THROW(std::logic_error, oss.str());      \
-        }                                                \
-    } while(0)
-
 namespace Opm::Accelerator {

 template<class Scalar>
 RocmMatrix<Scalar>::
-RocmMatrix(int Nb_, int Mb_, int nnzbs_, unsigned int block_size_)
+RocmMatrix(int Nb_, 
+           int Mb_,
+           int nnzbs_,
+           unsigned int block_size_)
    : Nb(Nb_),
      Mb(Mb_),
      nnzbs(nnzbs_),
@ -62,7 +52,11 @@ RocmMatrix(int Nb_, int Mb_, int nnzbs_, unsigned int block_size_)

 template <class Scalar>
 void RocmMatrix<Scalar>::
-upload(Scalar *vals, int *cols, int *rows, hipStream_t stream) {
+upload(Scalar *vals,
+       int *cols,
+       int *rows,
+       hipStream_t stream)
+{
    HIP_CHECK(hipMemcpyAsync(nnzValues, vals, sizeof(Scalar) * block_size * block_size * nnzbs, hipMemcpyHostToDevice, stream));
    
    HIP_CHECK(hipMemcpyAsync(colIndices, cols, sizeof(int) * nnzbs, hipMemcpyHostToDevice, stream));
@ -72,7 +66,9 @@ upload(Scalar *vals, int *cols, int *rows, hipStream_t stream) {

 template <class Scalar>
 void RocmMatrix<Scalar>::
-upload(Matrix<Scalar> *matrix, hipStream_t stream) {
+upload(Matrix<Scalar> *matrix,
+       hipStream_t stream)
+{
    if (block_size != 1) {
        OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to RocmMatrix with different block_size");
    }
@ -82,7 +78,9 @@ upload(Matrix<Scalar> *matrix, hipStream_t stream) {

 template <class Scalar>
 void RocmMatrix<Scalar>::
-upload(BlockedMatrix<Scalar> *matrix, hipStream_t stream) {
+upload(BlockedMatrix<Scalar> *matrix,
+       hipStream_t stream)
+{
    if (matrix->block_size != block_size) {
        OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to RocmMatrix with different block_size");
    }
@ -99,7 +97,9 @@ RocmVector<Scalar>::RocmVector(int N)

 template <class Scalar>
 void RocmVector<Scalar>::
-upload(Scalar *vals, hipStream_t stream) {
+upload(Scalar *vals,
+       hipStream_t stream) 
+{
    HIP_CHECK(hipMemcpyAsync(nnzValues, vals, sizeof(Scalar) * size, hipMemcpyHostToDevice, stream));    
 }

--- a/opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp
+++ b/opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp
@ -21,24 +21,29 @@
 #define OPM_ROCMMATRIX_HEADER_INCLUDED

 #include <hip/hip_runtime_api.h>
-#include <hip/hip_version.h>

 namespace Opm::Accelerator {

 template<class Scalar> class Matrix;
 template<class Scalar> class BlockedMatrix;

-/// This struct resembles a csr matrix, only doubles are supported
-/// The matrix data is stored in OpenCL Buffers
+/// This struct resembles a csr matrix
 template<class Scalar>
 class RocmMatrix {
 public:

    RocmMatrix(int Nb_, int Mb_, int nnzbs_, unsigned int block_size_);

-    void upload(Scalar *vals, int *cols, int *rows, hipStream_t stream);
-    void upload(Matrix<Scalar> *matrix, hipStream_t stream);
-    void upload(BlockedMatrix<Scalar> *matrix, hipStream_t stream);
+    void upload(Scalar *vals,
+                int *cols,
+                int *rows,
+                hipStream_t stream);
+    
+    void upload(Matrix<Scalar> *matrix,
+                hipStream_t stream);
+    
+    void upload(BlockedMatrix<Scalar> *matrix,
+                hipStream_t stream);

    Scalar* nnzValues;
    int* colIndices;
@ -54,8 +59,11 @@ public:

    RocmVector(int N);

-    void upload(Scalar *vals, hipStream_t stream);
-    void upload(Matrix<Scalar> *matrix, hipStream_t stream);
+    void upload(Scalar *vals,
+                hipStream_t stream);
+    
+    void upload(Matrix<Scalar> *matrix,
+                hipStream_t stream);

    Scalar* nnzValues;
    int size;
--- a/opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.cpp
+++ b/opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.cpp
@ -23,18 +23,20 @@
 #include <opm/common/ErrorMacros.hpp>

 #include <opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp>
-// #include <opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp>
+#include <opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp>
 #include <opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp>

 namespace Opm::Accelerator {

 template <class Scalar, unsigned int block_size>
 std::unique_ptr<rocsparsePreconditioner<Scalar,block_size> > rocsparsePreconditioner<Scalar,block_size>::
-create(PreconditionerType type, int verbosity) {
+create(PreconditionerType type,
+       int verbosity)
+{
    if (type == PreconditionerType::BILU0) {
        return std::make_unique<Opm::Accelerator::rocsparseBILU0<Scalar, block_size> >(verbosity);
-//     } else if (type == PreconditionerType::CPR) {
-//         return std::make_unique<Opm::Accelerator::rocsparseCPR<block_size> >(verbosity);
+    } else if (type == PreconditionerType::CPR) {
+        return std::make_unique<Opm::Accelerator::rocsparseCPR<Scalar, block_size> >(verbosity);
    } else {
        OPM_THROW(std::logic_error, "Invalid PreconditionerType");
    }
@ -42,14 +44,20 @@ create(PreconditionerType type, int verbosity) {

 template <class Scalar, unsigned int block_size>
 void rocsparsePreconditioner<Scalar, block_size>::
-set_matrix_analysis(rocsparse_mat_descr descr_L, rocsparse_mat_descr descr_U) {
+set_matrix_analysis(rocsparse_mat_descr descr_L,
+                    rocsparse_mat_descr descr_U)
+{
    descr_L = descr_L;
    descr_U = descr_U;
 }

 template <class Scalar, unsigned int block_size>
 void rocsparsePreconditioner<Scalar, block_size>::
-set_context(rocsparse_handle handle, rocsparse_direction dir, rocsparse_operation operation, hipStream_t stream) {
+set_context(rocsparse_handle handle,
+            rocsparse_direction dir,
+            rocsparse_operation operation,
+            hipStream_t stream)
+{
    this->handle = handle;
    this->dir = dir;
    this->operation = operation;
--- a/opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp
+++ b/opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp
@ -52,7 +52,8 @@ public:
    
    virtual ~rocsparsePreconditioner() = default;

-    static std::unique_ptr<rocsparsePreconditioner<Scalar, block_size>> create(PreconditionerType type, int verbosity);
+    static std::unique_ptr<rocsparsePreconditioner<Scalar, block_size>> create(PreconditionerType type, 
+                                                                               int verbosity);

    // apply preconditioner, x = prec(y)
    virtual void apply(double& y, double& x) = 0;
@ -60,9 +61,14 @@ public:
    // create/update preconditioner, probably used every linear solve
    // the version with two params can be overloaded, if not, it will default to using the one param version
    virtual bool create_preconditioner(BlockedMatrix<Scalar> *mat) = 0;
-    virtual bool create_preconditioner(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat) = 0;
    
-    virtual bool initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix, std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix, rocsparse_int *d_Arows, rocsparse_int *d_Acols) = 0;
+    virtual bool create_preconditioner(BlockedMatrix<Scalar> *mat,
+                                       BlockedMatrix<Scalar> *jacMat) = 0;
+    
+    virtual bool initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                            std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
+                            rocsparse_int *d_Arows,
+                            rocsparse_int *d_Acols) = 0;
    
    virtual void copy_system_to_gpu(double *b)=0;

@ -70,8 +76,14 @@ public:
    /// \param[in] b              input vector, contains N values
    virtual void update_system_on_gpu(double *b)=0;
    
-    void set_matrix_analysis(rocsparse_mat_descr descr_L, rocsparse_mat_descr descr_U);
-    void set_context(rocsparse_handle handle, rocsparse_direction dir, rocsparse_operation operation, hipStream_t stream);
+    void set_matrix_analysis(rocsparse_mat_descr descr_L,
+                             rocsparse_mat_descr descr_U);
+    
+    void set_context(rocsparse_handle handle,
+                     rocsparse_direction dir,
+                     rocsparse_operation operation,
+                     hipStream_t stream);
+    
    void setJacMat(BlockedMatrix<Scalar> jacMat);
 };
 } //namespace Opm
--- a/opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.cpp
@ -43,50 +43,13 @@

 #include <opm/simulators/linalg/bda/Preconditioner.hpp>

-#include <hip/hip_runtime_api.h>
-#include <hip/hip_version.h>
+#include <opm/simulators/linalg/bda/Misc.hpp>

 #ifdef HIP_HAVE_CUDA_DEFINED
 #define HAVE_CUDA HIP_HAVE_CUDA_DEFINED
 #undef HIP_HAVE_CUDA_DEFINED
 #endif

-#define HIP_CHECK(STAT)                                  \
-    do {                                                 \
-        const hipError_t stat = (STAT);                  \
-        if(stat != hipSuccess)                           \
-        {                                                \
-            std::ostringstream oss;                      \
-            oss << "rocsparseSolverBackend::hip ";       \
-            oss << "error: " << hipGetErrorString(stat); \
-            OPM_THROW(std::logic_error, oss.str());      \
-        }                                                \
-    } while(0)
-
-#define ROCSPARSE_CHECK(STAT)                            \
-    do {                                                 \
-        const rocsparse_status stat = (STAT);            \
-        if(stat != rocsparse_status_success)             \
-        {                                                \
-            std::ostringstream oss;                      \
-            oss << "rocsparseSolverBackend::rocsparse "; \
-            oss << "error: " << stat;                    \
-            OPM_THROW(std::logic_error, oss.str());      \
-        }                                                \
-    } while(0)
-
-#define ROCBLAS_CHECK(STAT)                              \
-    do {                                                 \
-        const rocblas_status stat = (STAT);              \
-        if(stat != rocblas_status_success)               \
-        {                                                \
-            std::ostringstream oss;                      \
-            oss << "rocsparseSolverBackend::rocblas ";   \
-            oss << "error: " << stat;                    \
-            OPM_THROW(std::logic_error, oss.str());      \
-        }                                                \
-    } while(0)
-
 #include <cstddef>

 namespace Opm::Accelerator {
@ -462,7 +425,7 @@ copy_system_to_gpu(Scalar *b)
                             sizeof(Scalar) * nnz,
                             hipMemcpyHostToDevice, stream));
    HIP_CHECK(hipMemsetAsync(d_x, 0, N * sizeof(Scalar), stream));
-    HIP_CHECK(hipMemcpyAsync(d_b, b, N * sizeof(Scalar) * N,
+    HIP_CHECK(hipMemcpyAsync(d_b, b, N * sizeof(Scalar),
                             hipMemcpyHostToDevice, stream));
    
    prec->copy_system_to_gpu(d_Avals); 
--- a/opm/simulators/linalg/bda/rocm/rocsparseWellContributions.cpp
+++ b/opm/simulators/linalg/bda/rocm/rocsparseWellContributions.cpp
@ -40,17 +40,9 @@
 #include <opm/common/ErrorMacros.hpp>

 #include <opm/simulators/linalg/bda/MultisegmentWellContribution.hpp>
-
+#include <opm/simulators/linalg/bda/Misc.hpp>
 #include <hip/hip_runtime.h>

-#define HIP_CHECK(stat)                               \
-    {                                                 \
-        if(stat != hipSuccess)                        \
-        {                                             \
-            OPM_THROW(std::logic_error, "HIP error"); \
-        }                                             \
-    }
-
 namespace Opm
 {