Merge pull request #5408 from BigDataAccelerate/cpr_rocsparse

rocsparse CPR initial version
2025-02-25 18:55:30 -06:00 · 2024-07-16 18:57:40 +02:00 · 2024-07-16 18:57:40 +02:00 · ca2ef490aa
commit ca2ef490aa
parent 6ce61a9018 a79d4aff5a
41 changed files with 3116 additions and 1054 deletions
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@ -247,29 +247,36 @@ endif()

 if(USE_BDA_BRIDGE)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp
+                                 opm/simulators/linalg/bda/CprCreation.cpp
+                                 opm/simulators/linalg/bda/Misc.cpp
                                 opm/simulators/linalg/bda/WellContributions.cpp
                                 opm/simulators/linalg/bda/MultisegmentWellContribution.cpp
                                 opm/simulators/linalg/ISTLSolverBda.cpp)
  if(OPENCL_FOUND)
    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BlockedMatrix.cpp)
-    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/BILU0.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/openclBILU0.cpp)
    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Reorder.cpp)
    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/ChowPatelIlu.cpp)
-    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/BISAI.cpp)
-    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/CPR.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/openclBISAI.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/openclCPR.cpp)
    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/opencl.cpp)
    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/openclKernels.cpp)
    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/OpenclMatrix.cpp)
-    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/Preconditioner.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/openclPreconditioner.cpp)
    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp)
    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/openclWellContributions.cpp)
  endif()
  if(ROCALUTION_FOUND)
-    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocalutionSolverBackend.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocalutionSolverBackend.cpp)
  endif()
  if(rocsparse_FOUND AND rocblas_FOUND)
-    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocsparseSolverBackend.cpp)
-    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocsparseWellContributions.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseCPR.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseBILU0.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseWellContributions.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/hipKernels.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseMatrix.cpp)
  endif()
  if(CUDA_FOUND)
    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu)
@ -660,25 +667,33 @@ if (USE_BDA_BRIDGE)
    opm/simulators/linalg/bda/BdaBridge.hpp
    opm/simulators/linalg/bda/BdaResult.hpp
    opm/simulators/linalg/bda/BdaSolver.hpp
-    opm/simulators/linalg/bda/opencl/BILU0.hpp
+    opm/simulators/linalg/bda/CprCreation.hpp
+    opm/simulators/linalg/bda/Preconditioner.hpp
+    opm/simulators/linalg/bda/Misc.hpp
+    opm/simulators/linalg/bda/opencl/openclBILU0.hpp
    opm/simulators/linalg/bda/BlockedMatrix.hpp
-    opm/simulators/linalg/bda/opencl/CPR.hpp
+    opm/simulators/linalg/bda/opencl/openclCPR.hpp
    opm/simulators/linalg/bda/cuda/cuda_header.hpp
    opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp
    opm/simulators/linalg/bda/opencl/ChowPatelIlu.hpp
-    opm/simulators/linalg/bda/opencl/BISAI.hpp
+    opm/simulators/linalg/bda/opencl/openclBISAI.hpp
    opm/simulators/linalg/bda/Reorder.hpp
    opm/simulators/linalg/bda/opencl/opencl.hpp
    opm/simulators/linalg/bda/opencl/openclKernels.hpp
    opm/simulators/linalg/bda/opencl/OpenclMatrix.hpp
-    opm/simulators/linalg/bda/opencl/Preconditioner.hpp
+    opm/simulators/linalg/bda/opencl/openclPreconditioner.hpp
    opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp
    opm/simulators/linalg/bda/opencl/openclWellContributions.hpp
    opm/simulators/linalg/bda/Matrix.hpp
    opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
-    opm/simulators/linalg/bda/rocalutionSolverBackend.hpp
-    opm/simulators/linalg/bda/rocsparseSolverBackend.hpp
-    opm/simulators/linalg/bda/rocsparseWellContributions.hpp
+    opm/simulators/linalg/bda/rocm/hipKernels.hpp
+    opm/simulators/linalg/bda/rocm/rocalutionSolverBackend.hpp
+    opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp
+    opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp
+    opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp
+    opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.hpp
+    opm/simulators/linalg/bda/rocm/rocsparseWellContributions.hpp
+    opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp
    opm/simulators/linalg/bda/WellContributions.hpp
    opm/simulators/linalg/ISTLSolverBda.hpp
  )
--- a/opm/simulators/linalg/ISTLSolverBda.cpp
+++ b/opm/simulators/linalg/ISTLSolverBda.cpp
@ -43,11 +43,11 @@

 #include <opm/grid/polyhedralgrid.hh>

-#if HAVE_OPENMP
 #include <thread>
-#include <omp.h>
-
 std::shared_ptr<std::thread> copyThread;
+
+#if HAVE_OPENMP
+#include <omp.h>
 #endif // HAVE_OPENMP

 namespace Opm::detail {
@ -113,9 +113,11 @@ apply(Vector& rhs,
        }
 #endif

-	bool use_multithreading = false;
+	bool use_multithreading = true;
 #if HAVE_OPENMP
-	use_multithreading = omp_get_max_threads() > 1;
+	// if user  manually sets --threads-per-process=1, do not use multithreading 
+        if (omp_get_max_threads() == 1)
+	    use_multithreading = false;
 #endif // HAVE_OPENMP

        if (numJacobiBlocks_ > 1) {
@ -123,9 +125,9 @@ apply(Vector& rhs,
 	      //NOTE: copyThread can safely write to jacMat because in solve_system both matrix and *blockJacobiForGPUILU0_ diagonal entries
 	      //are checked and potentially overwritten in replaceZeroDiagonal() by mainThread. However, no matter the thread writing sequence,
 	      //the final entry in jacMat is correct.
-#if HAVE_OPENMP
+//#if HAVE_OPENMP
              copyThread = std::make_shared<std::thread>([&](){this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);});
-#endif // HAVE_OPENMP
+//#endif // HAVE_OPENMP
 	    }
 	    else {
 	      this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);
--- a/opm/simulators/linalg/bda/BdaBridge.cpp
+++ b/opm/simulators/linalg/bda/BdaBridge.cpp
@ -43,11 +43,11 @@
 #endif

 #if HAVE_ROCALUTION
-#include <opm/simulators/linalg/bda/rocalutionSolverBackend.hpp>
+#include <opm/simulators/linalg/bda/rocm/rocalutionSolverBackend.hpp>
 #endif

 #if HAVE_ROCSPARSE
-#include <opm/simulators/linalg/bda/rocsparseSolverBackend.hpp>
+#include <opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.hpp>
 #endif

 typedef Dune::InverseOperatorResult InverseOperatorResult;
@ -115,7 +115,7 @@ BdaBridge(std::string accelerator_mode_,
        use_gpu = true; // should be replaced by a 'use_bridge' boolean
        using ROCS = Accelerator::rocsparseSolverBackend<Scalar,block_size>;
        backend = std::make_unique<ROCS>(linear_solver_verbosity, maxit,
-                                         tolerance, platformID, deviceID);
+                                         tolerance, platformID, deviceID, linsolver);
 #else
        OPM_THROW(std::logic_error, "Error rocsparseSolver was chosen, but rocsparse/rocblas was not found by CMake");
 #endif
--- a/opm/simulators/linalg/bda/CprCreation.cpp
+++ b/opm/simulators/linalg/bda/CprCreation.cpp
@ -0,0 +1,295 @@
+/*
+  Copyright 2024 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+#include <opm/common/TimingMacros.hpp>
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+#include <dune/common/timer.hh>
+
+#include <dune/common/shared_ptr.hh>
+
+#include <opm/simulators/linalg/PreconditionerFactory.hpp>
+#include <opm/simulators/linalg/PropertyTree.hpp>
+
+#include <opm/simulators/linalg/bda/BdaBridge.hpp>
+#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
+#include <opm/simulators/linalg/bda/CprCreation.hpp>
+
+#include <opm/simulators/linalg/bda/Misc.hpp>
+
+namespace Opm::Accelerator {
+
+using Opm::OpmLog;
+using Dune::Timer;
+
+template <class Scalar, unsigned int block_size>
+CprCreation<Scalar, block_size>::CprCreation()
+{
+    diagIndices.resize(1);
+}
+
+template <class Scalar, unsigned int block_size>
+void CprCreation<Scalar, block_size>::
+create_preconditioner_amg(BlockedMatrix<Scalar> *mat_)
+{
+    mat = mat_;
+    cprNb = mat_->Nb;
+    cprnnzb = mat_->nnzbs;
+    cprN = cprNb * block_size;
+    cprnnz = cprnnzb * block_size * block_size;
+
+    coarse_vals.resize(cprnnzb);
+    coarse_x.resize(cprNb);
+    coarse_y.resize(cprNb);
+    weights.resize(cprN);
+
+    try{
+        Scalar rhs[] = {0, 0, 0};
+        rhs[pressure_idx] = 1;
+
+        // find diagonal index for each row
+        if (diagIndices[0].empty()) {
+            diagIndices[0].resize(cprNb);
+            for (int row = 0; row < cprNb; ++row) {
+                int start = mat->rowPointers[row];
+                int end = mat->rowPointers[row + 1];
+                auto candidate = std::find(mat->colIndices + start, mat->colIndices + end, row);
+                assert(candidate != mat->colIndices + end);
+                diagIndices[0][row] = candidate - mat->colIndices;
+            }
+        }
+
+        // calculate weights for each row
+        for (int row = 0; row < cprNb; ++row) {
+            // solve to find weights
+            Scalar *row_weights = weights.data() + block_size * row; // weights for this row
+            solve_transposed_3x3(mat->nnzValues + block_size * block_size * diagIndices[0][row], rhs, row_weights);
+
+            // normalize weights for this row
+            Scalar abs_max = get_absmax(row_weights, block_size);
+            for(unsigned int i = 0; i < block_size; i++){
+                row_weights[i] /= abs_max;
+            }
+        }
+
+        // extract pressure
+        // transform blocks to scalars to create scalar linear system
+        for (int row = 0; row < cprNb; ++row) {
+            int start = mat->rowPointers[row];
+            int end = mat->rowPointers[row + 1];
+            for (int idx = start; idx < end; ++idx) {
+                Scalar *block = mat->nnzValues + idx * block_size * block_size;
+                Scalar *row_weights = weights.data() + block_size * row;
+                Scalar value = 0.0;
+                for (unsigned int i = 0; i < block_size; ++i) {
+                    value += block[block_size * i + pressure_idx] * row_weights[i];
+                }
+                coarse_vals[idx] = value;
+            }
+        }
+
+#if HAVE_MPI
+        using Communication = Dune::OwnerOverlapCopyCommunication<int, int>;
+#else
+        using Communication = Dune::Amg::SequentialInformation;
+#endif
+        using OverlapFlags = Dune::NegateSet<Communication::OwnerSet>;
+        if (recalculate_aggregates) {
+            dune_coarse = std::make_unique<DuneMat>(cprNb, cprNb, cprnnzb, DuneMat::row_wise);
+
+//             typedef DuneMat::CreateIterator Iter;
+            using Iter = typename DuneMat::CreateIterator;
+
+            // setup sparsity pattern
+            for(Iter row = dune_coarse->createbegin(); row != dune_coarse->createend(); ++row){
+                int start = mat->rowPointers[row.index()];
+                int end = mat->rowPointers[row.index() + 1];
+                for (int idx = start; idx < end; ++idx) {
+                    int col = mat->colIndices[idx];
+                    row.insert(col);
+                }
+            }
+
+            // set values
+            for (int row = 0; row < cprNb; ++row) {
+                int start = mat->rowPointers[row];
+                int end = mat->rowPointers[row + 1];
+                for (int idx = start; idx < end; ++idx) {
+                    int col = mat->colIndices[idx];
+                    (*dune_coarse)[row][col] = coarse_vals[idx];
+                }
+            }
+
+            dune_op = std::make_shared<MatrixOperator>(*dune_coarse);
+            Dune::Amg::SequentialInformation seqinfo;
+            dune_amg = std::make_unique<DuneAmg>(dune_op, Dune::stackobject_to_shared_ptr(seqinfo));
+
+            Opm::PropertyTree property_tree;
+            property_tree.put("alpha", 0.333333333333);
+
+            // The matrix has a symmetric sparsity pattern, but the values are not symmetric
+            // Yet a SymmetricDependency is used in AMGCPR
+            // An UnSymmetricCriterion is also available
+            // using Criterion = Dune::Amg::CoarsenCriterion<Dune::Amg::UnSymmetricCriterion<DuneMat, Dune::Amg::FirstDiagonal> >;
+            using CriterionBase = Dune::Amg::AggregationCriterion<Dune::Amg::SymmetricDependency<DuneMat, Dune::Amg::FirstDiagonal>>;
+            using Criterion = Dune::Amg::CoarsenCriterion<CriterionBase>;
+            const Criterion c = Opm::AMGHelper<MatrixOperator,Dune::Amg::SequentialInformation,DuneMat,DuneVec>::criterion(property_tree);
+            num_pre_smooth_steps = c.getNoPreSmoothSteps();
+            num_post_smooth_steps = c.getNoPostSmoothSteps();
+
+            dune_amg->template build<OverlapFlags>(c);
+
+            analyzeHierarchy();
+            analyzeAggregateMaps();
+
+            recalculate_aggregates = false;
+        } else {
+            // update values of coarsest level in AMG
+            // this works because that level is actually a reference to the DuneMat held by dune_coarse
+            for (int row = 0; row < cprNb; ++row) {
+                int start = mat->rowPointers[row];
+                int end = mat->rowPointers[row + 1];
+                for (int idx = start; idx < end; ++idx) {
+                    int col = mat->colIndices[idx];
+                    (*dune_coarse)[row][col] = coarse_vals[idx];
+                }
+            }
+
+            // update the rest of the AMG hierarchy
+            dune_amg->recalculateGalerkin(OverlapFlags());
+            analyzeHierarchy();
+        }
+
+    } catch (const std::exception& ex) {
+        std::cerr << "Caught exception: " << ex.what() << std::endl;
+        throw ex;
+    }
+}
+
+template <class Scalar, unsigned int block_size>
+void CprCreation<Scalar, block_size>::
+analyzeHierarchy()
+{
+    const typename DuneAmg::ParallelMatrixHierarchy& matrixHierarchy = dune_amg->matrices();
+
+    // store coarsest AMG level in umfpack format, also performs LU decomposition
+    umfpack.setMatrix((*matrixHierarchy.coarsest()).getmat());
+
+    num_levels = dune_amg->levels();
+    level_sizes.resize(num_levels);
+    diagIndices.resize(num_levels);
+
+    Amatrices.reserve(num_levels);
+    Rmatrices.reserve(num_levels - 1);  // coarsest level does not need one
+    invDiags.reserve(num_levels);
+
+    Amatrices.clear();
+    invDiags.clear();
+
+    // matrixIter.dereference() returns MatrixAdapter
+    // matrixIter.dereference().getmat() returns BCRSMat
+    typename DuneAmg::ParallelMatrixHierarchy::ConstIterator matrixIter = matrixHierarchy.finest();
+    for(int level = 0; level < num_levels; ++matrixIter, ++level) {
+        const auto& A = matrixIter.dereference().getmat();
+        level_sizes[level] = A.N();
+        diagIndices[level].reserve(A.N());
+
+        // extract matrix A
+        Amatrices.emplace_back(A.N(), A.nonzeroes());
+        // contiguous copy is not possible
+        // std::copy(&(A[0][0][0][0]), &(A[0][0][0][0]) + A.nonzeroes(), Amatrices.back().nnzValues.data());
+        // also update diagonal indices if needed, level 0 is already filled in create_preconditioner()
+        int nnz_idx = 0;
+        const bool fillDiagIndices = diagIndices[level].empty();
+        for (typename DuneMat::const_iterator r = A.begin(); r != A.end(); ++r) {
+            for (auto c = r->begin(); c != r->end(); ++c) {
+                Amatrices.back().nnzValues[nnz_idx] = A[r.index()][c.index()];
+                if (fillDiagIndices && r.index() == c.index()) {
+                    diagIndices[level].emplace_back(nnz_idx);
+                }
+                nnz_idx++;
+            }
+        }
+
+        Opm::BdaBridge<DuneMat, DuneVec, 1>::copySparsityPatternFromISTL(A, Amatrices.back().rowPointers, Amatrices.back().colIndices);
+
+        // compute inverse diagonal values for current level
+        invDiags.emplace_back(A.N());
+        for (unsigned int row = 0; row < A.N(); ++row) {
+            invDiags.back()[row] = 1 / Amatrices.back().nnzValues[diagIndices[level][row]];
+        }
+    }
+}
+
+
+template <class Scalar, unsigned int block_size>
+void CprCreation<Scalar, block_size>::
+analyzeAggregateMaps() 
+{
+    PcolIndices.resize(num_levels - 1);
+    Rmatrices.clear();
+
+    const typename DuneAmg::AggregatesMapList& aggregatesMaps = dune_amg->aggregatesMaps();
+
+    typename DuneAmg::AggregatesMapList::const_iterator mapIter = aggregatesMaps.begin();
+    for(int level = 0; level < num_levels - 1; ++mapIter, ++level) {
+        typename DuneAmg::AggregatesMap *map = *mapIter;
+
+        Rmatrices.emplace_back(level_sizes[level+1], level_sizes[level], level_sizes[level]);
+        std::fill(Rmatrices.back().nnzValues.begin(), Rmatrices.back().nnzValues.end(), 1.0);
+
+        // get indices for each row of P and R
+        std::vector<std::vector<unsigned> > indicesR(level_sizes[level+1]);
+        PcolIndices[level].resize(level_sizes[level]);
+
+        using AggregateIterator = typename DuneAmg::AggregatesMap::const_iterator;
+        for(AggregateIterator ai = map->begin(); ai != map->end(); ++ai){
+            if (*ai != DuneAmg::AggregatesMap::ISOLATED) {
+                const long int diff = ai - map->begin();
+                PcolIndices[level][diff] = *ai;
+                indicesR[*ai].emplace_back(diff);
+            }
+        }
+
+        int col_idx = 0;
+        // set sparsity pattern of R
+        Rmatrices.back().rowPointers[0] = 0;
+        for (unsigned int i = 0; i < indicesR.size(); ++i) {
+            Rmatrices.back().rowPointers[i + 1] = Rmatrices.back().rowPointers[i] + indicesR[i].size();
+            for (auto it = indicesR[i].begin(); it != indicesR[i].end(); ++it) {
+                Rmatrices.back().colIndices[col_idx++] = *it;
+            }
+        }
+    }
+}
+
+#define INSTANCE_TYPE(T)                \
+    template class CprCreation<T,1>; \
+    template class CprCreation<T,2>; \
+    template class CprCreation<T,3>; \
+    template class CprCreation<T,4>; \
+    template class CprCreation<T,5>; \
+    template class CprCreation<T,6>;
+
+INSTANCE_TYPE(double)
+
+} // namespace Opm
+
+
--- a/opm/simulators/linalg/bda/CprCreation.hpp
+++ b/opm/simulators/linalg/bda/CprCreation.hpp
@ -0,0 +1,86 @@
+/*
+  Copyright 2024 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef OPM_CPRCREATION_HPP
+#define OPM_CPRCREATION_HPP
+
+#include <mutex>
+
+#include <dune/istl/paamg/matrixhierarchy.hh>
+#include <dune/istl/umfpack.hh>
+
+#include <opm/simulators/linalg/bda/Matrix.hpp>
+#include <opm/simulators/linalg/bda/Preconditioner.hpp>
+
+namespace Opm::Accelerator {
+
+template<class Scalar> class BlockedMatrix;
+
+/// This class implements a Constrained Pressure Residual (CPR) preconditioner
+template <class Scalar, unsigned int block_size>
+class CprCreation
+{
+    int cprN;
+    int cprNb;
+    int cprnnz;
+    int cprnnzb;
+
+public:     
+    CprCreation();
+
+protected:
+    
+    int num_levels;
+    std::vector<Scalar> weights, coarse_vals, coarse_x, coarse_y;
+    std::vector<Matrix<Scalar>> Amatrices, Rmatrices; // scalar matrices that represent the AMG hierarchy
+    std::vector<std::vector<int> > PcolIndices; // prolongation does not need a full matrix, only store colIndices
+    std::vector<std::vector<Scalar> > invDiags; // inverse of diagonal of Amatrices
+    
+    BlockedMatrix<Scalar> *mat = nullptr;    // input matrix, blocked
+
+    using DuneMat = Dune::BCRSMatrix<Dune::FieldMatrix<Scalar, 1, 1> >;
+    using DuneVec = Dune::BlockVector<Dune::FieldVector<Scalar, 1> >;
+    using MatrixOperator = Dune::MatrixAdapter<DuneMat, DuneVec, DuneVec>;
+    using DuneAmg = Dune::Amg::MatrixHierarchy<MatrixOperator, Dune::Amg::SequentialInformation>;
+    std::unique_ptr<DuneAmg> dune_amg;
+    std::unique_ptr<DuneMat> dune_coarse;       // extracted pressure matrix, finest level in AMG hierarchy
+    std::shared_ptr<MatrixOperator> dune_op;    // operator, input to Dune AMG
+    std::vector<int> level_sizes;               // size of each level in the AMG hierarchy
+    std::vector<std::vector<int> > diagIndices; // index of diagonal value for each level
+    Dune::UMFPack<DuneMat> umfpack;             // dune/istl/umfpack object used to solve the coarsest level of AMG
+    bool always_recalculate_aggregates = false; // OPM always reuses the aggregates by default
+    bool recalculate_aggregates = true;         // only rerecalculate if true
+    const int pressure_idx = 1;                 // hardcoded to mimic OPM
+    unsigned num_pre_smooth_steps;              // number of Jacobi smooth steps before restriction
+    unsigned num_post_smooth_steps;             // number of Jacobi smooth steps after prolongation
+
+    // Analyze the AMG hierarchy build by Dune
+    void analyzeHierarchy();
+
+    // Analyze the aggregateMaps from the AMG hierarchy
+    // These can be reused, so only use when recalculate_aggregates is true
+    void analyzeAggregateMaps();
+
+    void create_preconditioner_amg(BlockedMatrix<Scalar> *mat);
+};
+
+} // namespace Opm
+
+#endif
+
--- a/opm/simulators/linalg/bda/Misc.cpp
+++ b/opm/simulators/linalg/bda/Misc.cpp
@ -0,0 +1,62 @@
+#include <cmath>
+#include <algorithm>
+
+#include <opm/simulators/linalg/bda/Misc.hpp>
+
+namespace Opm::Accelerator {
+
+// divide A by B, and round up: return (int)ceil(A/B)
+unsigned int ceilDivision(const unsigned int A,
+                          const unsigned int B)
+{
+    return A / B + (A % B > 0);
+}
+
+// return the absolute value of the N elements for which the absolute value is highest
+template<class Scalar>
+Scalar get_absmax(const Scalar* data,
+                  const int N)
+{
+    return std::abs(*std::max_element(data, data + N,
+                                      [](Scalar a, Scalar b)
+                                      { return std::fabs(a) < std::fabs(b); }));
+}
+
+// solve A^T * x = b
+template<class Scalar>
+void solve_transposed_3x3(const Scalar* A, 
+                          const Scalar* b,
+                          Scalar* x)
+{
+    const int B = 3;
+    // from dune-common/densematrix.hh, but transposed, so replace [r*B+c] with [r+c*B]
+    Scalar t4  = A[0+0*B] * A[1+1*B];
+    Scalar t6  = A[0+0*B] * A[1+2*B];
+    Scalar t8  = A[0+1*B] * A[1+0*B];
+    Scalar t10 = A[0+2*B] * A[1+0*B];
+    Scalar t12 = A[0+1*B] * A[2+0*B];
+    Scalar t14 = A[0+2*B] * A[2+0*B];
+
+    Scalar d = (t4*A[2+2*B]-t6*A[2+1*B]-t8*A[2+2*B]+
+               t10*A[2+1*B]+t12*A[1+2*B]-t14*A[1+1*B]); // determinant
+
+    x[0] = (b[0]*A[1+1*B]*A[2+2*B] - b[0]*A[2+1*B]*A[1+2*B]
+          - b[1] *A[0+1*B]*A[2+2*B] + b[1]*A[2+1*B]*A[0+2*B]
+          + b[2] *A[0+1*B]*A[1+2*B] - b[2]*A[1+1*B]*A[0+2*B]) / d;
+
+    x[1] = (A[0+0*B]*b[1]*A[2+2*B] - A[0+0*B]*b[2]*A[1+2*B]
+          - A[1+0*B] *b[0]*A[2+2*B] + A[1+0*B]*b[2]*A[0+2*B]
+          + A[2+0*B] *b[0]*A[1+2*B] - A[2+0*B]*b[1]*A[0+2*B]) / d;
+
+    x[2] = (A[0+0*B]*A[1+1*B]*b[2] - A[0+0*B]*A[2+1*B]*b[1]
+          - A[1+0*B] *A[0+1*B]*b[2] + A[1+0*B]*A[2+1*B]*b[0]
+          + A[2+0*B] *A[0+1*B]*b[1] - A[2+0*B]*A[1+1*B]*b[0]) / d;
+}
+
+#define INSTANTIATE_TYPE(T)\                                 
+template void solve_transposed_3x3<T>(const T* A, const T* b, T* x);\
+template T get_absmax<T>(const T* data, const int N);
+
+INSTANTIATE_TYPE(double)
+
+}
--- a/opm/simulators/linalg/bda/Misc.hpp
+++ b/opm/simulators/linalg/bda/Misc.hpp
@ -0,0 +1,62 @@
+#ifndef OPM_MISC_HPP
+#define OPM_MISC_HPP
+
+#ifdef HAVE_ROCSPARSE
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_version.h>
+#include <sstream>
+
+#define HIP_CHECK(STAT)                                  \
+    do {                                                 \
+        const hipError_t stat = (STAT);                  \
+        if(stat != hipSuccess)                           \
+        {                                                \
+            std::ostringstream oss;                      \
+            oss << "rocsparseSolverBackend::hip ";       \
+            oss << "error: " << hipGetErrorString(stat); \
+            OPM_THROW(std::logic_error, oss.str());      \
+        }                                                \
+    } while(0)
+
+#define ROCSPARSE_CHECK(STAT)                            \
+    do {                                                 \
+        const rocsparse_status stat = (STAT);            \
+        if(stat != rocsparse_status_success)             \
+        {                                                \
+            std::ostringstream oss;                      \
+            oss << "rocsparseSolverBackend::rocsparse "; \
+            oss << "error: " << stat;                    \
+            OPM_THROW(std::logic_error, oss.str());      \
+        }                                                \
+    } while(0)
+
+#define ROCBLAS_CHECK(STAT)                              \
+    do {                                                 \
+        const rocblas_status stat = (STAT);              \
+        if(stat != rocblas_status_success)               \
+        {                                                \
+            std::ostringstream oss;                      \
+            oss << "rocsparseSolverBackend::rocblas ";   \
+            oss << "error: " << stat;                    \
+            OPM_THROW(std::logic_error, oss.str());      \
+        }                                                \
+    } while(0)
+#endif
+
+namespace Opm::Accelerator {
+
+unsigned int ceilDivision(const unsigned int A,
+                          const unsigned int B);
+
+template<class Scalar>
+Scalar get_absmax(const Scalar *data,
+                  const int N);
+
+template<class Scalar>
+void solve_transposed_3x3(const Scalar *A,
+                          const Scalar *b,
+                          Scalar *x);
+
+}
+
+#endif
--- a/opm/simulators/linalg/bda/opencl/Preconditioner.hpp
+++ b/opm/simulators/linalg/bda/opencl/Preconditioner.hpp
@ -1,5 +1,5 @@
 /*
-  Copyright 2021 Equinor ASA
+  Copyright 2024 Equinor ASA

  This file is part of the Open Porous Media project (OPM).

@ -20,12 +20,18 @@
 #ifndef OPM_PRECONDITIONER_HEADER_INCLUDED
 #define OPM_PRECONDITIONER_HEADER_INCLUDED

+#if HAVE_OPENCL
 #include <opm/simulators/linalg/bda/opencl/opencl.hpp>
-
-#include <memory>
+#endif

 namespace Opm::Accelerator {

+enum PreconditionerType {
+    BILU0,
+    CPR,
+    BISAI
+};
+
 template<class Scalar> class BlockedMatrix;

 template<class Scalar, unsigned int block_size>
@ -38,47 +44,38 @@ protected:
    int nnzb = 0;    // number of blocks of the matrix
    int verbosity = 0;

-    std::shared_ptr<cl::Context> context;
-    std::shared_ptr<cl::CommandQueue> queue;
-    std::vector<cl::Event> events;
-    cl_int err;
-
    Preconditioner(int verbosity_) :
    verbosity(verbosity_)
    {};

 public:
-    enum class Type {
-        BILU0,
-        CPR,
-        BISAI
-    };

-    static std::unique_ptr<Preconditioner> create(Type type,
+    virtual ~Preconditioner() = default;
+    
+    static std::unique_ptr<Preconditioner> create(PreconditionerType type,
                                                  bool opencl_ilu_parallel,
                                                  int verbosity);

-    virtual ~Preconditioner() = default;
-
-    // nested Preconditioners might need to override this
-    virtual void setOpencl(std::shared_ptr<cl::Context>& context,
-                           std::shared_ptr<cl::CommandQueue>& queue);
-
+#if HAVE_OPENCL
    // apply preconditioner, x = prec(y)
    virtual void apply(const cl::Buffer& y, cl::Buffer& x) = 0;
+#endif
+
+    // apply preconditioner, x = prec(y)
+    virtual void apply(Scalar& y, Scalar& x) = 0;

    // analyze matrix, e.g. the sparsity pattern
    // probably only called once
    // the version with two params can be overloaded, if not, it will default to using the one param version
    virtual bool analyze_matrix(BlockedMatrix<Scalar>* mat) = 0;
    virtual bool analyze_matrix(BlockedMatrix<Scalar>* mat,
-                                BlockedMatrix<Scalar>* jacMat);
+                                BlockedMatrix<Scalar>* jacMat) = 0;

    // create/update preconditioner, probably used every linear solve
    // the version with two params can be overloaded, if not, it will default to using the one param version
    virtual bool create_preconditioner(BlockedMatrix<Scalar>* mat) = 0;
    virtual bool create_preconditioner(BlockedMatrix<Scalar>* mat,
-                                       BlockedMatrix<Scalar>* jacMat);
+                                       BlockedMatrix<Scalar>* jacMat) = 0;
 };

 } // namespace Opm::Accelerator
--- a/opm/simulators/linalg/bda/WellContributions.cpp
+++ b/opm/simulators/linalg/bda/WellContributions.cpp
@ -34,7 +34,7 @@
 #endif

 #ifdef HAVE_ROCSPARSE
-#include <opm/simulators/linalg/bda/rocsparseWellContributions.hpp>
+#include <opm/simulators/linalg/bda/rocm/rocsparseWellContributions.hpp>
 #endif

 namespace Opm {
--- a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu
+++ b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu
@ -38,10 +38,11 @@
 // otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
 #define COPY_ROW_BY_ROW 0

-#if HAVE_OPENMP
 #include <thread>
-#include <omp.h>
 extern std::shared_ptr<std::thread> copyThread;
+
+#if HAVE_OPENMP
+#include <omp.h>
 #endif // HAVE_OPENMP

 namespace Opm::Accelerator {
@ -342,11 +343,17 @@ copy_system_to_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
 #else
    cudaMemcpyAsync(d_bVals, matrix->nnzValues,
                    nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
-    if (useJacMatrix) {
+
+    bool use_multithreading = true;
 #if HAVE_OPENMP
-	if(omp_get_max_threads() > 1)
-	   copyThread->join();
+    if(omp_get_max_threads() == 1)
+        use_multithreading = false;
 #endif
+
+    if (useJacMatrix) {
+        if(use_multithreading)
+            copyThread->join();
+
        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
                        nnzbs_prec * block_size * block_size * sizeof(Scalar),
                        cudaMemcpyHostToDevice, stream);
@ -399,12 +406,17 @@ update_system_on_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
 #else
    cudaMemcpyAsync(d_bVals, matrix->nnzValues,
                    nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
-    if (useJacMatrix) {
+
+    bool use_multithreading = true;
 #if HAVE_OPENMP
-        if (omp_get_max_threads() > 1) {
-           copyThread->join();
-        }
+    if (omp_get_max_threads() == 1)
+        use_multithreading = false;
 #endif
+
+    if (useJacMatrix) {
+        if (use_multithreading)
+            copyThread->join();
+
        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, 
                        nnzbs_prec * block_size * block_size * sizeof(Scalar),
                        cudaMemcpyHostToDevice, stream);
--- a/opm/simulators/linalg/bda/opencl/CPR.cpp
+++ b/opm/simulators/linalg/bda/opencl/CPR.cpp
@ -1,580 +0,0 @@
-/*
-  Copyright 2021 Equinor ASA
-
-  This file is part of the Open Porous Media project (OPM).
-
-  OPM is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  OPM is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include <config.h>
-#include <opm/common/TimingMacros.hpp>
-#include <opm/common/OpmLog/OpmLog.hpp>
-#include <opm/common/ErrorMacros.hpp>
-#include <dune/common/timer.hh>
-
-#include <dune/common/shared_ptr.hh>
-
-#include <opm/simulators/linalg/PreconditionerFactory.hpp>
-#include <opm/simulators/linalg/PropertyTree.hpp>
-
-#include <opm/simulators/linalg/bda/BdaBridge.hpp>
-#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
-#include <opm/simulators/linalg/bda/opencl/CPR.hpp>
-#include <opm/simulators/linalg/bda/opencl/OpenclMatrix.hpp>
-#include <opm/simulators/linalg/bda/opencl/openclKernels.hpp>
-
-namespace Opm::Accelerator {
-
-using Dune::Timer;
-
-template<class Scalar, unsigned int block_size>
-CPR<Scalar,block_size>::CPR(bool opencl_ilu_parallel_, int verbosity_)
-    : Base(verbosity_)
-    , opencl_ilu_parallel(opencl_ilu_parallel_)
-{
-    bilu0 = std::make_unique<BILU0<Scalar,block_size> >(opencl_ilu_parallel, verbosity_);
-    diagIndices.resize(1);
-}
-
-template<class Scalar, unsigned int block_size>
-void CPR<Scalar,block_size>::
-setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_)
-{
-    context = context_;
-    queue = queue_;
-
-    bilu0->setOpencl(context, queue);
-}
-
-template<class Scalar, unsigned int block_size>
-bool CPR<Scalar,block_size>::analyze_matrix(BlockedMatrix<Scalar>* mat_)
-{
-    this->Nb = mat_->Nb;
-    this->nnzb = mat_->nnzbs;
-    this->N = Nb * block_size;
-    this->nnz = nnzb * block_size * block_size;
-
-    bool success = bilu0->analyze_matrix(mat_);
-    mat = mat_;
-    return success;
-}
-
-template<class Scalar, unsigned int block_size>
-bool CPR<Scalar,block_size>::
-analyze_matrix(BlockedMatrix<Scalar>* mat_, BlockedMatrix<Scalar>* jacMat)
-{
-    this->Nb = mat_->Nb;
-    this->nnzb = mat_->nnzbs;
-    this->N = Nb * block_size;
-    this->nnz = nnzb * block_size * block_size;
-
-    bool success = bilu0->analyze_matrix(mat_, jacMat);
-    mat = mat_;
-
-    return success;
-}
-
-template<class Scalar, unsigned int block_size>
-bool CPR<Scalar,block_size>::
-create_preconditioner(BlockedMatrix<Scalar>* mat_, BlockedMatrix<Scalar>* jacMat)
-{
-    Dune::Timer t_bilu0;
-    bool result = bilu0->create_preconditioner(mat_, jacMat);
-    if (verbosity >= 3) {
-        std::ostringstream out;
-        out << "CPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
-        OpmLog::info(out.str());
-    }
-
-    Dune::Timer t_amg;
-    create_preconditioner_amg(mat); // already points to bilu0::rmat if needed
-    if (verbosity >= 3) {
-        std::ostringstream out;
-        out << "CPR create_preconditioner_amg(): " << t_amg.stop() << " s";
-        OpmLog::info(out.str());
-    }
-    return result;
-}
-
-template<class Scalar, unsigned int block_size>
-bool CPR<Scalar,block_size>::
-create_preconditioner(BlockedMatrix<Scalar>* mat_)
-{
-    Dune::Timer t_bilu0;
-    bool result = bilu0->create_preconditioner(mat_);
-    if (verbosity >= 3) {
-        std::ostringstream out;
-        out << "CPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
-        OpmLog::info(out.str());
-    }
-
-    Dune::Timer t_amg;
-    create_preconditioner_amg(mat); // already points to bilu0::rmat if needed
-    if (verbosity >= 3) {
-        std::ostringstream out;
-        out << "CPR create_preconditioner_amg(): " << t_amg.stop() << " s";
-        OpmLog::info(out.str());
-    }
-    return result;
-}
-
-// return the absolute value of the N elements for which the absolute value is highest
-template<class Scalar>
-Scalar get_absmax(const Scalar* data, const int N)
-{
-    return std::abs(*std::max_element(data, data + N,
-                                      [](Scalar a, Scalar b)
-                                      { return std::fabs(a) < std::fabs(b); }));
-}
-
-// solve A^T * x = b
-template<class Scalar>
-void solve_transposed_3x3(const Scalar* A, const Scalar* b, Scalar* x)
-{
-    const int B = 3;
-    // from dune-common/densematrix.hh, but transposed, so replace [r*B+c] with [r+c*B]
-    Scalar t4  = A[0+0*B] * A[1+1*B];
-    Scalar t6  = A[0+0*B] * A[1+2*B];
-    Scalar t8  = A[0+1*B] * A[1+0*B];
-    Scalar t10 = A[0+2*B] * A[1+0*B];
-    Scalar t12 = A[0+1*B] * A[2+0*B];
-    Scalar t14 = A[0+2*B] * A[2+0*B];
-
-    Scalar d = (t4*A[2+2*B]-t6*A[2+1*B]-t8*A[2+2*B]+
-               t10*A[2+1*B]+t12*A[1+2*B]-t14*A[1+1*B]); // determinant
-
-    x[0] = (b[0]*A[1+1*B]*A[2+2*B] - b[0]*A[2+1*B]*A[1+2*B]
-          - b[1] *A[0+1*B]*A[2+2*B] + b[1]*A[2+1*B]*A[0+2*B]
-          + b[2] *A[0+1*B]*A[1+2*B] - b[2]*A[1+1*B]*A[0+2*B]) / d;
-
-    x[1] = (A[0+0*B]*b[1]*A[2+2*B] - A[0+0*B]*b[2]*A[1+2*B]
-          - A[1+0*B] *b[0]*A[2+2*B] + A[1+0*B]*b[2]*A[0+2*B]
-          + A[2+0*B] *b[0]*A[1+2*B] - A[2+0*B]*b[1]*A[0+2*B]) / d;
-
-    x[2] = (A[0+0*B]*A[1+1*B]*b[2] - A[0+0*B]*A[2+1*B]*b[1]
-          - A[1+0*B] *A[0+1*B]*b[2] + A[1+0*B]*A[2+1*B]*b[0]
-          + A[2+0*B] *A[0+1*B]*b[1] - A[2+0*B]*A[1+1*B]*b[0]) / d;
-}
-
-template<class Scalar, unsigned int block_size>
-void CPR<Scalar, block_size>::init_opencl_buffers()
-{
-    d_Amatrices.reserve(num_levels);
-    d_Rmatrices.reserve(num_levels - 1);
-    d_invDiags.reserve(num_levels - 1);
-    for (Matrix<Scalar>& m : Amatrices) {
-        d_Amatrices.emplace_back(context.get(), m.N, m.M, m.nnzs, 1);
-    }
-    for (Matrix<Scalar>& m : Rmatrices) {
-        d_Rmatrices.emplace_back(context.get(), m.N, m.M, m.nnzs, 1);
-        d_f.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.N);
-        d_u.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.N);
-
-        d_PcolIndices.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(int) * m.M);
-        d_invDiags.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.M); // create a cl::Buffer
-        d_t.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.M);
-    }
-    d_weights = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
-    d_rs = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
-    d_mat = std::make_unique<OpenclMatrix<Scalar>>(context.get(), Nb, Nb, nnzb, block_size);
-    d_coarse_y = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * Nb);
-    d_coarse_x = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * Nb);
-}
-
-template<class Scalar, unsigned int block_size>
-void CPR<Scalar,block_size>::opencl_upload()
-{
-    d_mat->upload(queue.get(), mat);
-
-    err = CL_SUCCESS;
-    events.resize(2 * Rmatrices.size() + 1);
-    err |= queue->enqueueWriteBuffer(*d_weights, CL_FALSE, 0,
-                                     sizeof(Scalar) * N, weights.data(), nullptr, &events[0]);
-    for (unsigned int i = 0; i < Rmatrices.size(); ++i) {
-        d_Amatrices[i].upload(queue.get(), &Amatrices[i]);
-
-        err |= queue->enqueueWriteBuffer(d_invDiags[i], CL_FALSE, 0,
-                                         sizeof(Scalar) * Amatrices[i].N, invDiags[i].data(),
-                                         nullptr, &events[2*i+1]);
-        err |= queue->enqueueWriteBuffer(d_PcolIndices[i], CL_FALSE, 0,
-                                         sizeof(int) * Amatrices[i].N, PcolIndices[i].data(),
-                                         nullptr, &events[2*i+2]);
-    }
-    cl::WaitForEvents(events);
-    events.clear();
-    if (err != CL_SUCCESS) {
-        // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
-        OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
-    }
-    for (unsigned int i = 0; i < Rmatrices.size(); ++i) {
-        d_Rmatrices[i].upload(queue.get(), &Rmatrices[i]);
-    }
-}
-
-template<class Scalar, unsigned int block_size>
-void CPR<Scalar,block_size>::
-create_preconditioner_amg(BlockedMatrix<Scalar>* mat_)
-{
-    this->mat = mat_;
-
-    coarse_vals.resize(nnzb);
-    coarse_x.resize(Nb);
-    coarse_y.resize(Nb);
-    weights.resize(N);
-
-    try {
-        Scalar rhs[] = {0, 0, 0};
-        rhs[pressure_idx] = 1;
-
-        // find diagonal index for each row
-        if (diagIndices[0].empty()) {
-            diagIndices[0].resize(Nb);
-            for (int row = 0; row < Nb; ++row) {
-                int start = mat->rowPointers[row];
-                int end = mat->rowPointers[row + 1];
-                auto candidate = std::find(mat->colIndices + start, mat->colIndices + end, row);
-                assert(candidate != mat->colIndices + end);
-                diagIndices[0][row] = candidate - mat->colIndices;
-            }
-        }
-
-        // calculate weights for each row
-        for (int row = 0; row < Nb; ++row) {
-            // solve to find weights
-            Scalar* row_weights = weights.data() + block_size * row; // weights for this row
-            solve_transposed_3x3(mat->nnzValues + block_size * block_size * diagIndices[0][row], rhs, row_weights);
-
-            // normalize weights for this row
-            Scalar abs_max = get_absmax(row_weights, block_size);
-            for (unsigned int i = 0; i < block_size; i++) {
-                row_weights[i] /= abs_max;
-            }
-        }
-
-        // extract pressure
-        // transform blocks to scalars to create scalar linear system
-        for (int row = 0; row < Nb; ++row) {
-            int start = mat->rowPointers[row];
-            int end = mat->rowPointers[row + 1];
-            for (int idx = start; idx < end; ++idx) {
-                Scalar* block = mat->nnzValues + idx * block_size * block_size;
-                Scalar* row_weights = weights.data() + block_size * row;
-                Scalar value = 0.0;
-                for (unsigned int i = 0; i < block_size; ++i) {
-                    value += block[block_size * i + pressure_idx] * row_weights[i];
-                }
-                coarse_vals[idx] = value;
-            }
-        }
-
-#if HAVE_MPI
-        using Communication = Dune::OwnerOverlapCopyCommunication<int, int>;
-#else
-        using Communication = Dune::Amg::SequentialInformation;
-#endif
-        using OverlapFlags = Dune::NegateSet<Communication::OwnerSet>;
-        if (recalculate_aggregates) {
-            dune_coarse = std::make_unique<DuneMat>(Nb, Nb, nnzb, DuneMat::row_wise);
-
-            using Iter = typename DuneMat::CreateIterator;
-
-            // setup sparsity pattern
-            for (Iter row = dune_coarse->createbegin(); row != dune_coarse->createend(); ++row) {
-                int start = mat->rowPointers[row.index()];
-                int end = mat->rowPointers[row.index() + 1];
-                for (int idx = start; idx < end; ++idx) {
-                    int col = mat->colIndices[idx];
-                    row.insert(col);
-                }
-            }
-
-            // set values
-            for (int row = 0; row < Nb; ++row) {
-                int start = mat->rowPointers[row];
-                int end = mat->rowPointers[row + 1];
-                for (int idx = start; idx < end; ++idx) {
-                    int col = mat->colIndices[idx];
-                    (*dune_coarse)[row][col] = coarse_vals[idx];
-                }
-            }
-
-            dune_op = std::make_shared<MatrixOperator>(*dune_coarse);
-            Dune::Amg::SequentialInformation seqinfo;
-            dune_amg = std::make_unique<DuneAmg>(dune_op, Dune::stackobject_to_shared_ptr(seqinfo));
-
-            PropertyTree property_tree;
-            property_tree.put("alpha", 0.333333333333);
-
-            // The matrix has a symmetric sparsity pattern, but the values are not symmetric
-            // Yet a SymmetricDependency is used in AMGCPR
-            // An UnSymmetricCriterion is also available
-            // using Criterion = Dune::Amg::CoarsenCriterion<Dune::Amg::UnSymmetricCriterion<DuneMat, Dune::Amg::FirstDiagonal> >;
-            using CriterionBase = Dune::Amg::AggregationCriterion<Dune::Amg::SymmetricDependency<DuneMat, Dune::Amg::FirstDiagonal>>;
-            using Criterion = Dune::Amg::CoarsenCriterion<CriterionBase>;
-            const Criterion c = Opm::AMGHelper<MatrixOperator,Dune::Amg::SequentialInformation,DuneMat,DuneVec>::criterion(property_tree);
-            num_pre_smooth_steps = c.getNoPreSmoothSteps();
-            num_post_smooth_steps = c.getNoPostSmoothSteps();
-
-            dune_amg->template build<OverlapFlags>(c);
-
-            analyzeHierarchy();
-            analyzeAggregateMaps();
-
-            recalculate_aggregates = false;
-        } else {
-            // update values of coarsest level in AMG
-            // this works because that level is actually a reference to the DuneMat held by dune_coarse
-            for (int row = 0; row < Nb; ++row) {
-                int start = mat->rowPointers[row];
-                int end = mat->rowPointers[row + 1];
-                for (int idx = start; idx < end; ++idx) {
-                    int col = mat->colIndices[idx];
-                    (*dune_coarse)[row][col] = coarse_vals[idx];
-                }
-            }
-
-            // update the rest of the AMG hierarchy
-            dune_amg->recalculateGalerkin(OverlapFlags());
-            analyzeHierarchy();
-        }
-
-        // initialize OpenclMatrices and Buffers if needed
-        auto init_func = std::bind(&CPR::init_opencl_buffers, this);
-        std::call_once(opencl_buffers_allocated, init_func);
-
-        // upload matrices and vectors to GPU
-        opencl_upload();
-
-    } catch (const std::exception& ex) {
-        std::cerr << "Caught exception: " << ex.what() << std::endl;
-        throw ex;
-    }
-}
-
-template<class Scalar, unsigned int block_size>
-void CPR<Scalar,block_size>::analyzeHierarchy()
-{
-    const typename DuneAmg::ParallelMatrixHierarchy& matrixHierarchy = dune_amg->matrices();
-
-    // store coarsest AMG level in umfpack format, also performs LU decomposition
-    umfpack.setMatrix((*matrixHierarchy.coarsest()).getmat());
-
-    num_levels = dune_amg->levels();
-    level_sizes.resize(num_levels);
-    diagIndices.resize(num_levels);
-
-    Amatrices.reserve(num_levels);
-    Rmatrices.reserve(num_levels - 1);  // coarsest level does not need one
-    invDiags.reserve(num_levels);
-
-    Amatrices.clear();
-    invDiags.clear();
-
-    // matrixIter.dereference() returns MatrixAdapter
-    // matrixIter.dereference().getmat() returns BCRSMat
-    typename DuneAmg::ParallelMatrixHierarchy::ConstIterator matrixIter = matrixHierarchy.finest();
-    for (int level = 0; level < num_levels; ++matrixIter, ++level) {
-        const auto& A = matrixIter.dereference().getmat();
-        level_sizes[level] = A.N();
-        diagIndices[level].reserve(A.N());
-
-        // extract matrix A
-        Amatrices.emplace_back(A.N(), A.nonzeroes());
-        // contiguous copy is not possible
-        // std::copy(&(A[0][0][0][0]), &(A[0][0][0][0]) + A.nonzeroes(), Amatrices.back().nnzValues.data());
-        // also update diagonal indices if needed, level 0 is already filled in create_preconditioner()
-        int nnz_idx = 0;
-        const bool fillDiagIndices = diagIndices[level].empty();
-        for (typename DuneMat::const_iterator r = A.begin(); r != A.end(); ++r) {
-            for (auto c = r->begin(); c != r->end(); ++c) {
-                Amatrices.back().nnzValues[nnz_idx] = A[r.index()][c.index()];
-                if (fillDiagIndices && r.index() == c.index()) {
-                    diagIndices[level].emplace_back(nnz_idx);
-                }
-                nnz_idx++;
-            }
-        }
-
-        BdaBridge<DuneMat, DuneVec, 1>::copySparsityPatternFromISTL(A, Amatrices.back().rowPointers,
-                                                                    Amatrices.back().colIndices);
-
-        // compute inverse diagonal values for current level
-        invDiags.emplace_back(A.N());
-        for (unsigned int row = 0; row < A.N(); ++row) {
-            invDiags.back()[row] = 1.0 / Amatrices.back().nnzValues[diagIndices[level][row]];
-        }
-    }
-}
-
-template<class Scalar, unsigned int block_size>
-void CPR<Scalar,block_size>::analyzeAggregateMaps()
-{
-    PcolIndices.resize(num_levels - 1);
-    Rmatrices.clear();
-
-    const typename DuneAmg::AggregatesMapList& aggregatesMaps = dune_amg->aggregatesMaps();
-
-    typename DuneAmg::AggregatesMapList::const_iterator mapIter = aggregatesMaps.begin();
-    for (int level = 0; level < num_levels - 1; ++mapIter, ++level) {
-        typename DuneAmg::AggregatesMap* map = *mapIter;
-
-        Rmatrices.emplace_back(level_sizes[level+1], level_sizes[level], level_sizes[level]);
-        std::fill(Rmatrices.back().nnzValues.begin(), Rmatrices.back().nnzValues.end(), 1.0);
-
-        // get indices for each row of P and R
-        std::vector<std::vector<unsigned>> indicesR(level_sizes[level+1]);
-        PcolIndices[level].resize(level_sizes[level]);
-
-        using AggregateIterator = typename DuneAmg::AggregatesMap::const_iterator;
-        for (AggregateIterator ai = map->begin(); ai != map->end(); ++ai) {
-            if (*ai != DuneAmg::AggregatesMap::ISOLATED) {
-                const long int diff = ai - map->begin();
-                PcolIndices[level][diff] = *ai;
-                indicesR[*ai].emplace_back(diff);
-            }
-        }
-
-        int col_idx = 0;
-        // set sparsity pattern of R
-        Rmatrices.back().rowPointers[0] = 0;
-        for (unsigned int i = 0; i < indicesR.size(); ++i) {
-            Rmatrices.back().rowPointers[i + 1] = Rmatrices.back().rowPointers[i] + indicesR[i].size();
-            for (auto it = indicesR[i].begin(); it != indicesR[i].end(); ++it) {
-                Rmatrices.back().colIndices[col_idx++] = *it;
-            }
-        }
-    }
-}
-
-template<class Scalar, unsigned int block_size>
-void CPR<Scalar,block_size>::amg_cycle_gpu(const int level, cl::Buffer& y, cl::Buffer& x)
-{
-    OpenclMatrix<Scalar>* A = &d_Amatrices[level];
-    OpenclMatrix<Scalar>* R = &d_Rmatrices[level];
-    int Ncur = A->Nb;
-
-    if (level == num_levels - 1) {
-        // solve coarsest level
-        std::vector<Scalar> h_y(Ncur), h_x(Ncur, 0);
-
-        events.resize(1);
-        err = queue->enqueueReadBuffer(y, CL_FALSE, 0,
-                                       sizeof(Scalar) * Ncur, h_y.data(), nullptr, &events[0]);
-        cl::WaitForEvents(events);
-        events.clear();
-        if (err != CL_SUCCESS) {
-            // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
-            OPM_THROW(std::logic_error, "CPR OpenCL enqueueReadBuffer error");
-        }
-
-        // solve coarsest level using umfpack
-        umfpack.apply(h_x.data(), h_y.data());
-
-        events.resize(1);
-        err = queue->enqueueWriteBuffer(x, CL_FALSE, 0,
-                                        sizeof(Scalar) * Ncur, h_x.data(), nullptr, &events[0]);
-        cl::WaitForEvents(events);
-        events.clear();
-        if (err != CL_SUCCESS) {
-            // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
-            OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
-        }
-        return;
-    }
-    int Nnext = d_Amatrices[level+1].Nb;
-
-    cl::Buffer& t = d_t[level];
-    cl::Buffer& f = d_f[level];
-    cl::Buffer& u = d_u[level]; // u was 0-initialized earlier
-
-    // presmooth
-    Scalar jacobi_damping = 0.65; // default value in amgcl: 0.72
-    for (unsigned i = 0; i < num_pre_smooth_steps; ++i) {
-        OpenclKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
-        OpenclKernels<Scalar>::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
-    }
-
-    // move to coarser level
-    OpenclKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
-    OpenclKernels<Scalar>::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true);
-    amg_cycle_gpu(level + 1, f, u);
-    OpenclKernels<Scalar>::prolongate_vector(u, x, d_PcolIndices[level], Ncur);
-
-    // postsmooth
-    for (unsigned i = 0; i < num_post_smooth_steps; ++i) {
-        OpenclKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers,
-                                        x, y, t, Ncur, 1);
-        OpenclKernels<Scalar>::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
-    }
-}
-
-// x = prec(y)
-template<class Scalar, unsigned int block_size>
-void CPR<Scalar,block_size>::apply_amg(const cl::Buffer& y, cl::Buffer& x)
-{
-    // 0-initialize u and x vectors
-    events.resize(d_u.size() + 1);
-    err = queue->enqueueFillBuffer(*d_coarse_x, 0, 0,
-                                   sizeof(Scalar) * Nb, nullptr, &events[0]);
-    for (unsigned int i = 0; i < d_u.size(); ++i) {
-        err |= queue->enqueueFillBuffer(d_u[i], 0, 0,
-                                        sizeof(Scalar) * Rmatrices[i].N, nullptr, &events[i + 1]);
-    }
-    cl::WaitForEvents(events);
-    events.clear();
-    if (err != CL_SUCCESS) {
-        // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
-        OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
-    }
-
-    OpenclKernels<Scalar>::residual(d_mat->nnzValues, d_mat->colIndices,
-                                    d_mat->rowPointers, x, y, *d_rs, Nb, block_size);
-    OpenclKernels<Scalar>::full_to_pressure_restriction(*d_rs, *d_weights, *d_coarse_y, Nb);
-
-    amg_cycle_gpu(0, *d_coarse_y, *d_coarse_x);
-
-    OpenclKernels<Scalar>::add_coarse_pressure_correction(*d_coarse_x, x, pressure_idx, Nb);
-}
-
-template<class Scalar, unsigned int block_size>
-void CPR<Scalar,block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
-{
-    Dune::Timer t_bilu0;
-    bilu0->apply(y, x);
-    if (verbosity >= 4) {
-        std::ostringstream out;
-        out << "CPR apply bilu0(): " << t_bilu0.stop() << " s";
-        OpmLog::info(out.str());
-    }
-
-    Dune::Timer t_amg;
-    apply_amg(y, x);
-    if (verbosity >= 4) {
-        std::ostringstream out;
-        out << "CPR apply amg(): " << t_amg.stop() << " s";
-        OpmLog::info(out.str());
-    }
-}
-
-#define INSTANCE_TYPE(T)     \
-    template class CPR<T,1>; \
-    template class CPR<T,2>; \
-    template class CPR<T,3>; \
-    template class CPR<T,4>; \
-    template class CPR<T,5>; \
-    template class CPR<T,6>;
-
-INSTANCE_TYPE(double)
-
-} // namespace Opm::Accelerator
--- a/opm/simulators/linalg/bda/opencl/Preconditioner.cpp
+++ b/opm/simulators/linalg/bda/opencl/Preconditioner.cpp
@ -1,87 +0,0 @@
-/*
-  Copyright 2021 Equinor ASA
-
-  This file is part of the Open Porous Media project (OPM).
-
-  OPM is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  OPM is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include <config.h>
-#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
-
-#include <opm/common/TimingMacros.hpp>
-#include <opm/common/ErrorMacros.hpp>
-
-#include <opm/simulators/linalg/bda/opencl/BILU0.hpp>
-#include <opm/simulators/linalg/bda/opencl/BISAI.hpp>
-#include <opm/simulators/linalg/bda/opencl/CPR.hpp>
-
-#include <memory>
-#include <string>
-
-namespace Opm::Accelerator {
-
-template<class Scalar, unsigned int block_size>
-void Preconditioner<Scalar,block_size>::
- setOpencl(std::shared_ptr<cl::Context>& context_,
-           std::shared_ptr<cl::CommandQueue>& queue_)
-{
-    context = context_;
-    queue = queue_;
-}
-
-template<class Scalar, unsigned int block_size>
-std::unique_ptr<Preconditioner<Scalar,block_size>>
-Preconditioner<Scalar,block_size>::create(Type type, bool opencl_ilu_parallel, int verbosity)
-{
-    switch (type ) {
-    case Type::BILU0:
-        return std::make_unique<BILU0<Scalar,block_size>>(opencl_ilu_parallel, verbosity);
-    case Type::CPR:
-        return std::make_unique<CPR<Scalar,block_size>>(opencl_ilu_parallel, verbosity);
-    case Type::BISAI:
-        return std::make_unique<BISAI<Scalar,block_size>>(opencl_ilu_parallel, verbosity);
-    }
-
-    OPM_THROW(std::logic_error,
-              "Invalid preconditioner type " + std::to_string(static_cast<int>(type)));
-}
-
-template<class Scalar, unsigned int block_size>
-bool Preconditioner<Scalar,block_size>::
-analyze_matrix(BlockedMatrix<Scalar>* mat,
-               [[maybe_unused]] BlockedMatrix<Scalar>* jacMat)
-{
-    return analyze_matrix(mat);
-}
-
-template<class Scalar, unsigned int block_size>
-bool Preconditioner<Scalar,block_size>::
-create_preconditioner(BlockedMatrix<Scalar>* mat,
-                      [[maybe_unused]] BlockedMatrix<Scalar>* jacMat)
-{
-    return create_preconditioner(mat);
-}
-
-#define INSTANCE_TYPE(T)                \
-    template class Preconditioner<T,1>; \
-    template class Preconditioner<T,2>; \
-    template class Preconditioner<T,3>; \
-    template class Preconditioner<T,4>; \
-    template class Preconditioner<T,5>; \
-    template class Preconditioner<T,6>;
-
-INSTANCE_TYPE(double)
-
-} // namespace Opm::Accelerator
--- a/opm/simulators/linalg/bda/opencl/openclBILU0.cpp
+++ b/opm/simulators/linalg/bda/opencl/openclBILU0.cpp
@ -24,19 +24,26 @@
 #include <opm/common/ErrorMacros.hpp>
 #include <dune/common/timer.hh>

-#include <opm/simulators/linalg/bda/opencl/BILU0.hpp>
+#include <opm/simulators/linalg/bda/opencl/openclBILU0.hpp>
 #include <opm/simulators/linalg/bda/opencl/ChowPatelIlu.hpp>
 #include <opm/simulators/linalg/bda/opencl/openclKernels.hpp>
 #include <opm/simulators/linalg/bda/Reorder.hpp>

 #include <sstream>

+#include <thread>
+extern std::shared_ptr<std::thread> copyThread;
+
+#if HAVE_OPENMP
+#include <omp.h>
+#endif //HAVE_OPENMP
+
 namespace Opm::Accelerator {

 using Dune::Timer;

 template<class Scalar, unsigned int block_size>
-BILU0<Scalar,block_size>::BILU0(bool opencl_ilu_parallel_, int verbosity_)
+openclBILU0<Scalar,block_size>::openclBILU0(bool opencl_ilu_parallel_, int verbosity_)
    : Base(verbosity_)
    , opencl_ilu_parallel(opencl_ilu_parallel_)
 {
@ -46,13 +53,13 @@ BILU0<Scalar,block_size>::BILU0(bool opencl_ilu_parallel_, int verbosity_)
 }

 template<class Scalar, unsigned int block_size>
-bool BILU0<Scalar,block_size>::analyze_matrix(BlockedMatrix<Scalar>* mat)
+bool openclBILU0<Scalar,block_size>::analyze_matrix(BlockedMatrix<Scalar>* mat)
 {
    return analyze_matrix(mat, nullptr);
 }

 template<class Scalar, unsigned int block_size>
-bool BILU0<Scalar,block_size>::
+bool openclBILU0<Scalar,block_size>::
 analyze_matrix(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
 {
    const unsigned int bs = block_size;
@ -80,7 +87,7 @@ analyze_matrix(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
                        CSCRowIndices.data(), CSCColPointers.data(), Nb);
        if(verbosity >= 3){
            std::ostringstream out;
-            out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
+            out << "openclBILU0 convert CSR to CSC: " << t_convert.stop() << " s";
            OpmLog::info(out.str());
        }
    } else {
@ -105,11 +112,11 @@ analyze_matrix(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
    }

    if (verbosity >= 1) {
-        out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n";
+        out << "openclBILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n";
    }

 #if CHOW_PATEL
-    out << "BILU0 CHOW_PATEL: " << CHOW_PATEL << ", CHOW_PATEL_GPU: " << CHOW_PATEL_GPU;
+    out << "openclBILU0 CHOW_PATEL: " << CHOW_PATEL << ", CHOW_PATEL_GPU: " << CHOW_PATEL_GPU;
 #endif
    OpmLog::info(out.str());

@ -169,25 +176,35 @@ analyze_matrix(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
    events.clear();
    if (err != CL_SUCCESS) {
        // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
-        OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
+        OPM_THROW(std::logic_error, "openclBILU0 OpenCL enqueueWriteBuffer error");
    }

    return true;
 }

 template<class Scalar, unsigned int block_size>
-bool BILU0<Scalar,block_size>::create_preconditioner(BlockedMatrix<Scalar>* mat)
+bool openclBILU0<Scalar,block_size>::create_preconditioner(BlockedMatrix<Scalar>* mat)
 {
    return create_preconditioner(mat, nullptr);
 }

 template<class Scalar, unsigned int block_size>
-bool BILU0<Scalar,block_size>::
+bool openclBILU0<Scalar,block_size>::
 create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
 {
    const unsigned int bs = block_size;

    auto *matToDecompose = jacMat ? jacMat : mat;
+    bool use_multithreading = true;
+
+#if HAVE_OPENMP
+    if (omp_get_max_threads() == 1)
+        use_multithreading = false;
+#endif
+
+    if (jacMat && use_multithreading) {
+        copyThread->join();
+    }

    // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
    Timer t_copy;
@ -196,7 +213,7 @@ create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)

    if (verbosity >= 3){
        std::ostringstream out;
-        out << "BILU0 memcpy: " << t_copy.stop() << " s";
+        out << "openclBILU0 memcpy: " << t_copy.stop() << " s";
        OpmLog::info(out.str());
    }

@ -239,12 +256,12 @@ create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
    events.clear();
    if (err != CL_SUCCESS) {
        // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
-        OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
+        OPM_THROW(std::logic_error, "openclBILU0 OpenCL enqueueWriteBuffer error");
    }

    if (verbosity >= 3) {
        std::ostringstream out;
-        out << "BILU0 copy to GPU: " << t_copyToGpu.stop() << " s";
+        out << "openclBILU0 copy to GPU: " << t_copyToGpu.stop() << " s";
        OpmLog::info(out.str());
    }

@ -264,7 +281,7 @@ create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)

    if (verbosity >= 3) {
        queue->finish();
-        out << "BILU0 decomposition: " << t_decomposition.stop() << " s";
+        out << "openclBILU0 decomposition: " << t_decomposition.stop() << " s";
        OpmLog::info(out.str());
    }
 #endif // CHOW_PATEL
@ -276,7 +293,7 @@ create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
 // however, if individual kernel calls are timed, waiting for events is needed
 // behavior on other GPUs is untested
 template<class Scalar, unsigned int block_size>
-void BILU0<Scalar,block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
+void openclBILU0<Scalar,block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
 {
    const Scalar relaxation = 0.9;
    cl::Event event;
@ -311,18 +328,18 @@ void BILU0<Scalar,block_size>::apply(const cl::Buffer& y, cl::Buffer& x)

    if (verbosity >= 4) {
        std::ostringstream out;
-        out << "BILU0 apply: " << t_apply.stop() << " s";
+        out << "openclBILU0 apply: " << t_apply.stop() << " s";
        OpmLog::info(out.str());
    }
 }

 #define INSTANCE_TYPE(T)       \
-    template class BILU0<T,1>; \
-    template class BILU0<T,2>; \
-    template class BILU0<T,3>; \
-    template class BILU0<T,4>; \
-    template class BILU0<T,5>; \
-    template class BILU0<T,6>;
+    template class openclBILU0<T,1>; \
+    template class openclBILU0<T,2>; \
+    template class openclBILU0<T,3>; \
+    template class openclBILU0<T,4>; \
+    template class openclBILU0<T,5>; \
+    template class openclBILU0<T,6>;

 INSTANCE_TYPE(double)

--- a/opm/simulators/linalg/bda/opencl/openclBILU0.hpp
+++ b/opm/simulators/linalg/bda/opencl/openclBILU0.hpp
@ -17,15 +17,15 @@
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef BILU0_HPP
-#define BILU0_HPP
+#ifndef OPM_OPENCLBILU0_HPP
+#define OPM_OPENCLBILU0_HPP

 #include <mutex>

 #include <opm/simulators/linalg/bda/BlockedMatrix.hpp>

 #include <opm/simulators/linalg/bda/opencl/opencl.hpp>
-#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
+#include <opm/simulators/linalg/bda/opencl/openclPreconditioner.hpp>
 #include <opm/simulators/linalg/bda/opencl/ChowPatelIlu.hpp>


@ -35,9 +35,9 @@ namespace Opm::Accelerator {
 /// The decomposition is done on GPU, using exact decomposition, or ChowPatel decomposition
 /// The preconditioner is applied via two exact triangular solves
 template<class Scalar, unsigned int block_size>
-class BILU0 : public Preconditioner<Scalar,block_size>
+class openclBILU0 : public openclPreconditioner<Scalar,block_size>
 {
-    using Base = Preconditioner<Scalar,block_size>;
+    using Base = openclPreconditioner<Scalar,block_size>;

    using Base::N;
    using Base::Nb;
@ -87,7 +87,7 @@ private:

 public:

-    BILU0(bool opencl_ilu_parallel, int verbosity);
+    openclBILU0(bool opencl_ilu_parallel, int verbosity);

    // analysis, extract parallelism if specified
    bool analyze_matrix(BlockedMatrix<Scalar>* mat) override;
@ -103,6 +103,7 @@ public:
    // via Lz = y
    // and Ux = z
    void apply(const cl::Buffer& y, cl::Buffer& x) override;
+    void apply(Scalar& y, Scalar& x) {}

    std::tuple<std::vector<int>, std::vector<int>, std::vector<int>>
    get_preconditioner_structure()
--- a/opm/simulators/linalg/bda/opencl/openclBISAI.cpp
+++ b/opm/simulators/linalg/bda/opencl/openclBISAI.cpp
@ -26,8 +26,8 @@

 #include <opm/simulators/linalg/bda/BdaSolver.hpp>
 #include <opm/simulators/linalg/bda/opencl/opencl.hpp>
-#include <opm/simulators/linalg/bda/opencl/BILU0.hpp>
-#include <opm/simulators/linalg/bda/opencl/BISAI.hpp>
+#include <opm/simulators/linalg/bda/opencl/openclBILU0.hpp>
+#include <opm/simulators/linalg/bda/opencl/openclBISAI.hpp>
 #include <opm/simulators/linalg/bda/opencl/openclKernels.hpp>
 #include <opm/simulators/linalg/bda/Reorder.hpp>
 #include <opm/simulators/linalg/bda/opencl/ChowPatelIlu.hpp> // disable BISAI if ChowPatel is selected
@ -40,17 +40,17 @@ using Opm::OpmLog;
 using Dune::Timer;

 template<class Scalar, unsigned int block_size>
-BISAI<Scalar,block_size>::BISAI(bool opencl_ilu_parallel_, int verbosity_)
+openclBISAI<Scalar,block_size>::openclBISAI(bool opencl_ilu_parallel_, int verbosity_)
    : Base(verbosity_)
 {
 #if CHOW_PATEL
    OPM_THROW(std::logic_error, "Error --linear-solver=isai cannot be used if ChowPatelIlu is used, probably defined by CMake\n");
 #endif
-    bilu0 = std::make_unique<BILU0<Scalar,block_size>>(opencl_ilu_parallel_, verbosity_);
+    bilu0 = std::make_unique<openclBILU0<Scalar,block_size>>(opencl_ilu_parallel_, verbosity_);
 }

 template<class Scalar, unsigned int block_size>
-void BISAI<Scalar,block_size>::
+void openclBISAI<Scalar,block_size>::
 setOpencl(std::shared_ptr<cl::Context>& context_,
          std::shared_ptr<cl::CommandQueue>& queue_)
 {
@ -79,13 +79,13 @@ buildCsrToCscOffsetMap(std::vector<int> colPointers, std::vector<int> rowIndices
 }

 template<class Scalar, unsigned int block_size>
-bool BISAI<Scalar,block_size>::analyze_matrix(BlockedMatrix<Scalar>* mat)
+bool openclBISAI<Scalar,block_size>::analyze_matrix(BlockedMatrix<Scalar>* mat)
 {
    return analyze_matrix(mat, nullptr);
 }

 template<class Scalar, unsigned int block_size>
-bool BISAI<Scalar,block_size>::
+bool openclBISAI<Scalar,block_size>::
 analyze_matrix(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
 {
    const unsigned int bs = block_size;
@ -108,7 +108,7 @@ analyze_matrix(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
 }

 template<class Scalar, unsigned int block_size>
-void BISAI<Scalar,block_size>::buildLowerSubsystemsStructures()
+void openclBISAI<Scalar,block_size>::buildLowerSubsystemsStructures()
 {
    lower.subsystemPointers.assign(Nb + 1, 0);

@ -138,14 +138,14 @@ void BISAI<Scalar,block_size>::buildLowerSubsystemsStructures()

    if (verbosity >= 4) {
        std::ostringstream out;
-        out << "BISAI buildLowerSubsystemsStructures time: "
+        out << "openclBISAI buildLowerSubsystemsStructures time: "
            << t_buildLowerSubsystemsStructures.stop() << " s";
        OpmLog::info(out.str());
    }
 }

 template<class Scalar, unsigned int block_size>
-void BISAI<Scalar,block_size>::buildUpperSubsystemsStructures()
+void openclBISAI<Scalar,block_size>::buildUpperSubsystemsStructures()
 {
    upper.subsystemPointers.assign(Nb + 1, 0);

@ -175,14 +175,14 @@ void BISAI<Scalar,block_size>::buildUpperSubsystemsStructures()

    if (verbosity >= 4) {
        std::ostringstream out;
-        out << "BISAI buildUpperSubsystemsStructures time: "
+        out << "openclBISAI buildUpperSubsystemsStructures time: "
            << t_buildUpperSubsystemsStructures.stop() << " s";
        OpmLog::info(out.str());
    }
 }

 template<class Scalar, unsigned int block_size>
-bool BISAI<Scalar,block_size>::
+bool openclBISAI<Scalar,block_size>::
 create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
 {
    const unsigned int bs = block_size;
@ -300,7 +300,7 @@ create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)

        if (err != CL_SUCCESS) {
            // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
-            OPM_THROW(std::logic_error, "BISAI OpenCL enqueueWriteBuffer error");
+            OPM_THROW(std::logic_error, "openclBISAI OpenCL enqueueWriteBuffer error");
        }
    });

@ -326,7 +326,7 @@ create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)

    if (verbosity >= 4) {
        std::ostringstream out;
-        out << "BISAI createPreconditioner time: " << t_preconditioner.stop() << " s";
+        out << "openclBISAI createPreconditioner time: " << t_preconditioner.stop() << " s";
        OpmLog::info(out.str());
    }

@ -334,14 +334,14 @@ create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
 }

 template<class Scalar, unsigned int block_size>
-bool BISAI<Scalar,block_size>::
+bool openclBISAI<Scalar,block_size>::
 create_preconditioner(BlockedMatrix<Scalar>* mat)
 {
    return create_preconditioner(mat, nullptr);
 }

 template<class Scalar, unsigned int block_size>
-void BISAI<Scalar,block_size>::apply(const cl::Buffer& x, cl::Buffer& y)
+void openclBISAI<Scalar,block_size>::apply(const cl::Buffer& x, cl::Buffer& y)
 {
    const unsigned int bs = block_size;

@ -354,12 +354,12 @@ void BISAI<Scalar,block_size>::apply(const cl::Buffer& x, cl::Buffer& y)
 }

 #define INSTANCE_TYPE(T)       \
-    template class BISAI<T,1>; \
-    template class BISAI<T,2>; \
-    template class BISAI<T,3>; \
-    template class BISAI<T,4>; \
-    template class BISAI<T,5>; \
-    template class BISAI<T,6>;
+    template class openclBISAI<T,1>; \
+    template class openclBISAI<T,2>; \
+    template class openclBISAI<T,3>; \
+    template class openclBISAI<T,4>; \
+    template class openclBISAI<T,5>; \
+    template class openclBISAI<T,6>;

 INSTANCE_TYPE(double)

--- a/opm/simulators/linalg/bda/opencl/openclBISAI.hpp
+++ b/opm/simulators/linalg/bda/opencl/openclBISAI.hpp
@ -17,14 +17,14 @@
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef BISAI_HPP
-#define BISAI_HPP
+#ifndef OPM_OPENCLBISAI_HPP
+#define OPM_OPENCLBISAI_HPP

 #include <mutex>

 #include <opm/simulators/linalg/bda/opencl/opencl.hpp>
-#include <opm/simulators/linalg/bda/opencl/BILU0.hpp>
-#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
+#include <opm/simulators/linalg/bda/opencl/openclBILU0.hpp>
+#include <opm/simulators/linalg/bda/opencl/openclPreconditioner.hpp>

 namespace Opm::Accelerator {

@ -33,9 +33,9 @@ template<class Scalar> class BlockedMatrix;
 /// This class implements a Blocked version of the Incomplete Sparse Approximate Inverse (ISAI) preconditioner.
 /// Inspired by the paper "Incomplete Sparse Approximate Inverses for Parallel Preconditioning" by Anzt et. al.
 template<class Scalar, unsigned int block_size>
-class BISAI : public Preconditioner<Scalar,block_size>
+class openclBISAI : public openclPreconditioner<Scalar,block_size>
 {
-    using Base = Preconditioner<Scalar,block_size>;
+    using Base = openclPreconditioner<Scalar,block_size>;

    using Base::N;
    using Base::Nb;
@ -68,7 +68,7 @@ private:
    cl::Buffer d_invL_x;

    bool opencl_ilu_parallel;
-    std::unique_ptr<BILU0<Scalar,block_size>> bilu0;
+    std::unique_ptr<openclBILU0<Scalar,block_size>> bilu0;

    /// Struct that holds the structure of the small subsystems for each column
    struct subsystemStructure {
@ -107,7 +107,7 @@ private:
    void buildUpperSubsystemsStructures();

 public:
-    BISAI(bool opencl_ilu_parallel, int verbosity);
+    openclBISAI(bool opencl_ilu_parallel, int verbosity);

    // set own Opencl variables, but also that of the bilu0 preconditioner
    void setOpencl(std::shared_ptr<cl::Context>& context,
@ -125,6 +125,7 @@ public:

    // apply preconditioner, x = prec(y)
    void apply(const cl::Buffer& y, cl::Buffer& x) override;
+    void apply(Scalar& y, Scalar& x) {}
 };

 /// Similar function to csrPatternToCsc. It gives an offset map from CSR to CSC instead of the full CSR to CSC conversion.
--- a/opm/simulators/linalg/bda/opencl/openclCPR.cpp
+++ b/opm/simulators/linalg/bda/opencl/openclCPR.cpp
@ -0,0 +1,321 @@
+/*
+  Copyright 2021 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+#include <opm/common/TimingMacros.hpp>
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+#include <dune/common/timer.hh>
+
+#include <dune/common/shared_ptr.hh>
+
+#include <opm/simulators/linalg/PreconditionerFactory.hpp>
+#include <opm/simulators/linalg/PropertyTree.hpp>
+
+#include <opm/simulators/linalg/bda/BdaBridge.hpp>
+#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
+#include <opm/simulators/linalg/bda/opencl/openclCPR.hpp>
+#include <opm/simulators/linalg/bda/opencl/OpenclMatrix.hpp>
+#include <opm/simulators/linalg/bda/opencl/openclKernels.hpp>
+
+#include <opm/simulators/linalg/bda/Misc.hpp>
+
+namespace Opm::Accelerator {
+
+using Dune::Timer;
+
+template<class Scalar, unsigned int block_size>
+openclCPR<Scalar,block_size>::openclCPR(bool opencl_ilu_parallel_, int verbosity_)
+    : Base(verbosity_)
+    , opencl_ilu_parallel(opencl_ilu_parallel_)
+{
+    bilu0 = std::make_unique<openclBILU0<Scalar,block_size> >(opencl_ilu_parallel, verbosity_);
+}
+
+template<class Scalar, unsigned int block_size>
+void openclCPR<Scalar,block_size>::
+setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
+    context = context_;
+    queue = queue_;
+
+    bilu0->setOpencl(context, queue);
+}
+
+template<class Scalar, unsigned int block_size>
+bool openclCPR<Scalar,block_size>::
+analyze_matrix(BlockedMatrix<Scalar>* mat_) {
+    this->Nb = mat_->Nb;
+    this->nnzb = mat_->nnzbs;
+    this->N = this->Nb * block_size;
+    this->nnz = this->nnzb * block_size * block_size;
+
+    bool success = bilu0->analyze_matrix(mat_);
+    this->mat = mat_;
+    return success;
+}
+
+template<class Scalar, unsigned int block_size>
+bool openclCPR<Scalar,block_size>::
+analyze_matrix(BlockedMatrix<Scalar>* mat_, BlockedMatrix<Scalar>* jacMat) {
+    this->Nb = mat_->Nb;
+    this->nnzb = mat_->nnzbs;
+    this->N = this->Nb * block_size;
+    this->nnz = this->nnzb * block_size * block_size;
+
+    bool success = bilu0->analyze_matrix(mat_, jacMat);
+    this->mat = mat_;
+
+    return success;
+}
+
+template<class Scalar, unsigned int block_size>
+bool openclCPR<Scalar,block_size>::
+create_preconditioner(BlockedMatrix<Scalar>* mat_, BlockedMatrix<Scalar>* jacMat) {
+    Dune::Timer t_bilu0;
+    bool result = bilu0->create_preconditioner(mat_, jacMat);
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "openclCPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
+        OpmLog::info(out.str());
+    }
+
+    Dune::Timer t_amg;
+    this->create_preconditioner_amg(this->mat); // already points to bilu0::rmat if needed
+    
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "openclCPR create_preconditioner_amg(): " << t_amg.stop() << " s";
+        OpmLog::info(out.str());
+    }
+    
+    // initialize OpenclMatrices and Buffers if needed
+    auto init_func = std::bind(&openclCPR::init_opencl_buffers, this);
+    std::call_once(opencl_buffers_allocated, init_func);
+
+    // upload matrices and vectors to GPU
+    opencl_upload();
+
+    return result;
+}
+
+template<class Scalar, unsigned int block_size>
+bool openclCPR<Scalar,block_size>::
+create_preconditioner(BlockedMatrix<Scalar>* mat_) {
+    Dune::Timer t_bilu0;
+    bool result = bilu0->create_preconditioner(mat_);
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "openclCPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
+        OpmLog::info(out.str());
+    }
+
+    Dune::Timer t_amg;
+    this->create_preconditioner_amg(this->mat); // already points to bilu0::rmat if needed
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "openclCPR create_preconditioner_amg(): " << t_amg.stop() << " s";
+        OpmLog::info(out.str());
+    }
+    
+    // initialize OpenclMatrices and Buffers if needed
+    auto init_func = std::bind(&openclCPR::init_opencl_buffers, this);
+    std::call_once(opencl_buffers_allocated, init_func);
+
+    // upload matrices and vectors to GPU
+    opencl_upload();
+
+    return result;
+}
+
+template<class Scalar, unsigned int block_size>
+void openclCPR<Scalar, block_size>::
+init_opencl_buffers() {
+    d_Amatrices.reserve(this->num_levels);
+    d_Rmatrices.reserve(this->num_levels - 1);
+    d_invDiags.reserve(this->num_levels - 1);
+    for (Matrix<Scalar>& m : this->Amatrices) {
+        d_Amatrices.emplace_back(context.get(), m.N, m.M, m.nnzs, 1);
+    }
+    for (Matrix<Scalar>& m : this->Rmatrices) {
+        d_Rmatrices.emplace_back(context.get(), m.N, m.M, m.nnzs, 1);
+        d_f.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.N);
+        d_u.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.N);
+
+        d_PcolIndices.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(int) * m.M);
+        d_invDiags.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.M); // create a cl::Buffer
+        d_t.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.M);
+    }
+    d_weights = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * this->N);
+    d_rs = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * this->N);
+    d_mat = std::make_unique<OpenclMatrix<Scalar>>(context.get(), this->Nb, this->Nb, this->nnzb, block_size);
+    d_coarse_y = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * this->Nb);
+    d_coarse_x = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * this->Nb);
+}
+
+template<class Scalar, unsigned int block_size>
+void openclCPR<Scalar,block_size>::opencl_upload()
+{
+    d_mat->upload(queue.get(), this->mat);
+
+    err = CL_SUCCESS;
+    events.resize(2 * this->Rmatrices.size() + 1);
+    err |= queue->enqueueWriteBuffer(*d_weights, CL_FALSE, 0,
+                                     sizeof(Scalar) * this->N, this->weights.data(), nullptr, &events[0]);
+    for (unsigned int i = 0; i < this->Rmatrices.size(); ++i) {
+        d_Amatrices[i].upload(queue.get(), &this->Amatrices[i]);
+
+        err |= queue->enqueueWriteBuffer(d_invDiags[i], CL_FALSE, 0,
+                                         sizeof(Scalar) * this->Amatrices[i].N, this->invDiags[i].data(),
+                                         nullptr, &events[2*i+1]);
+        err |= queue->enqueueWriteBuffer(d_PcolIndices[i], CL_FALSE, 0,
+                                         sizeof(int) * this->Amatrices[i].N, this->PcolIndices[i].data(),
+                                         nullptr, &events[2*i+2]);
+    }
+    cl::WaitForEvents(events);
+    events.clear();
+    if (err != CL_SUCCESS) {
+        // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
+        OPM_THROW(std::logic_error, "openclCPR OpenCL enqueueWriteBuffer error");
+    }
+    for (unsigned int i = 0; i < this->Rmatrices.size(); ++i) {
+        d_Rmatrices[i].upload(queue.get(), &this->Rmatrices[i]);
+    }
+}
+
+template<class Scalar, unsigned int block_size>
+void openclCPR<Scalar,block_size>::amg_cycle_gpu(const int level, cl::Buffer& y, cl::Buffer& x)
+{
+    OpenclMatrix<Scalar>* A = &d_Amatrices[level];
+    OpenclMatrix<Scalar>* R = &d_Rmatrices[level];
+    int Ncur = A->Nb;
+
+    if (level == this->num_levels - 1) {
+        // solve coarsest level
+        std::vector<Scalar> h_y(Ncur), h_x(Ncur, 0);
+
+        events.resize(1);
+        err = queue->enqueueReadBuffer(y, CL_FALSE, 0,
+                                       sizeof(Scalar) * Ncur, h_y.data(), nullptr, &events[0]);
+        cl::WaitForEvents(events);
+        events.clear();
+        if (err != CL_SUCCESS) {
+            // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
+            OPM_THROW(std::logic_error, "openclCPR OpenCL enqueueReadBuffer error");
+        }
+
+        // solve coarsest level using umfpack
+        this->umfpack.apply(h_x.data(), h_y.data());
+
+        events.resize(1);
+        err = queue->enqueueWriteBuffer(x, CL_FALSE, 0,
+                                        sizeof(Scalar) * Ncur, h_x.data(), nullptr, &events[0]);
+        cl::WaitForEvents(events);
+        events.clear();
+        if (err != CL_SUCCESS) {
+            // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
+            OPM_THROW(std::logic_error, "openclCPR OpenCL enqueueWriteBuffer error");
+        }
+        return;
+    }
+    int Nnext = d_Amatrices[level+1].Nb;
+
+    cl::Buffer& t = d_t[level];
+    cl::Buffer& f = d_f[level];
+    cl::Buffer& u = d_u[level]; // u was 0-initialized earlier
+
+    // presmooth
+    Scalar jacobi_damping = 0.65; // default value in amgcl: 0.72
+    for (unsigned i = 0; i < this->num_pre_smooth_steps; ++i) {
+        OpenclKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
+        OpenclKernels<Scalar>::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
+    }
+
+    // move to coarser level
+    OpenclKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
+    OpenclKernels<Scalar>::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true);
+    amg_cycle_gpu(level + 1, f, u);
+    OpenclKernels<Scalar>::prolongate_vector(u, x, d_PcolIndices[level], Ncur);
+
+    // postsmooth
+    for (unsigned i = 0; i < this->num_post_smooth_steps; ++i) {
+        OpenclKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers,
+                                        x, y, t, Ncur, 1);
+        OpenclKernels<Scalar>::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
+    }
+}
+
+// x = prec(y)
+template<class Scalar, unsigned int block_size>
+void openclCPR<Scalar,block_size>::apply_amg(const cl::Buffer& y, cl::Buffer& x)
+{
+    // 0-initialize u and x vectors
+    events.resize(d_u.size() + 1);
+    err = queue->enqueueFillBuffer(*d_coarse_x, 0, 0,
+                                   sizeof(Scalar) * this->Nb, nullptr, &events[0]);
+    for (unsigned int i = 0; i < d_u.size(); ++i) {
+        err |= queue->enqueueFillBuffer(d_u[i], 0, 0,
+                                        sizeof(Scalar) * this->Rmatrices[i].N, nullptr, &events[i + 1]);
+    }
+    cl::WaitForEvents(events);
+    events.clear();
+    if (err != CL_SUCCESS) {
+        // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
+        OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
+    }
+
+    OpenclKernels<Scalar>::residual(d_mat->nnzValues, d_mat->colIndices,
+                                    d_mat->rowPointers, x, y, *d_rs, this->Nb, block_size);
+    OpenclKernels<Scalar>::full_to_pressure_restriction(*d_rs, *d_weights, *d_coarse_y, this->Nb);
+
+    amg_cycle_gpu(0, *d_coarse_y, *d_coarse_x);
+
+    OpenclKernels<Scalar>::add_coarse_pressure_correction(*d_coarse_x, x, this->pressure_idx, this->Nb);
+}
+
+template<class Scalar, unsigned int block_size>
+void openclCPR<Scalar,block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
+{
+    Dune::Timer t_bilu0;
+    bilu0->apply(y, x);
+    if (verbosity >= 4) {
+        std::ostringstream out;
+        out << "openclCPR apply bilu0(): " << t_bilu0.stop() << " s";
+        OpmLog::info(out.str());
+    }
+
+    Dune::Timer t_amg;
+    apply_amg(y, x);
+    if (verbosity >= 4) {
+        std::ostringstream out;
+        out << "openclCPR apply amg(): " << t_amg.stop() << " s";
+        OpmLog::info(out.str());
+    }
+}
+
+#define INSTANCE_TYPE(T)     \
+    template class openclCPR<T,1>; \
+    template class openclCPR<T,2>; \
+    template class openclCPR<T,3>; \
+    template class openclCPR<T,4>; \
+    template class openclCPR<T,5>; \
+    template class openclCPR<T,6>;
+
+INSTANCE_TYPE(double)
+
+} // namespace Opm::Accelerator
--- a/opm/simulators/linalg/bda/opencl/openclCPR.hpp
+++ b/opm/simulators/linalg/bda/opencl/openclCPR.hpp
@ -17,8 +17,8 @@
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef OPM_CPR_HPP
-#define OPM_CPR_HPP
+#ifndef OPM_OPENCLCPR_HPP
+#define OPM_OPENCLCPR_HPP

 #include <mutex>

@ -26,10 +26,11 @@
 #include <dune/istl/umfpack.hh>

 #include <opm/simulators/linalg/bda/opencl/opencl.hpp>
-#include <opm/simulators/linalg/bda/opencl/BILU0.hpp>
+#include <opm/simulators/linalg/bda/opencl/openclBILU0.hpp>
 #include <opm/simulators/linalg/bda/Matrix.hpp>
+#include <opm/simulators/linalg/bda/CprCreation.hpp>
 #include <opm/simulators/linalg/bda/opencl/OpenclMatrix.hpp>
-#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
+#include <opm/simulators/linalg/bda/opencl/openclPreconditioner.hpp>

 #include <opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp>

@ -39,9 +40,9 @@ template<class Scalar> class BlockedMatrix;

 /// This class implements a Constrained Pressure Residual (CPR) preconditioner
 template<class Scalar, unsigned int block_size>
-class CPR : public Preconditioner<Scalar,block_size>
+class openclCPR : public openclPreconditioner<Scalar,block_size>, public CprCreation<Scalar, block_size>
 {
-    using Base = Preconditioner<Scalar,block_size>;
+    using Base = openclPreconditioner<Scalar,block_size>;

    using Base::N;
    using Base::Nb;
@ -54,13 +55,8 @@ class CPR : public Preconditioner<Scalar,block_size>
    using Base::err;

 private:
-    int num_levels;
-    std::vector<Scalar> weights, coarse_vals, coarse_x, coarse_y;
-    std::vector<Matrix<Scalar>> Amatrices, Rmatrices; // scalar matrices that represent the AMG hierarchy
    std::vector<OpenclMatrix<Scalar>> d_Amatrices, d_Rmatrices; // scalar matrices that represent the AMG hierarchy
-    std::vector<std::vector<int> > PcolIndices; // prolongation does not need a full matrix, only store colIndices
    std::vector<cl::Buffer> d_PcolIndices;
-    std::vector<std::vector<Scalar>> invDiags; // inverse of diagonal of Amatrices
    std::vector<cl::Buffer> d_invDiags;
    std::vector<cl::Buffer> d_t, d_f, d_u; // intermediate vectors used during amg cycle
    std::unique_ptr<cl::Buffer> d_rs;      // use before extracting the pressure
@ -69,35 +65,11 @@ private:
    std::unique_ptr<cl::Buffer> d_coarse_y, d_coarse_x; // stores the scalar vectors
    std::once_flag opencl_buffers_allocated;  // only allocate OpenCL Buffers once

-    std::unique_ptr<BILU0<Scalar,block_size>> bilu0;                    // Blocked ILU0 preconditioner
-    BlockedMatrix<Scalar>* mat = nullptr;    // input matrix, blocked
-
-    using DuneMat = Dune::BCRSMatrix<Dune::FieldMatrix<Scalar, 1, 1> >;
-    using DuneVec = Dune::BlockVector<Dune::FieldVector<Scalar, 1> >;
-    using MatrixOperator = Dune::MatrixAdapter<DuneMat, DuneVec, DuneVec>;
-    using DuneAmg = Dune::Amg::MatrixHierarchy<MatrixOperator, Dune::Amg::SequentialInformation>;
-    std::unique_ptr<DuneAmg> dune_amg;
-    std::unique_ptr<DuneMat> dune_coarse;       // extracted pressure matrix, finest level in AMG hierarchy
-    std::shared_ptr<MatrixOperator> dune_op;    // operator, input to Dune AMG
-    std::vector<int> level_sizes;               // size of each level in the AMG hierarchy
-    std::vector<std::vector<int> > diagIndices; // index of diagonal value for each level
-    Dune::UMFPack<DuneMat> umfpack;             // dune/istl/umfpack object used to solve the coarsest level of AMG
-    bool always_recalculate_aggregates = false; // OPM always reuses the aggregates by default
-    bool recalculate_aggregates = true;         // only rerecalculate if true
-    const int pressure_idx = 1;                 // hardcoded to mimic OPM
-    unsigned num_pre_smooth_steps;              // number of Jacobi smooth steps before restriction
-    unsigned num_post_smooth_steps;             // number of Jacobi smooth steps after prolongation
+    std::unique_ptr<openclBILU0<Scalar,block_size>> bilu0;                    // Blocked ILU0 preconditioner

    std::unique_ptr<openclSolverBackend<Scalar,1>> coarse_solver; // coarse solver is scalar
    bool opencl_ilu_parallel;                   // whether ILU0 operation should be parallelized

-    // Analyze the AMG hierarchy build by Dune
-    void analyzeHierarchy();
-
-    // Analyze the aggregateMaps from the AMG hierarchy
-    // These can be reused, so only use when recalculate_aggregates is true
-    void analyzeAggregateMaps();
-
    // Initialize and allocate matrices and vectors
    void init_opencl_buffers();

@ -109,10 +81,8 @@ private:

    void amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &x);

-    void create_preconditioner_amg(BlockedMatrix<Scalar>* mat);
-
 public:
-    CPR(bool opencl_ilu_parallel, int verbosity);
+    openclCPR(bool opencl_ilu_parallel, int verbosity);

    bool analyze_matrix(BlockedMatrix<Scalar>* mat) override;
    bool analyze_matrix(BlockedMatrix<Scalar>* mat,
@ -125,18 +95,13 @@ public:
    // applies blocked ilu0
    // also applies amg for pressure component
    void apply(const cl::Buffer& y, cl::Buffer& x) override;
+    void apply(Scalar& y, Scalar& x) {}

    bool create_preconditioner(BlockedMatrix<Scalar>* mat) override;
    bool create_preconditioner(BlockedMatrix<Scalar>* mat,
                               BlockedMatrix<Scalar>* jacMat) override;
 };

-// solve A^T * x = b
-// A should represent a 3x3 matrix
-// x and b are vectors with 3 elements
-template<class Scalar>
-void solve_transposed_3x3(const Scalar* A, const Scalar* b, Scalar* x);
-
 } // namespace Opm::Accelerator

 #endif
--- a/opm/simulators/linalg/bda/opencl/openclKernels.cpp
+++ b/opm/simulators/linalg/bda/opencl/openclKernels.cpp
@ -26,6 +26,8 @@

 #include <opm/simulators/linalg/bda/opencl/ChowPatelIlu.hpp>  // defines CHOW_PATEL

+#include <opm/simulators/linalg/bda/Misc.hpp>
+
 #include <cmath>
 #include <sstream>

@ -84,12 +86,6 @@ std::unique_ptr<isaiL_kernel_type> OpenclKernels<Scalar>::isaiL_k;
 template<class Scalar>
 std::unique_ptr<isaiU_kernel_type> OpenclKernels<Scalar>::isaiU_k;

-// divide A by B, and round up: return (int)ceil(A/B)
-unsigned int ceilDivision(const unsigned int A, const unsigned int B)
-{
-    return A / B + (A % B > 0);
-}
-
 template<class Scalar>
 void OpenclKernels<Scalar>::init(cl::Context *context,
                                 cl::CommandQueue *queue_,
--- a/opm/simulators/linalg/bda/opencl/openclPreconditioner.cpp
+++ b/opm/simulators/linalg/bda/opencl/openclPreconditioner.cpp
@ -0,0 +1,74 @@
+/*
+  Copyright 2021 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+
+#include <opm/common/TimingMacros.hpp>
+#include <opm/common/ErrorMacros.hpp>
+
+#include <opm/simulators/linalg/bda/opencl/openclBILU0.hpp>
+#include <opm/simulators/linalg/bda/opencl/openclBISAI.hpp>
+#include <opm/simulators/linalg/bda/opencl/openclCPR.hpp>
+#include <opm/simulators/linalg/bda/opencl/openclPreconditioner.hpp>
+
+#include <memory>
+#include <string>
+
+namespace Opm::Accelerator {
+
+template<class Scalar, unsigned int block_size>
+std::unique_ptr<openclPreconditioner<Scalar,block_size>>
+openclPreconditioner<Scalar,block_size>::
+create(PreconditionerType type,
+       int verbosity,
+       bool opencl_ilu_parallel)
+{
+    switch (type ) {
+    case PreconditionerType::BILU0:
+        return std::make_unique<openclBILU0<Scalar,block_size>>(opencl_ilu_parallel, verbosity);
+    case PreconditionerType::CPR:
+        return std::make_unique<openclCPR<Scalar,block_size>>(opencl_ilu_parallel, verbosity);
+    case PreconditionerType::BISAI:
+        return std::make_unique<openclBISAI<Scalar,block_size>>(opencl_ilu_parallel, verbosity);
+    }
+
+    OPM_THROW(std::logic_error,
+              "Invalid preconditioner type " + std::to_string(static_cast<int>(type)));
+}
+
+template<class Scalar, unsigned int block_size>
+void openclPreconditioner<Scalar,block_size>::
+setOpencl(std::shared_ptr<cl::Context>& context_,
+          std::shared_ptr<cl::CommandQueue>& queue_)
+{
+    context = context_;
+    queue = queue_;
+}
+
+#define INSTANCE_TYPE(T)                \
+    template class openclPreconditioner<T,1>; \
+    template class openclPreconditioner<T,2>; \
+    template class openclPreconditioner<T,3>; \
+    template class openclPreconditioner<T,4>; \
+    template class openclPreconditioner<T,5>; \
+    template class openclPreconditioner<T,6>;
+
+INSTANCE_TYPE(double)
+
+} // namespace Opm::Accelerator
--- a/opm/simulators/linalg/bda/opencl/openclPreconditioner.hpp
+++ b/opm/simulators/linalg/bda/opencl/openclPreconditioner.hpp
@ -0,0 +1,62 @@
+/*
+  Copyright 2021 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef OPM_OPENCLPRECONDITIONER_HEADER_INCLUDED
+#define OPM_OPENCLPRECONDITIONER_HEADER_INCLUDED
+
+#include <opm/simulators/linalg/bda/opencl/opencl.hpp>
+#include <opm/simulators/linalg/bda/Preconditioner.hpp>
+
+namespace Opm::Accelerator {
+
+template<class Scalar> class BlockedMatrix;
+
+template <class Scalar, unsigned int block_size>
+class openclPreconditioner : public Preconditioner<Scalar, block_size>
+{
+
+protected:
+    std::shared_ptr<cl::Context> context;
+    std::shared_ptr<cl::CommandQueue> queue;
+    std::vector<cl::Event> events;
+    cl_int err;
+
+    openclPreconditioner(int verbosity_) :
+    Preconditioner<Scalar, block_size>(verbosity_)
+    {};
+
+public:
+    virtual ~openclPreconditioner() = default;
+
+    static std::unique_ptr<openclPreconditioner<Scalar, block_size>> create(PreconditionerType type, int verbosity, bool opencl_ilu_parallel);
+
+    // nested Preconditioners might need to override this
+    virtual void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue);
+
+    // apply preconditioner, x = prec(y)
+    virtual void apply(const cl::Buffer& y, cl::Buffer& x) = 0;
+ 
+    // create/update preconditioner, probably used every linear solve
+    // the version with two params can be overloaded, if not, it will default to using the one param version
+    virtual bool create_preconditioner(BlockedMatrix<Scalar> *mat) = 0;
+    virtual bool create_preconditioner(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat) = 0;
+};
+} //namespace Opm
+
+#endif
--- a/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp
@ -71,16 +71,13 @@ openclSolverBackend(int verbosity_,
        OPM_THROW(std::logic_error, "Error unknown value for argument --linear-solver, " + linsolver);
    }

-    using PreconditionerType = Preconditioner<Scalar,block_size>;
+    using PreconditionerType = typename Opm::Accelerator::PreconditionerType;
    if (use_cpr) {
-        prec = PreconditionerType::create(PreconditionerType::Type::CPR,
-                                          opencl_ilu_parallel, verbosity);
+        prec = openclPreconditioner<Scalar, block_size>::create(PreconditionerType::CPR, verbosity, opencl_ilu_parallel);
    } else if (use_isai) {
-        prec = PreconditionerType::create(PreconditionerType::Type::BISAI,
-                                          opencl_ilu_parallel, verbosity);
+        prec = openclPreconditioner<Scalar, block_size>::create(PreconditionerType::BISAI, verbosity, opencl_ilu_parallel);
    } else {
-        prec = PreconditionerType::create(PreconditionerType::Type::BILU0,
-                                          opencl_ilu_parallel, verbosity);
+        prec = openclPreconditioner<Scalar, block_size>::create(PreconditionerType::BILU0, verbosity, opencl_ilu_parallel);
    }

    std::ostringstream out;
@ -228,8 +225,10 @@ openclSolverBackend(int verbosity_,

 template<class Scalar, unsigned int block_size>
 openclSolverBackend<Scalar,block_size>::
-openclSolverBackend(int verbosity_, int maxit_,
-                    Scalar tolerance_, bool opencl_ilu_parallel_)
+openclSolverBackend(int verbosity_,
+                    int maxit_,
+                    Scalar tolerance_,
+                    bool opencl_ilu_parallel_)
    : Base(verbosity_, maxit_, tolerance_)
    , opencl_ilu_parallel(opencl_ilu_parallel_)
 {
@ -248,7 +247,8 @@ setOpencl(std::shared_ptr<cl::Context>& context_,

 template<class Scalar, unsigned int block_size>
 void openclSolverBackend<Scalar,block_size>::
-gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res)
+gpu_pbicgstab(WellContributions<Scalar>& wellContribs,
+              BdaResult& res)
 {
    float it;
    Scalar rho, rhop, beta, alpha, omega, tmp1, tmp2;
--- a/opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp
@ -25,7 +25,7 @@
 #include <opm/simulators/linalg/bda/BdaSolver.hpp>
 #include <opm/simulators/linalg/bda/WellContributions.hpp>

-#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
+#include <opm/simulators/linalg/bda/opencl/openclPreconditioner.hpp>

 namespace Opm::Accelerator {

@ -60,7 +60,7 @@ private:

    bool useJacMatrix = false;

-    std::unique_ptr<Preconditioner<Scalar,block_size>> prec;
+    std::unique_ptr<openclPreconditioner<Scalar,block_size>> prec;
                                                                  // can perform blocked ILU0 and AMG on pressure component
    bool is_root;                                                 // allow for nested solvers, the root solver is called by BdaBridge
    bool analysis_done = false;
--- a/opm/simulators/linalg/bda/rocm/hipKernels.cpp
+++ b/opm/simulators/linalg/bda/rocm/hipKernels.cpp
@ -0,0 +1,493 @@
+/*
+  Copyright 2024 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+#include <cmath>
+#include <sstream>
+
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+#include <dune/common/timer.hh>
+
+#include <opm/simulators/linalg/bda/rocm/hipKernels.hpp>
+
+#include <opm/simulators/linalg/bda/Misc.hpp>
+#include <hip/hip_runtime.h>
+
+namespace Opm {
+
+using Opm::OpmLog;
+using Dune::Timer;
+
+// define static variables and kernels
+template<class Scalar> int  HipKernels<Scalar>::verbosity = 0;
+template<class Scalar> bool HipKernels<Scalar>::initialized = false;
+
+#ifdef __HIP__
+/// HIP kernel to multiply vector with another vector and a scalar, element-wise
+// add result to a third vector
+template<class Scalar>
+__global__ void vmul_k(const Scalar alpha,
+                       Scalar const *in1,
+                       Scalar const *in2,
+                       Scalar *out,
+                       const int N)
+{
+    unsigned int NUM_THREADS = gridDim.x;
+    int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    while(idx < N){
+        out[idx] += alpha * in1[idx] * in2[idx];
+        idx += NUM_THREADS;
+    }
+}
+
+/// HIP kernel to transform blocked vector to scalar vector using pressure-weights
+// every workitem handles one blockrow
+template<class Scalar>
+__global__ void full_to_pressure_restriction_k(const Scalar *fine_y,
+                                               const Scalar *weights,
+                                               Scalar *coarse_y,
+                                               const unsigned int Nb)
+{
+    const unsigned int NUM_THREADS = gridDim.x;
+    const unsigned int block_size = 3;
+    unsigned int target_block_row = blockDim.x * blockIdx.x + threadIdx.x;
+
+    while(target_block_row < Nb){
+        Scalar sum = 0.0;
+        unsigned int idx = block_size * target_block_row;
+        for (unsigned int i = 0; i < block_size; ++i) {
+            sum += fine_y[idx + i] * weights[idx + i];
+        }
+        coarse_y[target_block_row] = sum;
+        target_block_row += NUM_THREADS;
+    }
+}
+
+/// HIP kernel to add the coarse pressure solution back to the finer, complete solution
+// every workitem handles one blockrow
+template<class Scalar>
+__global__ void add_coarse_pressure_correction_k(const Scalar *coarse_x,
+                                                 Scalar *fine_x,
+                                                 const unsigned int pressure_idx,
+                                                 const unsigned int Nb)
+{
+    const unsigned int NUM_THREADS = gridDim.x;
+    const unsigned int block_size = 3;
+    unsigned int target_block_row = blockDim.x * blockIdx.x + threadIdx.x;
+
+    while(target_block_row < Nb){
+        fine_x[target_block_row * block_size + pressure_idx] += coarse_x[target_block_row];
+        target_block_row += NUM_THREADS;
+    }
+}
+
+/// HIP kernel to prolongate vector during amg cycle
+// every workitem handles one row
+template<class Scalar>
+__global__ void prolongate_vector_k(const Scalar *in,
+                                    Scalar *out,
+                                    const int *cols,
+                                    const unsigned int N)
+{
+    const unsigned int NUM_THREADS = gridDim.x;
+    unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
+
+    while(row < N){
+        out[row] += in[cols[row]];
+        row += NUM_THREADS;
+    }
+}
+
+// res = rhs - mat * x
+// algorithm based on:
+// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
+// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
+template<class Scalar>
+__global__ void residual_blocked_k(const Scalar *vals,
+                                   const int *cols,
+                                   const int *rows,
+                                   const int Nb,
+                                   const Scalar *x,
+                                   const Scalar *rhs,
+                                   Scalar *out,
+                                   const unsigned int block_size)
+{
+    extern __shared__ Scalar tmp[];
+    const unsigned int warpsize = warpSize;
+    const unsigned int bsize = blockDim.x;
+    const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x;
+    const unsigned int idx_b = gid / bsize;
+    const unsigned int idx_t = threadIdx.x;
+    unsigned int idx = idx_b * bsize + idx_t;
+    const unsigned int bs = block_size;
+    const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
+    const unsigned int num_blocks_per_warp = warpsize/bs/bs;
+    const unsigned int NUM_THREADS = gridDim.x;
+    const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
+    unsigned int target_block_row = idx / warpsize;
+    const unsigned int lane = idx_t % warpsize;
+    const unsigned int c = (lane / bs) % bs;
+    const unsigned int r = lane % bs;
+    
+    // for 3x3 blocks:
+    // num_active_threads: 27 (CUDA) vs 63 (ROCM)
+    // num_blocks_per_warp: 3 (CUDA) vs  7 (ROCM)
+    int offsetTarget = warpsize == 64 ? 48 : 32; 
+    
+    while(target_block_row < Nb){
+        unsigned int first_block = rows[target_block_row];
+        unsigned int last_block = rows[target_block_row+1];
+        unsigned int block = first_block + lane / (bs*bs);
+        Scalar local_out = 0.0;
+
+        if(lane < num_active_threads){
+            for(; block < last_block; block += num_blocks_per_warp){
+                Scalar x_elem = x[cols[block]*bs + c];
+                Scalar A_elem = vals[block*bs*bs + c + r*bs];
+                local_out += x_elem * A_elem;
+            }
+        }
+
+        // do reduction in shared mem
+        tmp[lane] = local_out; 
+        
+        for(unsigned int offset = 3; offset <= offsetTarget; offset <<= 1)
+        {
+            if (lane + offset < warpsize)
+            {
+                tmp[lane] += tmp[lane + offset];
+            }
+            __syncthreads();
+        }
+        
+        if(lane < bs){
+            unsigned int row = target_block_row*bs + lane;
+            out[row] = rhs[row] - tmp[lane];
+        }
+        target_block_row += num_warps_in_grid;
+    }
+}
+
+// KERNEL residual_k    
+// res = rhs - mat * x
+// algorithm based on:
+// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
+// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
+// template <unsigned shared_mem_size>
+template<class Scalar>
+__global__ void residual_k(const Scalar *vals,
+                           const int *cols,
+                           const int *rows,
+                           const int N,
+                           const Scalar *x,
+                           const Scalar *rhs,
+                           Scalar *out)
+{
+    extern __shared__ Scalar tmp[];
+    const unsigned int bsize = blockDim.x;
+    const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x;
+    const unsigned int idx_b = gid / bsize;
+    const unsigned int idx_t = threadIdx.x;
+    const unsigned int num_workgroups = gridDim.x;
+
+    int row = idx_b;
+
+    while (row < N) {
+        int rowStart = rows[row];
+        int rowEnd = rows[row+1];
+        int rowLength = rowEnd - rowStart;
+        Scalar local_sum = 0.0;
+        for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
+            int col = cols[j];
+            local_sum += vals[j] * x[col];
+        }
+
+        tmp[idx_t] = local_sum;
+        __syncthreads();
+
+        int offset = bsize / 2;
+        while(offset > 0) {
+            if (idx_t < offset) {
+                tmp[idx_t] += tmp[idx_t + offset];
+            }
+            __syncthreads();
+            offset = offset / 2;
+        }
+
+        if (idx_t == 0) {
+            out[row] = rhs[row] - tmp[idx_t];
+        }
+
+        row += num_workgroups;
+    }
+}
+
+template<class Scalar>
+__global__ void spmv_k(const Scalar *vals,
+                       const int *cols,
+                       const int *rows,
+                       const int N,
+                       const Scalar *x,
+                       Scalar *out)
+{
+    extern __shared__ Scalar tmp[];
+    const unsigned int bsize = blockDim.x;
+    const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x;
+    const unsigned int idx_b = gid / bsize; 
+    const unsigned int idx_t = threadIdx.x;
+    const unsigned int num_workgroups = gridDim.x;
+
+    int row = idx_b;
+
+    while (row < N) {
+        int rowStart = rows[row];
+        int rowEnd = rows[row+1];
+        int rowLength = rowEnd - rowStart;
+        Scalar local_sum = 0.0;
+        for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
+            int col = cols[j];
+            local_sum += vals[j] * x[col];
+        }
+
+        tmp[idx_t] = local_sum;
+        __syncthreads();
+
+        int offset = bsize / 2;
+        while(offset > 0) {
+            if (idx_t < offset) {
+                tmp[idx_t] += tmp[idx_t + offset];
+            }
+            __syncthreads();
+            offset = offset / 2;
+        }
+
+        if (idx_t == 0) {
+            out[row] = tmp[idx_t];
+        }
+
+        row += num_workgroups;
+    }
+}
+#endif
+
+template<class Scalar>
+void HipKernels<Scalar>::
+init(int verbosity_)
+{
+    if (initialized) {
+        OpmLog::debug("Warning HipKernels is already initialized");
+        return;
+    }
+
+    verbosity = verbosity_;
+
+    initialized = true;
+}
+
+template<class Scalar>
+void HipKernels<Scalar>::
+vmul(const Scalar alpha,
+     Scalar* in1,
+     Scalar* in2,
+     Scalar* out,
+     int N,
+     hipStream_t stream)
+{
+    Timer t_vmul;
+#ifdef __HIP__
+    unsigned blockDim = 64;
+    unsigned blocks = Accelerator::ceilDivision(N, blockDim);
+    unsigned gridDim = blocks * blockDim;
+
+    // dim3(N) will create a vector {N, 1, 1}
+    vmul_k<<<dim3(gridDim), dim3(blockDim), 0, stream>>>(alpha, in1, in2, out, N);
+    HIP_CHECK(hipStreamSynchronize(stream));
+#else
+    OPM_THROW(std::logic_error, "Error vmul for rocsparse only supported when compiling with hipcc");
+#endif
+
+    if (verbosity >= 4) {
+        std::ostringstream oss;
+        oss << std::scientific << "HipKernels vmul() time: " << t_vmul.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
+template<class Scalar>
+void HipKernels<Scalar>::
+full_to_pressure_restriction(const Scalar* fine_y,
+                             Scalar* weights,
+                             Scalar* coarse_y,
+                             int Nb,
+                             hipStream_t stream)
+{
+    Timer t;
+#ifdef __HIP__
+    unsigned blockDim = 64;
+    unsigned blocks = Accelerator::ceilDivision(Nb, blockDim);
+    unsigned gridDim = blocks * blockDim;
+
+    // dim3(N) will create a vector {N, 1, 1}
+    full_to_pressure_restriction_k<<<dim3(gridDim), dim3(blockDim), 0, stream>>>(fine_y, weights, coarse_y, Nb);
+    HIP_CHECK(hipStreamSynchronize(stream));
+#else
+    OPM_THROW(std::logic_error, "Error full_to_pressure_restriction for rocsparse only supported when compiling with hipcc");
+#endif
+    
+    if (verbosity >= 4) {
+        std::ostringstream oss;
+        oss << std::scientific << "HipKernels full_to_pressure_restriction() time: " << t.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
+template<class Scalar>
+void HipKernels<Scalar>::
+add_coarse_pressure_correction(Scalar* coarse_x,
+                               Scalar* fine_x,
+                               int pressure_idx,
+                               int Nb,
+                               hipStream_t stream)
+{
+    Timer t;
+#ifdef __HIP__
+    unsigned blockDim = 64;
+    unsigned blocks = Accelerator::ceilDivision(Nb, blockDim);
+    unsigned gridDim = blocks * blockDim;
+
+    // dim3(N) will create a vector {N, 1, 1}
+    add_coarse_pressure_correction_k<<<dim3(gridDim), dim3(blockDim), 0, stream>>>(coarse_x, fine_x, pressure_idx, Nb);
+    HIP_CHECK(hipStreamSynchronize(stream));
+#else
+    OPM_THROW(std::logic_error, "Error add_coarse_pressure_correction for rocsparse only supported when compiling with hipcc");
+#endif
+
+    if (verbosity >= 4) {
+        std::ostringstream oss;
+        oss << std::scientific << "HipKernels add_coarse_pressure_correction() time: " << t.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
+template<class Scalar>
+void HipKernels<Scalar>::
+prolongate_vector(const Scalar* in,
+                  Scalar* out,
+                  const int* cols,
+                  int N,
+                  hipStream_t stream)
+{
+    Timer t;
+    
+#ifdef __HIP__
+    unsigned blockDim = 64;
+    unsigned blocks = Accelerator::ceilDivision(N, blockDim);
+    unsigned gridDim = blocks * blockDim;
+    unsigned shared_mem_size = blockDim * sizeof(Scalar); 
+
+    // dim3(N) will create a vector {N, 1, 1}
+    prolongate_vector_k<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(in, out, cols, N);
+    HIP_CHECK(hipStreamSynchronize(stream));
+#else
+    OPM_THROW(std::logic_error, "Error prolongate_vector for rocsparse only supported when compiling with hipcc");
+#endif
+
+    if (verbosity >= 4) {
+        std::ostringstream oss;
+        oss << std::scientific << "HipKernels prolongate_vector() time: " << t.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
+template<class Scalar>
+void HipKernels<Scalar>::
+residual(Scalar* vals,
+         int* cols,
+         int* rows,
+         Scalar* x,
+         const Scalar* rhs,
+         Scalar* out,
+         int Nb,
+         unsigned int block_size,
+         hipStream_t stream)
+{
+    Timer t_residual;
+
+#ifdef __HIP__
+    unsigned blockDim = 64;
+    const unsigned int num_work_groups = Accelerator::ceilDivision(Nb, blockDim);
+    unsigned gridDim = num_work_groups * blockDim;
+    unsigned shared_mem_size = blockDim * sizeof(Scalar); 
+
+    if (block_size > 1) {
+        // dim3(N) will create a vector {N, 1, 1}
+        residual_blocked_k<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(vals, cols, rows, Nb, x, rhs, out, block_size);
+    } else {
+        residual_k<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(vals, cols, rows, Nb, x, rhs, out);
+    }
+    HIP_CHECK(hipStreamSynchronize(stream));
+#else
+    OPM_THROW(std::logic_error, "Error residual for rocsparse only supported when compiling with hipcc");
+#endif
+    
+    if (verbosity >= 4) {
+        HIP_CHECK(hipStreamSynchronize(stream));
+        std::ostringstream oss;
+        oss << std::scientific << "HipKernels residual() time: " << t_residual.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
+template<class Scalar>
+void HipKernels<Scalar>::
+spmv(Scalar* vals,
+     int* cols,
+     int* rows,
+     Scalar* x,
+     Scalar* y,
+     int Nb,
+     unsigned int block_size,
+     hipStream_t stream)
+{//NOTE: block_size not used since I use this kernel only for block sizes 1, other uses use rocsparse!
+    Timer t_spmv;
+#ifdef __HIP__
+    unsigned blockDim = 64;
+    const unsigned int num_work_groups = Accelerator::ceilDivision(Nb, blockDim);
+    unsigned gridDim = num_work_groups * blockDim;
+    unsigned shared_mem_size = blockDim * sizeof(Scalar); 
+
+   spmv_k<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(vals, cols, rows, Nb, x, y);
+    
+    HIP_CHECK(hipStreamSynchronize(stream));
+#else
+    OPM_THROW(std::logic_error, "Error spmv for rocsparse only supported when compiling with hipcc");
+#endif
+    
+    if (verbosity >= 4) {
+        std::ostringstream oss;
+        oss << std::scientific << "HipKernels spmv_blocked() time: " << t_spmv.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
+template class HipKernels<double>;
+
+} // namespace Opm
--- a/opm/simulators/linalg/bda/rocm/hipKernels.hpp
+++ b/opm/simulators/linalg/bda/rocm/hipKernels.hpp
@ -0,0 +1,138 @@
+/*
+  Copyright 2024 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef OPM_HIPKERNELS_HPP
+#define OPM_HIPKERNELS_HPP
+
+#include <string>
+#include <memory>
+#include <cstddef>
+
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_version.h>
+
+namespace Opm {
+
+template<class Scalar>
+class HipKernels
+{
+private:
+    static int  verbosity;
+    static bool initialized;
+    
+    HipKernels();
+    
+public:
+    /// Initialize verbosity level for the HIP kernels
+    /// \param[in] verbosity   verbosity level
+    static void init(int verbosity);
+    
+    /// Transform blocked vector to scalar vector using pressure-weights, where every workitem handles one blockrow
+    /// \param[in]  fine_y     Input y vector
+    /// \param[in]  weights    Weights used to combine cells
+    /// \param[out] course_y   Output y vector 
+    /// \param[in]  Nb         Number of blocks in the original matrix
+    /// \param[in]  stream     Hip stream to use for the computations
+    static void full_to_pressure_restriction(const Scalar* fine_y,
+                                             Scalar* weights,
+                                             Scalar* coarse_y,
+                                             int Nb,
+                                             hipStream_t stream);
+
+    /// Add the coarse pressure solution back to the finer, complete solution; every workitem handles one blockrow
+    /// \param[in]  coarse_x     Input scalar x vector
+    /// \param[out] fine_x       Output blocked x vector 
+    /// \param[in]  pressure_idx Pressure index
+    /// \param[in]  Nb           Number of blocks in the original matrix
+    /// \param[in]  stream       Hip stream to use for the computations
+    static void add_coarse_pressure_correction(Scalar* coarse_x,
+                                               Scalar* fine_x,
+                                               int pressure_idx,
+                                               int Nb,
+                                               hipStream_t stream);
+
+
+    /// Function to multiply vector with another vector and a scalar, element-wise and add the result to a third vector (out = alpha * in1 + in2) 
+    /// \param[in]  alpha   Input scalar 
+    /// \param[in]  in1     First input vector 
+    /// \param[in]  in2     Second input vector
+    /// \param[out] out     Output vector
+    /// \param[in]  N       Size of the vector
+    /// \param[in]  stream  Hip stream to use for the computations
+    static void vmul(const Scalar alpha,
+                     Scalar* in1,
+                     Scalar* in2,
+                     Scalar* out,
+                     int N,
+                     hipStream_t stream);
+    
+    /// Function to prolongate vector during amg cycle, every workitem handles one row
+    /// \param[in]  in      Input fine-grained vector
+    /// \param[out] out     Output course-graned vector 
+    /// \param[in]  cols    Column indexes
+    /// \param[in]  N       Size of the vector
+    /// \param[in]  stream  Hip stream to use for the computations
+    static void prolongate_vector(const Scalar* in,
+                                  Scalar* out,
+                                  const int* cols,
+                                  int N,
+                                  hipStream_t stream);
+
+    /// Function to perform res = rhs - mat * x
+    /// \param[in]  vals       Matrix values
+    /// \param[in]  cols       Column indexes
+    /// \param[in]  rows       Row pointers
+    /// \param[in]  x          X vector
+    /// \param[in]  rhs        Rhs vector
+    /// \param[out] out        Output res vector
+    /// \param[in]  Nb         Number of non-zero blocks in the original matrix
+    /// \param[in]  block_size Block size
+    /// \param[in]  stream     Hip stream to use for the computations
+    static void residual(Scalar* vals,
+                         int* cols,
+                         int* rows,
+                         Scalar* x,
+                         const Scalar* rhs,
+                         Scalar* out,
+                         int Nb,
+                         unsigned int block_size,
+                         hipStream_t stream);
+
+    /// Function to perform sparse matrix vector multipliation
+    /// \param[in]  vals       Matrix values
+    /// \param[in]  cols       Column indexes
+    /// \param[in]  rows       Row pointers
+    /// \param[in]  x          Input x vector
+    /// \param[out] y          Output y vector
+    /// \param[in]  Nb         Number of non-zero blocks in the original matrix
+    /// \param[in]  block_size Block size
+    /// \param[in]  stream     Hip stream to use for the computations
+    static void spmv(Scalar* vals,
+                     int* cols,
+                     int* rows,
+                     Scalar* x,
+                     Scalar* y,
+                     int Nb,
+                     unsigned int block_size,
+                     hipStream_t stream);
+};
+
+} // namespace Opm
+
+#endif
--- a/opm/simulators/linalg/bda/rocm/rocalutionSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/rocm/rocalutionSolverBackend.cpp
@ -37,7 +37,7 @@

 #undef HAVE_CUDA

-#include <opm/simulators/linalg/bda/rocalutionSolverBackend.hpp>
+#include <opm/simulators/linalg/bda/rocm/rocalutionSolverBackend.hpp>

 #include <rocalution.hpp>
 #include <base/matrix_formats_ind.hpp> // check if blocks are interpreted as row-major or column-major
--- a/opm/simulators/linalg/bda/rocm/rocalutionSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/rocm/rocalutionSolverBackend.hpp
--- a/opm/simulators/linalg/bda/rocm/rocsparseBILU0.cpp
+++ b/opm/simulators/linalg/bda/rocm/rocsparseBILU0.cpp
@ -0,0 +1,286 @@
+/*
+  Copyright 2024 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+#include <algorithm>
+
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+#include <dune/common/timer.hh>
+
+#include <opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp> 
+#include <opm/simulators/linalg/bda/Reorder.hpp>
+#include <opm/simulators/linalg/bda/Misc.hpp>
+
+#include <sstream>
+
+#include <thread>
+extern std::shared_ptr<std::thread> copyThread;
+   
+#if HAVE_OPENMP
+#include <omp.h>
+#endif //HAVE_OPENMP
+
+namespace Opm::Accelerator {
+
+using Opm::OpmLog;
+using Dune::Timer;
+
+template <class Scalar, unsigned int block_size>
+rocsparseBILU0<Scalar, block_size>::
+rocsparseBILU0(int verbosity_) : 
+    Base(verbosity_)
+{
+}
+
+template <class Scalar, unsigned int block_size>
+bool rocsparseBILU0<Scalar, block_size>::
+initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+           std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
+           rocsparse_int *d_Arows,
+           rocsparse_int *d_Acols) 
+{ 
+    this->Nb = matrix->Nb;
+    this->N = Nb * block_size;
+    this->nnzb = matrix->nnzbs;
+    this->nnz = nnzb * block_size * block_size;
+    this->nnzbs_prec = this->nnzb;
+
+    if (jacMatrix) {
+        this->useJacMatrix = true;
+        this->nnzbs_prec = jacMatrix->nnzbs;
+        this->jacMat = jacMatrix;
+    }
+
+    HIP_CHECK(hipMalloc((void**)&d_t, sizeof(Scalar) * this->N));
+
+    if (this->useJacMatrix) {
+        HIP_CHECK(hipMalloc((void**)&d_Mrows, sizeof(rocsparse_int) * (Nb + 1)));
+        HIP_CHECK(hipMalloc((void**)&d_Mcols, sizeof(rocsparse_int) * this->nnzbs_prec));
+        HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(Scalar) * this->nnzbs_prec * block_size * block_size));
+    } else { // preconditioner matrix is same
+        HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(Scalar) * this->nnzbs_prec * block_size * block_size));
+        d_Mcols = d_Acols;
+        d_Mrows = d_Arows;
+    }
+    
+    return true;
+} // end initialize()
+
+template <class Scalar, unsigned int block_size>
+bool rocsparseBILU0<Scalar, block_size>::
+analyze_matrix(BlockedMatrix<Scalar> *mat) {
+    return analyze_matrix(mat, &(*this->jacMat));
+}
+
+
+template <class Scalar, unsigned int block_size>
+bool rocsparseBILU0<Scalar, block_size>::
+analyze_matrix(BlockedMatrix<Scalar> *mat, 
+               BlockedMatrix<Scalar> *jacMat) 
+{
+    std::size_t d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
+    Timer t;
+    
+    ROCSPARSE_CHECK(rocsparse_create_mat_info(&ilu_info));
+#if HIP_VERSION >= 50400000
+    ROCSPARSE_CHECK(rocsparse_create_mat_info(&spmv_info));
+#endif
+
+    ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_M));
+
+    ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_L));
+    ROCSPARSE_CHECK(rocsparse_set_mat_fill_mode(descr_L, rocsparse_fill_mode_lower));
+    ROCSPARSE_CHECK(rocsparse_set_mat_diag_type(descr_L, rocsparse_diag_type_unit));
+
+    ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_U));
+    ROCSPARSE_CHECK(rocsparse_set_mat_fill_mode(descr_U, rocsparse_fill_mode_upper));
+    ROCSPARSE_CHECK(rocsparse_set_mat_diag_type(descr_U, rocsparse_diag_type_non_unit));
+    
+    ROCSPARSE_CHECK(rocsparse_dbsrilu0_buffer_size(this->handle, this->dir, Nb, this->nnzbs_prec, descr_M, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_M));
+    
+    ROCSPARSE_CHECK(rocsparse_dbsrsv_buffer_size(this->handle, this->dir, this->operation, Nb, this->nnzbs_prec,
+                               descr_L, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_L));
+
+    ROCSPARSE_CHECK(rocsparse_dbsrsv_buffer_size(this->handle, this->dir, this->operation, Nb, this->nnzbs_prec,
+                               descr_U, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_U));
+
+    d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
+
+    HIP_CHECK(hipMalloc((void**)&d_buffer, d_bufferSize));
+
+    // analysis of ilu LU decomposition
+    ROCSPARSE_CHECK(rocsparse_dbsrilu0_analysis(this->handle, this->dir, \
+                               Nb, this->nnzbs_prec, descr_M, d_Mvals, d_Mrows, d_Mcols, \
+                               block_size, ilu_info, rocsparse_analysis_policy_reuse, rocsparse_solve_policy_auto, d_buffer));
+
+    int zero_position = 0;
+    rocsparse_status status = rocsparse_bsrilu0_zero_pivot(this->handle, ilu_info, &zero_position);
+    if (rocsparse_status_success != status) {
+        printf("L has structural and/or numerical zero at L(%d,%d)\n", zero_position, zero_position);
+        return false;
+    }
+
+    // analysis of ilu apply
+    ROCSPARSE_CHECK(rocsparse_dbsrsv_analysis(this->handle, this->dir, this->operation, \
+                             Nb, this->nnzbs_prec, descr_L, d_Mvals, d_Mrows, d_Mcols, \
+                             block_size, ilu_info, rocsparse_analysis_policy_reuse, rocsparse_solve_policy_auto, d_buffer));
+    ROCSPARSE_CHECK(rocsparse_dbsrsv_analysis(this->handle, this->dir, this->operation, \
+                             Nb, this->nnzbs_prec, descr_U, d_Mvals, d_Mrows, d_Mcols, \
+                             block_size, ilu_info, rocsparse_analysis_policy_reuse, rocsparse_solve_policy_auto, d_buffer));
+
+    if (verbosity >= 3) {
+    	HIP_CHECK(hipStreamSynchronize(this->stream));
+        std::ostringstream out;
+        out << "rocsparseBILU0::analyze_matrix(): " << t.stop() << " s";
+        OpmLog::info(out.str());
+    }
+
+    return true;
+}
+
+template <class Scalar, unsigned int block_size>
+bool rocsparseBILU0<Scalar, block_size>::
+create_preconditioner(BlockedMatrix<Scalar> *mat) {
+    return create_preconditioner(mat, &*this->jacMat);
+}
+
+template <class Scalar, unsigned int block_size>
+bool rocsparseBILU0<Scalar, block_size>::
+create_preconditioner(BlockedMatrix<Scalar> *mat, 
+                      BlockedMatrix<Scalar> *jacMat) 
+{
+    Timer t;
+    bool result = true;
+    
+    ROCSPARSE_CHECK(rocsparse_dbsrilu0(this->handle, this->dir, Nb, this->nnzbs_prec, descr_M, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, rocsparse_solve_policy_auto, d_buffer));
+
+    // Check for zero pivot
+    int zero_position = 0;
+    rocsparse_status status = rocsparse_bsrilu0_zero_pivot(this->handle, ilu_info, &zero_position);
+    if(rocsparse_status_success != status)
+    {
+        printf("L has structural and/or numerical zero at L(%d,%d)\n", zero_position, zero_position);
+        return false;
+    }
+
+    if (verbosity >= 3) {
+        HIP_CHECK(hipStreamSynchronize(this->stream));
+        std::ostringstream out;
+        out << "rocsparseBILU0::create_preconditioner(): " << t.stop() << " s";
+        OpmLog::info(out.str());
+    }
+    return result;
+} // end create_preconditioner()
+
+template <class Scalar, unsigned int block_size>
+void rocsparseBILU0<Scalar, block_size>::
+copy_system_to_gpu(Scalar *d_Avals) {
+    Timer t;
+    bool use_multithreading = true;
+
+#if HAVE_OPENMP
+    if (omp_get_max_threads() == 1)
+        use_multithreading = false;
+#endif
+
+    if (this->useJacMatrix) {
+        if (use_multithreading) {
+            copyThread->join();
+        }
+
+        HIP_CHECK(hipMemcpyAsync(d_Mrows, this->jacMat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, this->stream));
+        HIP_CHECK(hipMemcpyAsync(d_Mcols, this->jacMat->colIndices, sizeof(rocsparse_int) * this->nnzbs_prec, hipMemcpyHostToDevice, this->stream));
+        HIP_CHECK(hipMemcpyAsync(d_Mvals, this->jacMat->nnzValues, sizeof(Scalar) * this->nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, this->stream));
+    } else {
+        HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(Scalar) * nnz, hipMemcpyDeviceToDevice, this->stream));
+    }
+
+    if (verbosity >= 3) {
+        HIP_CHECK(hipStreamSynchronize(this->stream));
+        std::ostringstream out;
+        out << "rocsparseBILU0::copy_system_to_gpu(): " << t.stop() << " s";
+        OpmLog::info(out.str());
+    }
+} // end copy_system_to_gpu()
+
+// don't copy rowpointers and colindices, they stay the same
+template <class Scalar, unsigned int block_size>
+void rocsparseBILU0<Scalar, block_size>::
+update_system_on_gpu(Scalar *d_Avals) {
+    Timer t;
+    bool use_multithreading = true;
+
+#if HAVE_OPENMP
+    if (omp_get_max_threads() == 1)
+        use_multithreading = false;
+#endif
+
+    if (this->useJacMatrix) {
+        if (use_multithreading) {
+           copyThread->join();
+        }
+
+        HIP_CHECK(hipMemcpyAsync(d_Mvals, this->jacMat->nnzValues, sizeof(Scalar) * this->nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, this->stream));
+    } else {
+        HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(Scalar) * nnz, hipMemcpyDeviceToDevice, this->stream));
+    }
+
+    if (verbosity >= 3) {
+        HIP_CHECK(hipStreamSynchronize(this->stream));
+        std::ostringstream out;
+        out << "rocsparseSolver::update_system_on_gpu(): " << t.stop() << " s";
+        OpmLog::info(out.str());
+    }
+} // end update_system_on_gpu()
+
+template <class Scalar, unsigned int block_size>
+void rocsparseBILU0<Scalar, block_size>::
+apply(Scalar& y, Scalar& x) {
+    Scalar zero = 0.0;
+    Scalar one  = 1.0;
+
+    Timer t_apply;
+
+    ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(this->handle, this->dir, \
+                              this->operation, Nb, this->nnzbs_prec, &one, \
+                              descr_L, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &y, d_t, rocsparse_solve_policy_auto, d_buffer));
+
+    ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(this->handle, this->dir, \
+                              this->operation, Nb, this->nnzbs_prec, &one, \
+                              descr_U, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, d_t, &x, rocsparse_solve_policy_auto, d_buffer));
+        
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        HIP_CHECK(hipStreamSynchronize(this->stream));
+        out << "rocsparseBILU0 apply: " << t_apply.stop() << " s";
+        OpmLog::info(out.str());
+    }
+}
+
+#define INSTANCE_TYPE(T)                \
+    template class rocsparseBILU0<T,1>; \
+    template class rocsparseBILU0<T,2>; \
+    template class rocsparseBILU0<T,3>; \
+    template class rocsparseBILU0<T,4>; \
+    template class rocsparseBILU0<T,5>; \
+    template class rocsparseBILU0<T,6>;
+
+INSTANCE_TYPE(double)
+} // namespace Opm
--- a/opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp
+++ b/opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp
@ -0,0 +1,125 @@
+/*
+  Copyright 2024 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef OPM_ROCSPARSEBILU0_HPP
+#define OPM_ROCSPARSEBILU0_HPP
+
+#include <mutex>
+#include <vector>
+
+#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
+
+#include <opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp>
+
+#include <rocblas/rocblas.h>
+#include <rocsparse/rocsparse.h>
+
+#include <hip/hip_version.h>
+
+namespace Opm::Accelerator {
+
+/// This class implements a Blocked ILU0 preconditioner
+/// The decomposition is done on GPU, using exact decomposition, or ChowPatel decomposition
+/// The preconditioner is applied via two exact triangular solves
+template <class Scalar, unsigned int block_size>
+class rocsparseBILU0 : public rocsparsePreconditioner<Scalar, block_size>
+{
+    typedef rocsparsePreconditioner<Scalar, block_size> Base;
+
+    using Base::N;
+    using Base::Nb;
+    using Base::nnz;
+    using Base::nnzb;
+    using Base::verbosity;
+
+private:
+   
+    rocsparse_mat_descr descr_M, descr_L, descr_U;
+    rocsparse_mat_info ilu_info;
+#if HIP_VERSION >= 50400000
+    rocsparse_mat_info spmv_info;
+#endif
+
+    rocsparse_int *d_Mrows, *d_Mcols; 
+    Scalar *d_Mvals, *d_t;
+    void *d_buffer; // buffer space, used by rocsparse ilu0 analysis
+    
+public:
+
+    rocsparseBILU0(int verbosity_);
+    
+    /// Initialize GPU and allocate memory
+    /// \param[in] matrix     matrix A
+    /// \param[in] jacMatrix  matrix for preconditioner
+    bool initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                    std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
+                    rocsparse_int *d_Arows,
+                    rocsparse_int *d_Acols);
+
+    /// Analysis, extract parallelism if specified
+    /// \param[in] mat     matrix A
+    bool analyze_matrix(BlockedMatrix<Scalar> *mat);
+    
+    /// Analysis, extract parallelism if specified
+    /// \param[in] mat     matrix A
+    /// \param[in] jacMat  matrix for preconditioner, analyze this as well
+    bool analyze_matrix(BlockedMatrix<Scalar> *mat,
+                        BlockedMatrix<Scalar> *jacMat);
+
+    /// ILU decomposition
+    /// \param[in] mat     matrix A to decompose
+    bool create_preconditioner(BlockedMatrix<Scalar> *mat) override;
+    
+    /// ILU decomposition
+    /// \param[in] mat     matrix A
+    /// \param[in] jacMat  matrix for preconditioner, decompose this one if used
+    bool create_preconditioner(BlockedMatrix<Scalar> *mat,
+                               BlockedMatrix<Scalar> *jacMat) override;
+
+    /// Apply preconditioner, x = prec(y)
+    /// via Lz = y
+    /// and Ux = z
+    /// \param[in]  y  Input y vector
+    /// \param[out] x  Output x vector
+    void apply(Scalar& y,
+               Scalar& x) override;
+
+#if HAVE_OPENCL
+    // apply preconditioner, x = prec(y)
+    void apply(const cl::Buffer& y,
+               cl::Buffer& x) {}
+#endif
+
+    /// Copy matrix A values to GPU
+    /// \param[in]  mVals  Input values
+    void copy_system_to_gpu(Scalar *mVals);
+
+    /// Reassign pointers, in case the addresses of the Dune variables have changed --> TODO: check when/if we need this method
+//     /// \param[in]  vals  New values
+//     /// \param[in] b     New b vector
+//     void update_system(Scalar *vals, Scalar *b);
+    
+    /// Update GPU values after a new assembly is done
+    /// \param[in] b     New b vector
+    void update_system_on_gpu(Scalar *b);
+};
+} // namespace Opm
+
+#endif
+
--- a/opm/simulators/linalg/bda/rocm/rocsparseCPR.cpp
+++ b/opm/simulators/linalg/bda/rocm/rocsparseCPR.cpp
@ -0,0 +1,338 @@
+/*
+  Copyright 2024 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+#include <opm/common/TimingMacros.hpp>
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+#include <dune/common/timer.hh>
+
+#include <dune/common/shared_ptr.hh>
+
+#include <opm/simulators/linalg/PreconditionerFactory.hpp>
+#include <opm/simulators/linalg/PropertyTree.hpp>
+
+#include <opm/simulators/linalg/bda/BdaBridge.hpp>
+#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
+#include <opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp>
+#include <opm/simulators/linalg/bda/rocm/hipKernels.hpp>
+
+#include <opm/simulators/linalg/bda/Misc.hpp>
+
+namespace Opm::Accelerator {
+
+using Opm::OpmLog;
+using Dune::Timer;
+
+template <class Scalar, unsigned int block_size>
+rocsparseCPR<Scalar, block_size>::rocsparseCPR(int verbosity_) :
+    rocsparsePreconditioner<Scalar, block_size>(verbosity_)
+{
+    bilu0 = std::make_unique<rocsparseBILU0<Scalar, block_size> >(verbosity_);
+}
+
+template <class Scalar, unsigned int block_size>
+bool rocsparseCPR<Scalar, block_size>::
+initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+           std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
+           rocsparse_int *d_Arows,
+           rocsparse_int *d_Acols)
+{
+    this->Nb = matrix->Nb;
+    this->nnzb = matrix->nnzbs;
+    this->N = Nb * block_size;
+    this->nnz = nnzb * block_size * block_size;
+
+    bilu0->set_context(this->handle, this->dir, this->operation, this->stream);
+    bilu0->setJacMat(*this->jacMat.get());
+    bilu0->initialize(matrix,jacMatrix,d_Arows,d_Acols);
+    return true;
+}
+
+template <class Scalar, unsigned int block_size>
+void rocsparseCPR<Scalar, block_size>::
+copy_system_to_gpu(Scalar *b) {
+    bilu0->copy_system_to_gpu(b);
+}
+
+template <class Scalar, unsigned int block_size>
+void rocsparseCPR<Scalar, block_size>::
+update_system_on_gpu(Scalar *vals) {
+    bilu0->update_system_on_gpu(vals);
+}
+
+template <class Scalar, unsigned int block_size>
+bool rocsparseCPR<Scalar, block_size>::
+analyze_matrix(BlockedMatrix<Scalar> *mat_) {
+    this->Nb = mat_->Nb;
+    this->nnzb = mat_->nnzbs;
+    this->N = Nb * block_size;
+    this->nnz = nnzb * block_size * block_size;
+    
+    bool success = bilu0->analyze_matrix(mat_);
+    
+    this->mat = mat_;  
+    
+    return success;
+}
+
+template <class Scalar, unsigned int block_size>
+bool rocsparseCPR<Scalar, block_size>::
+analyze_matrix(BlockedMatrix<Scalar> *mat_,
+               BlockedMatrix<Scalar> *jacMat)
+{
+    this->Nb = mat_->Nb;
+    this->nnzb = mat_->nnzbs;
+    this->N = Nb * block_size;
+    this->nnz = nnzb * block_size * block_size;
+
+    bool success = bilu0->analyze_matrix(mat_, jacMat);
+    this->mat = mat_;
+
+    return success;
+}
+
+template <class Scalar, unsigned int block_size>
+bool rocsparseCPR<Scalar, block_size>::
+create_preconditioner(BlockedMatrix<Scalar> *mat_,
+                      BlockedMatrix<Scalar> *jacMat)
+{
+    Dune::Timer t_bilu0;
+    bool result = bilu0->create_preconditioner(mat_, jacMat);
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "rocsparseCPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
+        OpmLog::info(out.str());
+    }
+    
+    Dune::Timer t_amg;
+    this->create_preconditioner_amg(this->mat);
+
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "rocsparseCPR create_preconditioner_amg(): " << t_amg.stop() << " s";
+        OpmLog::info(out.str());
+    }
+    
+    auto init_func = std::bind(&rocsparseCPR::init_rocm_buffers, this);
+    std::call_once(rocm_buffers_allocated, init_func);
+
+    // upload matrices and vectors to GPU
+    rocm_upload();
+
+    return result;
+}
+
+template <class Scalar, unsigned int block_size>
+bool rocsparseCPR<Scalar, block_size>::
+create_preconditioner(BlockedMatrix<Scalar> *mat_) {
+    Dune::Timer t_bilu0;
+    bool result = bilu0->create_preconditioner(mat_);
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "rocsparseCPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
+        OpmLog::info(out.str());
+    }
+
+    Dune::Timer t_amg;
+    this->create_preconditioner_amg(this->mat); 
+    
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "rocsparseCPR create_preconditioner_amg(): " << t_amg.stop() << " s";
+        OpmLog::info(out.str());
+    }
+    
+    auto init_func = std::bind(&rocsparseCPR::init_rocm_buffers, this);
+    std::call_once(rocm_buffers_allocated, init_func);
+
+    // upload matrices and vectors to GPU
+    rocm_upload();
+    
+    return result;
+}
+
+template <class Scalar, unsigned int block_size>
+void rocsparseCPR<Scalar, block_size>::
+init_rocm_buffers() {
+    d_Amatrices.reserve(this->num_levels);
+    d_Rmatrices.reserve(this->num_levels - 1);
+    d_invDiags.reserve(this->num_levels - 1);
+    for (Matrix<Scalar>& m : this->Amatrices) {
+        d_Amatrices.emplace_back(m.N, m.M, m.nnzs, 1);
+    }
+    
+    for (Matrix<Scalar>& m : this->Rmatrices) {
+        d_Rmatrices.emplace_back(m.N, m.M, m.nnzs, 1);
+        d_f.emplace_back(m.N);
+        d_u.emplace_back(m.N);
+        d_PcolIndices.emplace_back(m.M);
+        d_invDiags.emplace_back(m.M);
+        d_t.emplace_back(m.M);
+    }
+    
+    d_weights.emplace_back(this->N);
+    d_rs.emplace_back(this->N);
+    d_mat = std::make_unique<RocmMatrix<Scalar>>(this->Nb, this->Nb, this->nnzb, block_size);
+    
+    d_coarse_y.emplace_back(this->Nb);
+    d_coarse_x.emplace_back(this->Nb);
+}
+
+template <class Scalar, unsigned int block_size>
+void rocsparseCPR<Scalar, block_size>::
+rocm_upload() {
+     d_mat->upload(this->mat, this->stream);
+     
+     HIP_CHECK(hipMemcpyAsync(d_weights.data()->nnzValues, this->weights.data(), sizeof(Scalar) * this->N, hipMemcpyHostToDevice, this->stream));
+    
+    for (unsigned int i = 0; i < this->Rmatrices.size(); ++i) {
+        d_Amatrices[i].upload(&this->Amatrices[i], this->stream);
+        
+        HIP_CHECK(hipMemcpyAsync(d_invDiags[i].nnzValues, this->invDiags[i].data(), sizeof(Scalar) * this->Amatrices[i].N, hipMemcpyHostToDevice, this->stream));
+        HIP_CHECK(hipMemcpyAsync(d_PcolIndices[i].nnzValues, this->PcolIndices[i].data(), sizeof(int) * this->Amatrices[i].N, hipMemcpyHostToDevice, this->stream));
+    }
+    
+    for (unsigned int i = 0; i < this->Rmatrices.size(); ++i) {
+        d_Rmatrices[i].upload(&this->Rmatrices[i], this->stream);
+    }
+}
+
+template <class Scalar, unsigned int block_size>
+void rocsparseCPR<Scalar, block_size>::
+amg_cycle_gpu(const int level,
+              Scalar &y,
+              Scalar &x)
+{
+    RocmMatrix<Scalar> *A = &d_Amatrices[level];
+    RocmMatrix<Scalar> *R = &d_Rmatrices[level];
+    int Ncur = A->Nb;
+    Scalar zero = 0.0;
+    Scalar one = 1.0;
+    
+    rocsparse_mat_info spmv_info;
+    rocsparse_mat_descr descr_R;
+    ROCSPARSE_CHECK(rocsparse_create_mat_info(&spmv_info));
+    ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_R));
+
+    if (level == this->num_levels - 1) {
+        // solve coarsest level
+        std::vector<Scalar> h_y(Ncur), h_x(Ncur, 0);
+
+        HIP_CHECK(hipMemcpyAsync(h_y.data(), &y, sizeof(Scalar) * Ncur, hipMemcpyDeviceToHost, this->stream));
+        
+        // solve coarsest level using umfpack
+        this->umfpack.apply(h_x.data(), h_y.data());
+
+        HIP_CHECK(hipMemcpyAsync(&x, h_x.data(), sizeof(Scalar) * Ncur, hipMemcpyHostToDevice, this->stream));
+        
+        return;
+    }
+    
+    int Nnext = d_Amatrices[level+1].Nb;
+
+    RocmVector<Scalar>& t = d_t[level];
+    RocmVector<Scalar>& f = d_f[level];
+    RocmVector<Scalar>& u = d_u[level]; // u was 0-initialized earlier
+
+    // presmooth
+    Scalar jacobi_damping = 0.65; // default value in amgcl: 0.72
+    for (unsigned i = 0; i < this->num_pre_smooth_steps; ++i){
+        HipKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
+        HipKernels<Scalar>::vmul(jacobi_damping, d_invDiags[level].nnzValues, t.nnzValues, &x, Ncur, this->stream);
+    }
+
+    // move to coarser level
+    HipKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
+
+// TODO: understand why rocsparse spmv library function does not here. 
+//     ROCSPARSE_CHECK(rocsparse_dbsrmv(this->handle, this->dir, this->operation,
+//                                          R->Nb, R->Mb, R->nnzbs, &one, descr_R,
+//                                          R->nnzValues, R->rowPointers, R->colIndices, 1,
+//                                          t.nnzValues, &zero, f.nnzValues));
+    HipKernels<Scalar>::spmv(R->nnzValues, R->colIndices, R->rowPointers, t.nnzValues, f.nnzValues, Nnext, 1, this->stream);
+
+    amg_cycle_gpu(level + 1, *f.nnzValues, *u.nnzValues);
+    HipKernels<Scalar>::prolongate_vector(u.nnzValues, &x, d_PcolIndices[level].nnzValues, Ncur, this->stream);
+
+    // postsmooth
+    for (unsigned i = 0; i < this->num_post_smooth_steps; ++i){
+        HipKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
+        HipKernels<Scalar>::vmul(jacobi_damping, d_invDiags[level].nnzValues, t.nnzValues, &x, Ncur, this->stream);
+    }
+}
+
+// x = prec(y)
+template <class Scalar, unsigned int block_size>
+void rocsparseCPR<Scalar, block_size>::
+apply_amg(const Scalar& y, 
+          Scalar& x)
+{
+    HIP_CHECK(hipMemsetAsync(d_coarse_x.data()->nnzValues, 0, sizeof(Scalar) * this->Nb, this->stream));
+    
+    for (unsigned int i = 0; i < d_u.size(); ++i) {
+        d_u[i].upload(this->Rmatrices[i].nnzValues.data(), this->stream);
+    }
+    
+    HipKernels<Scalar>::residual(d_mat->nnzValues, d_mat->colIndices, d_mat->rowPointers, &x, &y, d_rs.data()->nnzValues, this->Nb, block_size, this->stream);
+    
+    HipKernels<Scalar>::full_to_pressure_restriction(d_rs.data()->nnzValues, d_weights.data()->nnzValues, d_coarse_y.data()->nnzValues, Nb, this->stream);
+    
+    amg_cycle_gpu(0, *(d_coarse_y.data()->nnzValues), *(d_coarse_x.data()->nnzValues));
+
+    HipKernels<Scalar>::add_coarse_pressure_correction(d_coarse_x.data()->nnzValues, &x, this->pressure_idx, Nb, this->stream);
+}
+
+template <class Scalar, unsigned int block_size>
+void rocsparseCPR<Scalar, block_size>::
+apply(Scalar& y,
+      Scalar& x)
+{
+    Dune::Timer t_bilu0;
+
+    bilu0->apply(y, x);
+
+    if (verbosity >= 3) {
+        HIP_CHECK(hipStreamSynchronize(this->stream));
+        std::ostringstream out;
+        out << "rocsparseCPR apply bilu0(): " << t_bilu0.stop() << " s";
+        OpmLog::info(out.str());
+    }
+
+    Dune::Timer t_amg;
+    apply_amg(y, x);
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "rocsparseCPR apply amg(): " << t_amg.stop() << " s";
+        OpmLog::info(out.str());
+    }
+}
+
+#define INSTANCE_TYPE(T)                \
+    template class rocsparseCPR<T,1>; \
+    template class rocsparseCPR<T,2>; \
+    template class rocsparseCPR<T,3>; \
+    template class rocsparseCPR<T,4>; \
+    template class rocsparseCPR<T,5>; \
+    template class rocsparseCPR<T,6>;
+
+INSTANCE_TYPE(double)
+} // namespace Opm
+
+
--- a/opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp
+++ b/opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp
@ -0,0 +1,142 @@
+/*
+  Copyright 2024 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef OPM_ROCSPARSECPR_HPP
+#define OPM_ROCSPARSECPR_HPP
+
+#include <mutex>
+
+#include <opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp>
+#include <opm/simulators/linalg/bda/Matrix.hpp>
+#include <opm/simulators/linalg/bda/CprCreation.hpp>
+#include <opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp>
+#include <opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp>
+
+#include <opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.hpp>
+
+namespace Opm::Accelerator {
+
+template<class Scalar> class BlockedMatrix;
+
+/// This class implements a Constrained Pressure Residual (CPR) preconditioner
+template <class Scalar, unsigned int block_size>
+class rocsparseCPR : public rocsparsePreconditioner<Scalar, block_size>, public CprCreation<Scalar, block_size>
+{
+    typedef rocsparsePreconditioner<Scalar, block_size> Base;
+
+    using Base::N;
+    using Base::Nb;
+    using Base::nnz;
+    using Base::nnzb;
+    using Base::verbosity;
+
+private:
+    std::vector<RocmMatrix<Scalar>> d_Amatrices, d_Rmatrices; // scalar matrices that represent the AMG hierarchy
+    
+    std::vector<RocmVector<int>> d_PcolIndices; // prolongation does not need a full matrix, only store colIndices
+    std::vector<RocmVector<Scalar>> d_invDiags; // inverse of diagonal of Amatrices
+    std::vector<RocmVector<Scalar>> d_t, d_f; // intermediate vectors used during amg cycle
+    std::vector<RocmVector<Scalar>> d_u; // intermediate vectors used during amg cycle
+    std::vector<RocmVector<Scalar>> d_rs;      // use before extracting the pressure
+    std::vector<RocmVector<Scalar>> d_weights; // the quasiimpes weights, used to extract pressure
+    std::unique_ptr<RocmMatrix<Scalar>> d_mat;   // stores blocked matrix
+    std::vector<RocmVector<Scalar>> d_coarse_y, d_coarse_x; // stores the scalar vectors
+    std::once_flag rocm_buffers_allocated;  // only allocate OpenCL Buffers once
+
+    std::unique_ptr<rocsparseBILU0<Scalar, block_size> > bilu0;                    // Blocked ILU0 preconditioner
+
+    std::unique_ptr<rocsparseSolverBackend<Scalar, 1> > coarse_solver; // coarse solver is scalar
+
+    // Initialize and allocate matrices and vectors
+    void init_rocm_buffers();
+
+    // Copy matrices and vectors to GPU
+    void rocm_upload();
+
+    // apply pressure correction to vector
+    void apply_amg(const Scalar& y, Scalar& x);
+
+    // Apply the AMG preconditioner
+    void amg_cycle_gpu(const int level, Scalar &y, Scalar &x);
+    
+public:
+
+    rocsparseCPR(int verbosity);
+
+    /// Initialize GPU and allocate memory
+    /// \param[in] matrix     matrix A
+    /// \param[in] jacMatrix  matrix for preconditioner
+    bool initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                    std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
+                    rocsparse_int *d_Arows,
+                    rocsparse_int *d_Acols);
+    
+
+    /// Analysis, extract parallelism if specified
+    /// \param[in] mat     matrix A
+    bool analyze_matrix(BlockedMatrix<Scalar> *mat);
+    
+    /// Analysis, extract parallelism if specified
+    /// \param[in] mat     matrix A
+    /// \param[in] jacMat  matrix for preconditioner, analyze this as well
+    bool analyze_matrix(BlockedMatrix<Scalar> *mat,
+                        BlockedMatrix<Scalar> *jacMat);
+
+    /// Create AMG preconditioner and perform ILU decomposition
+    /// \param[in] mat     matrix A
+    bool create_preconditioner(BlockedMatrix<Scalar> *mat);
+    
+    /// Create AMG preconditioner and perform ILU decomposition
+    /// \param[in] mat     matrix A
+    /// \param[in] jacMat  matrix for preconditioner, decompose this one if used
+    bool create_preconditioner(BlockedMatrix<Scalar> *mat,
+                               BlockedMatrix<Scalar> *jacMat);
+    
+    /// Apply preconditioner, x = prec(y)
+    /// applies blocked ilu0
+    /// also applies amg for pressure component
+    /// \param[in]  y  Input y vector
+    /// \param[out] x  Output x vector
+    void apply(Scalar& y,
+               Scalar& x) override;
+    
+#if HAVE_OPENCL
+    // apply preconditioner, x = prec(y)
+    void apply(const cl::Buffer& y,
+               cl::Buffer& x) {}
+#endif
+    
+    /// Copy matrix A values to GPU
+    /// \param[in]  mVals  Input values
+    void copy_system_to_gpu(Scalar *b);
+
+    /// Reassign pointers, in case the addresses of the Dune variables have changed --> TODO: check when/if we need this method
+    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
+    /// \param[in] b              input vector b, contains N values
+//     void update_system(Scalar *vals, Scalar *b);
+
+    /// Update linear system to GPU
+    /// \param[in] b              input vector, contains N values
+    void update_system_on_gpu(Scalar *b);
+};
+
+} // namespace Opm
+
+#endif
+
--- a/opm/simulators/linalg/bda/rocm/rocsparseMatrix.cpp
+++ b/opm/simulators/linalg/bda/rocm/rocsparseMatrix.cpp
@ -0,0 +1,113 @@
+/*
+  Copyright 2024 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+
+#include <opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp>
+#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
+#include <opm/simulators/linalg/bda/Matrix.hpp>
+#include <opm/simulators/linalg/bda/Misc.hpp>
+
+#include <sstream>
+#include <iostream>
+
+namespace Opm::Accelerator {
+
+template<class Scalar>
+RocmMatrix<Scalar>::
+RocmMatrix(int Nb_, 
+           int Mb_,
+           int nnzbs_,
+           unsigned int block_size_)
+    : Nb(Nb_),
+      Mb(Mb_),
+      nnzbs(nnzbs_),
+      block_size(block_size_)
+{
+    HIP_CHECK(hipMalloc((void**)&nnzValues, sizeof(Scalar) * block_size * block_size * nnzbs));
+        
+    HIP_CHECK(hipMalloc((void**)&colIndices, sizeof(int) * nnzbs));
+
+    HIP_CHECK(hipMalloc((void**)&rowPointers, sizeof(int) * (Nb + 1)));
+}
+
+template <class Scalar>
+void RocmMatrix<Scalar>::
+upload(Scalar *vals,
+       int *cols,
+       int *rows,
+       hipStream_t stream)
+{
+    HIP_CHECK(hipMemcpyAsync(nnzValues, vals, sizeof(Scalar) * block_size * block_size * nnzbs, hipMemcpyHostToDevice, stream));
+    
+    HIP_CHECK(hipMemcpyAsync(colIndices, cols, sizeof(int) * nnzbs, hipMemcpyHostToDevice, stream));
+    
+    HIP_CHECK(hipMemcpyAsync(rowPointers, rows, sizeof(int) * (Nb + 1), hipMemcpyHostToDevice, stream));
+}
+
+template <class Scalar>
+void RocmMatrix<Scalar>::
+upload(Matrix<Scalar> *matrix,
+       hipStream_t stream)
+{
+    if (block_size != 1) {
+        OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to RocmMatrix with different block_size");
+    }
+
+    upload(matrix->nnzValues.data(), matrix->colIndices.data(), matrix->rowPointers.data(), stream);
+}
+
+template <class Scalar>
+void RocmMatrix<Scalar>::
+upload(BlockedMatrix<Scalar> *matrix,
+       hipStream_t stream)
+{
+    if (matrix->block_size != block_size) {
+        OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to RocmMatrix with different block_size");
+    }
+
+    upload(matrix->nnzValues, matrix->colIndices, matrix->rowPointers, stream);
+}
+
+template <class Scalar>
+RocmVector<Scalar>::RocmVector(int N)
+    : size(N)
+{
+    HIP_CHECK(hipMalloc((void**)&nnzValues, sizeof(Scalar) * N));
+}
+
+template <class Scalar>
+void RocmVector<Scalar>::
+upload(Scalar *vals,
+       hipStream_t stream) 
+{
+    HIP_CHECK(hipMemcpyAsync(nnzValues, vals, sizeof(Scalar) * size, hipMemcpyHostToDevice, stream));    
+}
+
+#define INSTANCE_TYPE(T)  \
+template class RocmVector<T>;\
+template class RocmMatrix<T>;
+
+INSTANCE_TYPE(int);
+INSTANCE_TYPE(double);
+
+} // namespace Opm
--- a/opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp
+++ b/opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp
@ -0,0 +1,73 @@
+/*
+  Copyright 2024 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef OPM_ROCMMATRIX_HEADER_INCLUDED
+#define OPM_ROCMMATRIX_HEADER_INCLUDED
+
+#include <hip/hip_runtime_api.h>
+
+namespace Opm::Accelerator {
+
+template<class Scalar> class Matrix;
+template<class Scalar> class BlockedMatrix;
+
+/// This struct resembles a csr matrix
+template<class Scalar>
+class RocmMatrix {
+public:
+
+    RocmMatrix(int Nb_, int Mb_, int nnzbs_, unsigned int block_size_);
+
+    void upload(Scalar *vals,
+                int *cols,
+                int *rows,
+                hipStream_t stream);
+    
+    void upload(Matrix<Scalar> *matrix,
+                hipStream_t stream);
+    
+    void upload(BlockedMatrix<Scalar> *matrix,
+                hipStream_t stream);
+
+    Scalar* nnzValues;
+    int* colIndices;
+    int* rowPointers;
+    int Nb, Mb;
+    int nnzbs;
+    unsigned int block_size;
+};
+
+template <typename Scalar>
+class RocmVector {
+public:
+
+    RocmVector(int N);
+
+    void upload(Scalar *vals,
+                hipStream_t stream);
+    
+    void upload(Matrix<Scalar> *matrix,
+                hipStream_t stream);
+
+    Scalar* nnzValues;
+    int size;
+};
+} // namespace Opm
+
+#endif
--- a/opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.cpp
+++ b/opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.cpp
@ -0,0 +1,86 @@
+/*
+  Copyright 2024 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+#include <memory>
+#include <opm/common/TimingMacros.hpp>
+#include <opm/common/ErrorMacros.hpp>
+
+#include <opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp>
+#include <opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp>
+#include <opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp>
+
+namespace Opm::Accelerator {
+
+template <class Scalar, unsigned int block_size>
+std::unique_ptr<rocsparsePreconditioner<Scalar,block_size> > rocsparsePreconditioner<Scalar,block_size>::
+create(PreconditionerType type,
+       int verbosity)
+{
+    switch (type ) {
+    case PreconditionerType::BILU0:
+        return std::make_unique<Opm::Accelerator::rocsparseBILU0<Scalar, block_size> >(verbosity);
+    case PreconditionerType::CPR:
+        return std::make_unique<Opm::Accelerator::rocsparseCPR<Scalar, block_size> >(verbosity);
+    }
+
+    OPM_THROW(std::logic_error,
+              "Invalid preconditioner type " + std::to_string(static_cast<int>(type)));   
+}
+
+template <class Scalar, unsigned int block_size>
+void rocsparsePreconditioner<Scalar, block_size>::
+set_matrix_analysis(rocsparse_mat_descr descr_L,
+                    rocsparse_mat_descr descr_U)
+{
+    descr_L = descr_L;
+    descr_U = descr_U;
+}
+
+template <class Scalar, unsigned int block_size>
+void rocsparsePreconditioner<Scalar, block_size>::
+set_context(rocsparse_handle handle,
+            rocsparse_direction dir,
+            rocsparse_operation operation,
+            hipStream_t stream)
+{
+    this->handle = handle;
+    this->dir = dir;
+    this->operation = operation;
+    this->stream = stream;
+}
+
+template <class Scalar, unsigned int block_size>
+void rocsparsePreconditioner<Scalar, block_size>::
+setJacMat(BlockedMatrix<Scalar> jacMat) {
+    this->jacMat = std::make_shared<BlockedMatrix<Scalar>>(jacMat);
+}
+
+#define INSTANTIATE_TYPE(T)                  \
+    template class rocsparsePreconditioner<T,1>; \
+    template class rocsparsePreconditioner<T,2>; \
+    template class rocsparsePreconditioner<T,3>; \
+    template class rocsparsePreconditioner<T,4>; \
+    template class rocsparsePreconditioner<T,5>; \
+    template class rocsparsePreconditioner<T,6>;
+
+INSTANTIATE_TYPE(double)
+
+} //namespace Opm
+
--- a/opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp
+++ b/opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp
@ -0,0 +1,91 @@
+/*
+  Copyright 2024 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef OPM_ROCSPARSEPRECONDITIONER_HEADER_INCLUDED
+#define OPM_ROCSPARSEPRECONDITIONER_HEADER_INCLUDED
+
+#include <opm/simulators/linalg/bda/Preconditioner.hpp>
+
+#include <rocsparse/rocsparse.h>
+
+namespace Opm::Accelerator {
+
+template<class Scalar> class BlockedMatrix;
+
+template <class Scalar, unsigned int block_size>
+class rocsparsePreconditioner : public Preconditioner<Scalar, block_size>
+{
+
+protected:
+    rocsparse_handle handle;
+    rocsparse_direction dir = rocsparse_direction_row;
+    rocsparse_operation operation = rocsparse_operation_none;
+    rocsparse_mat_descr descr_L, descr_U;
+    
+    hipStream_t stream;
+    
+    rocsparsePreconditioner(int verbosity_) :
+    Preconditioner<Scalar, block_size>(verbosity_)
+    {};
+            
+public:
+    
+    int nnzbs_prec = 0; // number of nnz blocks in preconditioner matrix M
+    bool useJacMatrix = false;
+    std::shared_ptr<BlockedMatrix<Scalar>> jacMat{}; // matrix for preconditioner
+    
+    virtual ~rocsparsePreconditioner() = default;
+
+    static std::unique_ptr<rocsparsePreconditioner<Scalar, block_size>> create(PreconditionerType type, 
+                                                                               int verbosity);
+
+    // apply preconditioner, x = prec(y)
+    virtual void apply(Scalar& y, Scalar& x) = 0;
+ 
+    // create/update preconditioner, probably used every linear solve
+    // the version with two params can be overloaded, if not, it will default to using the one param version
+    virtual bool create_preconditioner(BlockedMatrix<Scalar> *mat) = 0;
+    
+    virtual bool create_preconditioner(BlockedMatrix<Scalar> *mat,
+                                       BlockedMatrix<Scalar> *jacMat) = 0;
+    
+    virtual bool initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                            std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
+                            rocsparse_int *d_Arows,
+                            rocsparse_int *d_Acols) = 0;
+    
+    virtual void copy_system_to_gpu(Scalar *b)=0;
+
+    /// Update linear system to GPU
+    /// \param[in] b              input vector, contains N values
+    virtual void update_system_on_gpu(Scalar *b)=0;
+    
+    void set_matrix_analysis(rocsparse_mat_descr descr_L,
+                             rocsparse_mat_descr descr_U);
+    
+    void set_context(rocsparse_handle handle,
+                     rocsparse_direction dir,
+                     rocsparse_operation operation,
+                     hipStream_t stream);
+    
+    void setJacMat(BlockedMatrix<Scalar> jacMat);
+};
+} //namespace Opm
+
+#endif
--- a/opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.cpp
@ -36,63 +36,22 @@

 #undef HAVE_CUDA

-#include <opm/simulators/linalg/bda/rocsparseSolverBackend.hpp>
-#include <opm/simulators/linalg/bda/rocsparseWellContributions.hpp>
+#include <opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.hpp>
+#include <opm/simulators/linalg/bda/rocm/rocsparseWellContributions.hpp>

 #include <opm/simulators/linalg/bda/BdaResult.hpp>

-#include <hip/hip_runtime_api.h>
-#include <hip/hip_version.h>
+#include <opm/simulators/linalg/bda/Preconditioner.hpp>
+
+#include <opm/simulators/linalg/bda/Misc.hpp>

 #ifdef HIP_HAVE_CUDA_DEFINED
 #define HAVE_CUDA HIP_HAVE_CUDA_DEFINED
 #undef HIP_HAVE_CUDA_DEFINED
 #endif

-#define HIP_CHECK(STAT)                                  \
-    do {                                                 \
-        const hipError_t stat = (STAT);                  \
-        if(stat != hipSuccess)                           \
-        {                                                \
-            std::ostringstream oss;                      \
-            oss << "rocsparseSolverBackend::hip ";       \
-            oss << "error: " << hipGetErrorString(stat); \
-            OPM_THROW(std::logic_error, oss.str());      \
-        }                                                \
-    } while(0)
-
-#define ROCSPARSE_CHECK(STAT)                            \
-    do {                                                 \
-        const rocsparse_status stat = (STAT);            \
-        if(stat != rocsparse_status_success)             \
-        {                                                \
-            std::ostringstream oss;                      \
-            oss << "rocsparseSolverBackend::rocsparse "; \
-            oss << "error: " << stat;                    \
-            OPM_THROW(std::logic_error, oss.str());      \
-        }                                                \
-    } while(0)
-
-#define ROCBLAS_CHECK(STAT)                              \
-    do {                                                 \
-        const rocblas_status stat = (STAT);              \
-        if(stat != rocblas_status_success)               \
-        {                                                \
-            std::ostringstream oss;                      \
-            oss << "rocsparseSolverBackend::rocblas ";   \
-            oss << "error: " << stat;                    \
-            OPM_THROW(std::logic_error, oss.str());      \
-        }                                                \
-    } while(0)
-
 #include <cstddef>

-#if HAVE_OPENMP
-#include <thread>
-#include <omp.h>
-extern std::shared_ptr<std::thread> copyThread;
-#endif //HAVE_OPENMP
-
 namespace Opm::Accelerator {

 using Dune::Timer;
@ -100,10 +59,26 @@ using Dune::Timer;
 template<class Scalar, unsigned int block_size>
 rocsparseSolverBackend<Scalar,block_size>::
 rocsparseSolverBackend(int verbosity_, int maxit_, Scalar tolerance_,
-                       unsigned int platformID_, unsigned int deviceID_)
+                       unsigned int platformID_, unsigned int deviceID_, std::string linsolver)
    : Base(verbosity_, maxit_, tolerance_, platformID_, deviceID_)
 {
    int numDevices = 0;
+    bool use_cpr, use_isai;
+    
+    if (linsolver.compare("ilu0") == 0) {
+        use_cpr = false;
+        use_isai = false;
+    } else if (linsolver.compare("cpr_quasiimpes") == 0) {
+        use_cpr = true; 
+        use_isai = false;
+    } else if (linsolver.compare("isai") == 0) {
+        OPM_THROW(std::logic_error, "Error rocsparseSolver does not support --linerar-solver=isai");
+    } else if (linsolver.compare("cpr_trueimpes") == 0) {
+        OPM_THROW(std::logic_error, "Error rocsparseSolver does not support --linerar-solver=cpr_trueimpes");
+    } else {
+        OPM_THROW(std::logic_error, "Error unknown value for argument --linear-solver, " + linsolver);
+    }
+    
    HIP_CHECK(hipGetDeviceCount(&numDevices));
    if (static_cast<int>(deviceID) >= numDevices) {
        OPM_THROW(std::runtime_error, "Invalid HIP device ID");
@ -124,6 +99,15 @@ rocsparseSolverBackend(int verbosity_, int maxit_, Scalar tolerance_,
    HIP_CHECK(hipStreamCreate(&stream));
    ROCSPARSE_CHECK(rocsparse_set_stream(handle, stream));
    ROCBLAS_CHECK(rocblas_set_stream(blas_handle, stream));
+    
+    using PreconditionerType = typename Opm::Accelerator::PreconditionerType;
+    if (use_cpr) {
+        prec = rocsparsePreconditioner<Scalar, block_size>::create(PreconditionerType::CPR, verbosity);
+    } else {
+        prec = rocsparsePreconditioner<Scalar, block_size>::create(PreconditionerType::BILU0, verbosity);
+    }
+    
+    prec->set_context(handle, dir, operation, stream);
 }

 template<class Scalar, unsigned int block_size>
@ -170,17 +154,17 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<Scalar>& wellContribs,
 // HIP_VERSION is defined as (HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH)
 #if HIP_VERSION >= 60000000
    ROCSPARSE_CHECK(rocsparse_dbsrmv(handle, dir, operation,
-                                     Nb, Nb, nnzb, &one, descr_M,
+                                     Nb, Nb, nnzb, &one, descr_A,
                                     d_Avals, d_Arows, d_Acols, block_size,
                                     spmv_info, d_x, &zero, d_r));
 #elif HIP_VERSION >= 50400000
    ROCSPARSE_CHECK(rocsparse_dbsrmv_ex(handle, dir, operation,
-                                        Nb, Nb, nnzb, &one, descr_M,
+                                        Nb, Nb, nnzb, &one, descr_A,
                                        d_Avals, d_Arows, d_Acols, block_size,
                                        spmv_info, d_x, &zero, d_r));
 #else
    ROCSPARSE_CHECK(rocsparse_dbsrmv(handle, dir, operation,
-                                        Nb, Nb, nnzb, &one, descr_M,
+                                        Nb, Nb, nnzb, &one, descr_A,
                                        d_Avals, d_Arows, d_Acols, block_size,
                                        d_x, &zero, d_r));
 #endif
@ -216,13 +200,9 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<Scalar>& wellContribs,
            t_prec.start();
        }

-        // apply ilu0
-        ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(handle, dir, \
-                              operation, Nb, nnzbs_prec, &one, \
-                              descr_L, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, d_p, d_t, rocsparse_solve_policy_auto, d_buffer));
-        ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(handle, dir, \
-                              operation, Nb, nnzbs_prec, &one, \
-                              descr_U, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, d_t, d_pw, rocsparse_solve_policy_auto, d_buffer));
+        // apply ilu0 
+        prec->apply(*d_p, *d_pw);
+        
        if (verbosity >= 3) {
            HIP_CHECK(hipStreamSynchronize(stream));
            t_prec.stop();
@ -232,17 +212,17 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<Scalar>& wellContribs,
        // spmv
 #if HIP_VERSION >= 60000000
        ROCSPARSE_CHECK(rocsparse_dbsrmv(handle, dir, operation,
-                                          Nb, Nb, nnzb, &one, descr_M,
+                                          Nb, Nb, nnzb, &one, descr_A,
                                          d_Avals, d_Arows, d_Acols, block_size,
                                          spmv_info, d_pw, &zero, d_v));
 #elif HIP_VERSION >= 50400000
        ROCSPARSE_CHECK(rocsparse_dbsrmv_ex(handle, dir, operation,
-                                            Nb, Nb, nnzb, &one, descr_M,
+                                            Nb, Nb, nnzb, &one, descr_A,
                                            d_Avals, d_Arows, d_Acols, block_size,
                                            spmv_info, d_pw, &zero, d_v));
 #else
        ROCSPARSE_CHECK(rocsparse_dbsrmv(handle, dir, operation,
-                                            Nb, Nb, nnzb, &one, descr_M,
+                                            Nb, Nb, nnzb, &one, descr_A,
                                            d_Avals, d_Arows, d_Acols, block_size,
                                            d_pw, &zero, d_v));
 #endif
@ -283,12 +263,9 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<Scalar>& wellContribs,
        if (verbosity >= 3) {
            t_prec.start();
        }
-        ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(handle, dir, \
-                              operation, Nb, nnzbs_prec, &one, \
-                              descr_L, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, d_r, d_t, rocsparse_solve_policy_auto, d_buffer));
-        ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(handle, dir, \
-                              operation, Nb, nnzbs_prec, &one, \
-                              descr_U, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, d_t, d_s, rocsparse_solve_policy_auto, d_buffer));
+        
+        prec->apply(*d_r, *d_s);
+       
        if (verbosity >= 3) {
            HIP_CHECK(hipStreamSynchronize(stream));
            t_prec.stop();
@ -298,17 +275,17 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<Scalar>& wellContribs,
        // spmv
 #if HIP_VERSION >= 60000000
        ROCSPARSE_CHECK(rocsparse_dbsrmv(handle, dir, operation,
-                                         Nb, Nb, nnzb, &one, descr_M,
+                                         Nb, Nb, nnzb, &one, descr_A,
                                         d_Avals, d_Arows, d_Acols, block_size,
                                         spmv_info, d_s, &zero, d_t));
 #elif HIP_VERSION >= 50400000
        ROCSPARSE_CHECK(rocsparse_dbsrmv_ex(handle, dir, operation,
-                                            Nb, Nb, nnzb, &one, descr_M,
+                                            Nb, Nb, nnzb, &one, descr_A,
                                            d_Avals, d_Arows, d_Acols, block_size,
                                            spmv_info, d_s, &zero, d_t));
 #else
        ROCSPARSE_CHECK(rocsparse_dbsrmv(handle, dir, operation,
-                                            Nb, Nb, nnzb, &one, descr_M,
+                                            Nb, Nb, nnzb, &one, descr_A,
                                            d_Avals, d_Arows, d_Acols, block_size,
                                            d_s, &zero, d_t));
 #endif
@ -384,14 +361,18 @@ initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
           std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
 {
    this->Nb = matrix->Nb;
-    this->N = Nb * block_size;
+    this->N = this->Nb * block_size;
    this->nnzb = matrix->nnzbs;
-    this->nnz = nnzb * block_size * block_size;
-    nnzbs_prec = nnzb;
+    this->nnz = this->nnzb * block_size * block_size;

    if (jacMatrix) {
-        useJacMatrix = true;
-        nnzbs_prec = jacMatrix->nnzbs;
+        prec->useJacMatrix = true;
+        prec->jacMat = jacMatrix;
+        prec->nnzbs_prec = jacMatrix->nnzbs;
+    } else {
+        prec->useJacMatrix = false;
+        prec->jacMat = matrix;
+        prec->nnzbs_prec = this->nnzb;
    }

    std::ostringstream out;
@ -408,7 +389,6 @@ initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
    out.clear();

    mat = matrix;
-    jacMat = jacMatrix;

    HIP_CHECK(hipMalloc((void**)&d_r, sizeof(Scalar) * N));
    HIP_CHECK(hipMalloc((void**)&d_rw, sizeof(Scalar) * N));
@ -424,15 +404,7 @@ initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
    HIP_CHECK(hipMalloc((void**)&d_x, sizeof(Scalar) * N));
    HIP_CHECK(hipMalloc((void**)&d_b, sizeof(Scalar) * N));

-    if (useJacMatrix) {
-        HIP_CHECK(hipMalloc((void**)&d_Mrows, sizeof(rocsparse_int) * (Nb + 1)));
-        HIP_CHECK(hipMalloc((void**)&d_Mcols, sizeof(rocsparse_int) * nnzbs_prec));
-        HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(Scalar) * nnzbs_prec * block_size * block_size));
-    } else { // preconditioner matrix is same
-        HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(Scalar) * nnzbs_prec * block_size * block_size));
-        d_Mcols = d_Acols;
-        d_Mrows = d_Arows;
-    }
+    prec->initialize(matrix, jacMatrix, d_Arows, d_Acols);//TODO: do we need all parameters?

    initialized = true;
 } // end initialize()
@ -453,28 +425,10 @@ copy_system_to_gpu(Scalar *b)
                             sizeof(Scalar) * nnz,
                             hipMemcpyHostToDevice, stream));
    HIP_CHECK(hipMemsetAsync(d_x, 0, N * sizeof(Scalar), stream));
-    HIP_CHECK(hipMemcpyAsync(d_b, b, N * sizeof(Scalar) * N,
+    HIP_CHECK(hipMemcpyAsync(d_b, b, N * sizeof(Scalar),
                             hipMemcpyHostToDevice, stream));
    
-    if (useJacMatrix) {
-#if HAVE_OPENMP
-        if (omp_get_max_threads() > 1) {
-           copyThread->join();
-        }
-#endif
-        HIP_CHECK(hipMemcpyAsync(d_Mrows, jacMat->rowPointers,
-                                 sizeof(rocsparse_int) * (Nb + 1),
-                                 hipMemcpyHostToDevice, stream));
-        HIP_CHECK(hipMemcpyAsync(d_Mcols, jacMat->colIndices,
-                                 sizeof(rocsparse_int) * nnzbs_prec,
-                                 hipMemcpyHostToDevice, stream));
-        HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues,
-                                 sizeof(Scalar) * nnzbs_prec * block_size * block_size,
-                                 hipMemcpyHostToDevice, stream));
-    } else {
-        HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals,
-                                 sizeof(Scalar) * nnz, hipMemcpyDeviceToDevice, stream));
-    }
+    prec->copy_system_to_gpu(d_Avals); 

    if (verbosity >= 3) {
        HIP_CHECK(hipStreamSynchronize(stream));
@ -500,19 +454,8 @@ update_system_on_gpu(Scalar* b)
    HIP_CHECK(hipMemcpyAsync(d_b, b, N* sizeof(Scalar),
                             hipMemcpyHostToDevice, stream));
    
-    if (useJacMatrix) {
-#if HAVE_OPENMP
-        if (omp_get_max_threads() > 1) {
-            copyThread->join();
-        }
-#endif
-        HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues,
-                                 sizeof(Scalar) * nnzbs_prec * block_size * block_size,
-                                 hipMemcpyHostToDevice, stream));
-    } else {
-        HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals,
-                                 sizeof(Scalar) * nnz, hipMemcpyDeviceToDevice, stream));
-    }
+    prec->update_system_on_gpu(d_Avals);
+    
    if (verbosity >= 3) {
        HIP_CHECK(hipStreamSynchronize(stream));

@ -528,58 +471,15 @@ template<class Scalar, unsigned int block_size>
 bool rocsparseSolverBackend<Scalar,block_size>::
 analyze_matrix()
 {
-    std::size_t d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
    Timer t;

    ROCSPARSE_CHECK(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host));

-    ROCSPARSE_CHECK(rocsparse_create_mat_info(&ilu_info));
 #if HIP_VERSION >= 50400000
    ROCSPARSE_CHECK(rocsparse_create_mat_info(&spmv_info));
 #endif

    ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_A));
-    ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_M));
-
-    ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_L));
-    ROCSPARSE_CHECK(rocsparse_set_mat_fill_mode(descr_L, rocsparse_fill_mode_lower));
-    ROCSPARSE_CHECK(rocsparse_set_mat_diag_type(descr_L, rocsparse_diag_type_unit));
-
-    ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_U));
-    ROCSPARSE_CHECK(rocsparse_set_mat_fill_mode(descr_U, rocsparse_fill_mode_upper));
-    ROCSPARSE_CHECK(rocsparse_set_mat_diag_type(descr_U, rocsparse_diag_type_non_unit));
-
-    ROCSPARSE_CHECK(rocsparse_dbsrilu0_buffer_size(handle, dir, Nb, nnzbs_prec,
-                                 descr_M, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_M));
-    ROCSPARSE_CHECK(rocsparse_dbsrsv_buffer_size(handle, dir, operation, Nb, nnzbs_prec,
-                               descr_L, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_L));
-    ROCSPARSE_CHECK(rocsparse_dbsrsv_buffer_size(handle, dir, operation, Nb, nnzbs_prec,
-                               descr_U, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_U));
-
-    d_bufferSize = std::max(d_bufferSize_M,
-                            std::max(d_bufferSize_L, d_bufferSize_U));
-
-    HIP_CHECK(hipMalloc((void**)&d_buffer, d_bufferSize));
-
-    // analysis of ilu LU decomposition
-    ROCSPARSE_CHECK(rocsparse_dbsrilu0_analysis(handle, dir, \
-                               Nb, nnzbs_prec, descr_M, d_Mvals, d_Mrows, d_Mcols, \
-                               block_size, ilu_info, rocsparse_analysis_policy_reuse, rocsparse_solve_policy_auto, d_buffer));
-
-    int zero_position = 0;
-    rocsparse_status status = rocsparse_bsrilu0_zero_pivot(handle, ilu_info, &zero_position);
-    if (rocsparse_status_success != status) {
-        printf("L has structural and/or numerical zero at L(%d,%d)\n", zero_position, zero_position);
-        return false;
-    }
-
-    // analysis of ilu apply
-    ROCSPARSE_CHECK(rocsparse_dbsrsv_analysis(handle, dir, operation, \
-                             Nb, nnzbs_prec, descr_L, d_Mvals, d_Mrows, d_Mcols, \
-                             block_size, ilu_info, rocsparse_analysis_policy_reuse, rocsparse_solve_policy_auto, d_buffer));
-    ROCSPARSE_CHECK(rocsparse_dbsrsv_analysis(handle, dir, operation, \
-                             Nb, nnzbs_prec, descr_U, d_Mvals, d_Mrows, d_Mcols, \
-                             block_size, ilu_info, rocsparse_analysis_policy_reuse, rocsparse_solve_policy_auto, d_buffer));

 #if HIP_VERSION >= 60000000
    ROCSPARSE_CHECK(rocsparse_dbsrmv_analysis(handle, dir, operation,
@ -593,6 +493,13 @@ analyze_matrix()
        block_size, spmv_info));
 #endif

+    if(!prec->analyze_matrix(&*mat)) {
+        std::ostringstream out;
+        out << "Warning: rocsparseSolver matrix analysis failed!";
+        OpmLog::info(out.str());
+        return false;
+    }
+    
    if (verbosity >= 3) {
        HIP_CHECK(hipStreamSynchronize(stream));
        std::ostringstream out;
@ -609,28 +516,7 @@ template<class Scalar, unsigned int block_size>
 bool rocsparseSolverBackend<Scalar,block_size>::
 create_preconditioner()
 {
-    Timer t;
-
-    bool result = true;
-    ROCSPARSE_CHECK(rocsparse_dbsrilu0(handle, dir, Nb, nnzbs_prec, descr_M,
-                    d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, rocsparse_solve_policy_auto, d_buffer));
-
-    // Check for zero pivot
-    int zero_position = 0;
-    rocsparse_status status = rocsparse_bsrilu0_zero_pivot(handle, ilu_info, &zero_position);
-    if(rocsparse_status_success != status)
-    {
-        printf("L has structural and/or numerical zero at L(%d,%d)\n", zero_position, zero_position);
-        return false;
-    }
-
-    if (verbosity >= 3) {
-        HIP_CHECK(hipStreamSynchronize(stream));
-        std::ostringstream out;
-        out << "rocsparseSolver::create_preconditioner(): " << t.stop() << " s";
-        OpmLog::info(out.str());
-    }
-    return result;
+    return prec->create_preconditioner(&*mat);
 } // end create_preconditioner()

 template<class Scalar, unsigned int block_size>
--- a/opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.hpp
@ -26,6 +26,8 @@
 #include <opm/simulators/linalg/bda/BdaSolver.hpp>
 #include <opm/simulators/linalg/bda/WellContributions.hpp>

+#include <opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp>
+
 #include <rocblas/rocblas.h>
 #include <rocsparse/rocsparse.h>

@ -57,29 +59,26 @@ private:

    bool analysis_done = false;
    std::shared_ptr<BlockedMatrix<Scalar>> mat{};                 // original matrix
-    std::shared_ptr<BlockedMatrix<Scalar>> jacMat{};              // matrix for preconditioner
-    int nnzbs_prec = 0;    // number of nnz blocks in preconditioner matrix M

    rocsparse_direction dir = rocsparse_direction_row;
    rocsparse_operation operation = rocsparse_operation_none;
    rocsparse_handle handle;
    rocblas_handle blas_handle;
-    rocsparse_mat_descr descr_A, descr_M, descr_L, descr_U;
-    rocsparse_mat_info ilu_info;
+    rocsparse_mat_descr descr_A;
 #if HIP_VERSION >= 50400000
    rocsparse_mat_info spmv_info;
 #endif
    hipStream_t stream;

-    rocsparse_int *d_Arows, *d_Mrows;
-    rocsparse_int *d_Acols, *d_Mcols;
-    Scalar *d_Avals, *d_Mvals;
-    Scalar *d_x, *d_b, *d_r, *d_rw, *d_p;     // vectors, used during linear solve
+    rocsparse_int *d_Arows, *d_Acols;
+    Scalar *d_Avals;
+    Scalar *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
    Scalar *d_pw, *d_s, *d_t, *d_v;
-    void *d_buffer; // buffer space, used by rocsparse ilu0 analysis
    int  ver;
    char rev[64];

+    std::unique_ptr<rocsparsePreconditioner<Scalar, block_size> > prec; // can perform blocked ILU0 and AMG on pressure component
+    
    /// Solve linear system using ilu0-bicgstab
    /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
    /// \param[inout] res         summary of solver result
@ -119,14 +118,16 @@ public:
    /// \param[in] tolerance                  required relative tolerance for rocsparseSolver
    /// \param[in] platformID                 the OpenCL platform to be used
    /// \param[in] deviceID                   the device to be used
+    /// \param[in] linsolver                  indicating the preconditioner, equal to the --linear-solver cmdline argument
    rocsparseSolverBackend(int linear_solver_verbosity,
                           int maxit,
                           Scalar tolerance,
                           unsigned int platformID,
-                           unsigned int deviceID);
+                           unsigned int deviceID,
+                           std::string linsolver);

    /// For the CPR coarse solver
-    // rocsparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder);
+    rocsparseSolverBackend(int linear_solver_verbosity, int maxit, Scalar tolerance, bool opencl_ilu_reorder);

    /// Destroy a openclSolver, and free memory
    ~rocsparseSolverBackend();
@ -144,10 +145,6 @@ public:
                              WellContributions<Scalar>& wellContribs,
                              BdaResult& res) override;

-    /// Solve scalar linear system, for example a coarse system of an AMG preconditioner
-    /// Data is already on the GPU
-    // SolverStatus solve_system(BdaResult &res);
-
    /// Get result after linear solve, and peform postprocessing if necessary
    /// \param[inout] x          resulting x vector, caller must guarantee that x points to a valid array
    void get_result(Scalar* x) override;
--- a/opm/simulators/linalg/bda/rocm/rocsparseWellContributions.cpp
+++ b/opm/simulators/linalg/bda/rocm/rocsparseWellContributions.cpp
@ -29,7 +29,7 @@

 #undef HAVE_CUDA

-#include <opm/simulators/linalg/bda/rocsparseWellContributions.hpp>
+#include <opm/simulators/linalg/bda/rocm/rocsparseWellContributions.hpp>

 #ifdef HIP_HAVE_CUDA_DEFINED
 #define HAVE_CUDA HIP_HAVE_CUDA_DEFINED
@ -40,17 +40,9 @@
 #include <opm/common/ErrorMacros.hpp>

 #include <opm/simulators/linalg/bda/MultisegmentWellContribution.hpp>
-
+#include <opm/simulators/linalg/bda/Misc.hpp>
 #include <hip/hip_runtime.h>

-#define HIP_CHECK(stat)                               \
-    {                                                 \
-        if(stat != hipSuccess)                        \
-        {                                             \
-            OPM_THROW(std::logic_error, "HIP error"); \
-        }                                             \
-    }
-
 namespace Opm
 {

--- a/opm/simulators/linalg/bda/rocm/rocsparseWellContributions.hpp
+++ b/opm/simulators/linalg/bda/rocm/rocsparseWellContributions.hpp
--- a/tests/test_csrToCscOffsetMap.cpp
+++ b/tests/test_csrToCscOffsetMap.cpp
@ -9,7 +9,7 @@
 #include <dune/istl/bcrsmatrix.hh>
 #include <dune/istl/matrixmarket.hh>

-#include <opm/simulators/linalg/bda/opencl/BISAI.hpp>
+#include <opm/simulators/linalg/bda/opencl/openclBISAI.hpp>

 BOOST_AUTO_TEST_CASE(testcsrtocscoffsetmap){

--- a/tests/test_solvetransposed3x3.cpp
+++ b/tests/test_solvetransposed3x3.cpp
@ -31,7 +31,8 @@

 #include <dune/istl/bcrsmatrix.hh>

-#include <opm/simulators/linalg/bda/opencl/CPR.hpp>
+#include <opm/simulators/linalg/bda/opencl/openclCPR.hpp>
+#include <opm/simulators/linalg/bda/Misc.hpp>

 BOOST_AUTO_TEST_CASE(testsolvetransposed3x3)
 {