Merge pull request #3672 from Tongdongq/add-cpr-opencl

Add CPR preconditioner to openclSolver
2025-02-25 18:55:30 -06:00 · 2021-12-08 19:44:59 +01:00
parent 5926f48b62 a5ed003418
commit a61f69bb79
35 changed files with 2014 additions and 541 deletions
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@@ -100,18 +100,21 @@ if(OPENCL_FOUND)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BILU0.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Reorder.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/ChowPatelIlu.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/CPR.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclKernels.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/OpenclMatrix.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/Preconditioner.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclWellContributions.cpp)
 endif()
 if(CUDA_FOUND OR OPENCL_FOUND OR HAVE_FPGA OR HAVE_AMGCL)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Matrix.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
 endif()
 if(HAVE_FPGA)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGAMatrix.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGABILU0.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGASolverBackend.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGAUtils.cpp)
@@ -145,6 +148,7 @@ list (APPEND TEST_SOURCE_FILES
  tests/test_timer.cpp
  tests/test_invert.cpp
  tests/test_stoppedwells.cpp
+  tests/test_solvetransposed3x3.cpp
  tests/test_relpermdiagnostics.cpp
  tests/test_norne_pvt.cpp
  tests/test_wellprodindexcalculator.cpp
@@ -247,10 +251,10 @@ list (APPEND PUBLIC_HEADER_FILES
  opm/simulators/linalg/bda/BdaSolver.hpp
  opm/simulators/linalg/bda/BILU0.hpp
  opm/simulators/linalg/bda/BlockedMatrix.hpp
+  opm/simulators/linalg/bda/CPR.hpp
  opm/simulators/linalg/bda/cuda_header.hpp
  opm/simulators/linalg/bda/cusparseSolverBackend.hpp
  opm/simulators/linalg/bda/ChowPatelIlu.hpp
-  opm/simulators/linalg/bda/FPGAMatrix.hpp
  opm/simulators/linalg/bda/FPGABILU0.hpp
  opm/simulators/linalg/bda/FPGASolverBackend.hpp
  opm/simulators/linalg/bda/FPGAUtils.hpp
@@ -258,8 +262,11 @@ list (APPEND PUBLIC_HEADER_FILES
  opm/simulators/linalg/bda/ILUReorder.hpp
  opm/simulators/linalg/bda/opencl.hpp
  opm/simulators/linalg/bda/openclKernels.hpp
+  opm/simulators/linalg/bda/OpenclMatrix.hpp
+  opm/simulators/linalg/bda/opencl/Preconditioner.hpp
  opm/simulators/linalg/bda/openclSolverBackend.hpp
  opm/simulators/linalg/bda/openclWellContributions.hpp
+  opm/simulators/linalg/bda/Matrix.hpp
  opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
  opm/simulators/linalg/bda/WellContributions.hpp
  opm/simulators/linalg/amgcpr.hh
--- a/opm/simulators/linalg/ISTLSolverEbos.hpp
+++ b/opm/simulators/linalg/ISTLSolverEbos.hpp
@@ -146,7 +146,8 @@ namespace Opm
                const std::string opencl_ilu_reorder = EWOMS_GET_PARAM(TypeTag, std::string, OpenclIluReorder);
                const int linear_solver_verbosity = parameters_.linear_solver_verbosity_;
                std::string fpga_bitstream = EWOMS_GET_PARAM(TypeTag, std::string, FpgaBitstream);
-                bdaBridge.reset(new BdaBridge<Matrix, Vector, block_size>(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder));
+                std::string linsolver = EWOMS_GET_PARAM(TypeTag, std::string, Linsolver);
+                bdaBridge.reset(new BdaBridge<Matrix, Vector, block_size>(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder, linsolver));
            }
 #else
            if (EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode) != "none") {
--- a/opm/simulators/linalg/PreconditionerFactory.hpp
+++ b/opm/simulators/linalg/PreconditionerFactory.hpp
@@ -122,7 +122,6 @@ public:
        instance().doAddCreator(type, creator);
    }

-private:
    using CriterionBase
        = Dune::Amg::AggregationCriterion<Dune::Amg::SymmetricDependency<Matrix, Dune::Amg::FirstDiagonal>>;
    using Criterion = Dune::Amg::CoarsenCriterion<CriterionBase>;
@@ -150,6 +149,7 @@ private:
        criterion.setMinAggregateSize(prm.get<int>("minaggsize", 4));
        return criterion;
    }
+private:

    /// Helper struct to explicitly overload amgSmootherArgs() version for
    /// ParallelOverlappingILU0, since in-class specialization is not allowed.
--- a/opm/simulators/linalg/bda/BILU0.cpp
+++ b/opm/simulators/linalg/bda/BILU0.cpp
@@ -40,279 +40,251 @@ using Dune::Timer;

 template <unsigned int block_size>
 BILU0<block_size>::BILU0(ILUReorder opencl_ilu_reorder_, int verbosity_) :
-    verbosity(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_)
+    Preconditioner<block_size>(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_)
 {
 #if CHOW_PATEL
    chowPatelIlu.setVerbosity(verbosity);
 #endif
 }

+
 template <unsigned int block_size>
-BILU0<block_size>::~BILU0()
+bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
 {
-    delete[] invDiagVals;
-}
+    const unsigned int bs = block_size;

-    template <unsigned int block_size>
-    bool BILU0<block_size>::init(BlockedMatrix<block_size> *mat)
-    {
-        const unsigned int bs = block_size;
+    this->N = mat->Nb * block_size;
+    this->Nb = mat->Nb;
+    this->nnz = mat->nnzbs * block_size * block_size;
+    this->nnzb = mat->nnzbs;

-        this->N = mat->Nb * block_size;
-        this->Nb = mat->Nb;
-        this->nnz = mat->nnzbs * block_size * block_size;
-        this->nnzbs = mat->nnzbs;
+    std::vector<int> CSCRowIndices;
+    std::vector<int> CSCColPointers;

-        int *CSCRowIndices = nullptr;
-        int *CSCColPointers = nullptr;
+    if (opencl_ilu_reorder == ILUReorder::NONE) {
+        LUmat = std::make_unique<BlockedMatrix>(*mat);
+    } else {
+        toOrder.resize(Nb);
+        fromOrder.resize(Nb);
+        CSCRowIndices.resize(nnzb);
+        CSCColPointers.resize(Nb + 1);
+        rmat = std::make_shared<BlockedMatrix>(mat->Nb, mat->nnzbs, block_size);
+        LUmat = std::make_unique<BlockedMatrix>(*rmat);

-        if (opencl_ilu_reorder == ILUReorder::NONE) {
-            LUmat = std::make_unique<BlockedMatrix<block_size> >(*mat);
-        } else {
-            toOrder.resize(Nb);
-            fromOrder.resize(Nb);
-            CSCRowIndices = new int[nnzbs];
-            CSCColPointers = new int[Nb + 1];
-            rmat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
-            LUmat = std::make_unique<BlockedMatrix<block_size> >(*rmat);
-
-            Timer t_convert;
-            csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb);
-            if(verbosity >= 3){
-                std::ostringstream out;
-                out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
-                OpmLog::info(out.str());
-            }
+        Timer t_convert;
+        csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb);
+        if (verbosity >= 3) {
+            std::ostringstream out;
+            out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
+            OpmLog::info(out.str());
        }
+    }

-        Timer t_analysis;
-        std::ostringstream out;
-        if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
-            out << "BILU0 reordering strategy: " << "level_scheduling\n";
-            findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
-        } else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
-            out << "BILU0 reordering strategy: " << "graph_coloring\n";
-            findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
-        } else if (opencl_ilu_reorder == ILUReorder::NONE) {
-            out << "BILU0 reordering strategy: none\n";
-            // numColors = 1;
-            // rowsPerColor.emplace_back(Nb);
-            numColors = Nb;
-            for(int i = 0; i < Nb; ++i){
-                rowsPerColor.emplace_back(1);
-            }
-        } else {
-            OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
-        }
-        if(verbosity >= 1){
-            out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n";
+    Timer t_analysis;
+    std::ostringstream out;
+    if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
+        out << "BILU0 reordering strategy: " << "level_scheduling\n";
+        findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
+    } else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
+        out << "BILU0 reordering strategy: " << "graph_coloring\n";
+        findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
+    } else if (opencl_ilu_reorder == ILUReorder::NONE) {
+        out << "BILU0 reordering strategy: none\n";
+        // numColors = 1;
+        // rowsPerColor.emplace_back(Nb);
+        numColors = Nb;
+        for (int i = 0; i < Nb; ++i) {
+            rowsPerColor.emplace_back(1);
        }
+    } else {
+        OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
+    }
+    if (verbosity >= 1) {
+        out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n";
+    }
 #if CHOW_PATEL
-        out << "BILU0 CHOW_PATEL: " << CHOW_PATEL << ", CHOW_PATEL_GPU: " << CHOW_PATEL_GPU;
+    out << "BILU0 CHOW_PATEL: " << CHOW_PATEL << ", CHOW_PATEL_GPU: " << CHOW_PATEL_GPU;
 #endif
-        OpmLog::info(out.str());
+    OpmLog::info(out.str());

-
-        if (opencl_ilu_reorder != ILUReorder::NONE) {
-            delete[] CSCRowIndices;
-            delete[] CSCColPointers;
-        }
-
-        diagIndex.resize(mat->Nb);
-        invDiagVals = new double[mat->Nb * bs * bs];
+    diagIndex.resize(mat->Nb);
+    invDiagVals.resize(mat->Nb * bs * bs);

 #if CHOW_PATEL
-        Lmat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
-        Umat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
+    Lmat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
+    Umat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
 #endif

-        LUmat->nnzValues = new double[mat->nnzbs * bs * bs];
-
-        s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
-        s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1));
-        s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb);
+    s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
+    s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1));
+    s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb);
 #if CHOW_PATEL
-        s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
-        s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
-        s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
-        s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
-        s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
-        s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
+    s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
+    s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
+    s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
+    s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
+    s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
+    s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
 #else
-        s.LUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * LUmat->nnzbs);
-        s.LUcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->nnzbs);
-        s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1));
+    s.LUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * LUmat->nnzbs);
+    s.LUcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->nnzbs);
+    s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1));
 #endif

-        events.resize(2);
-        err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals, nullptr, &events[0]);
+    events.resize(2);
+    err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals.data(), nullptr, &events[0]);

-        rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0
-        for (int i = 0; i < numColors; ++i) {
-            rowsPerColorPrefix[i+1] = rowsPerColorPrefix[i] + rowsPerColor[i];
-        }
-        err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]);
+    rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0
+    for (int i = 0; i < numColors; ++i) {
+        rowsPerColorPrefix[i + 1] = rowsPerColorPrefix[i] + rowsPerColor[i];
+    }
+    err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]);

-        cl::WaitForEvents(events);
-        events.clear();
-        if (err != CL_SUCCESS) {
-            // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
-            OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
-        }
+    cl::WaitForEvents(events);
+    events.clear();
+    if (err != CL_SUCCESS) {
+        // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
+        OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
+    }

    return true;
 } // end init()


+template <unsigned int block_size>
+bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
+{
+    const unsigned int bs = block_size;
+    auto *m = mat;

-    template <unsigned int block_size>
-    bool BILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
-    {
-        const unsigned int bs = block_size;
-        auto *m = mat;
-
-        if (opencl_ilu_reorder != ILUReorder::NONE) {
-            m = rmat.get();
-            Timer t_reorder;
-            reorderBlockedMatrixByPattern<block_size>(mat, toOrder.data(), fromOrder.data(), rmat.get());
-
-            if (verbosity >= 3){
-                std::ostringstream out;
-                out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
-                OpmLog::info(out.str());
-            }
-        }
-
-        // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
-        // this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy
-        Timer t_copy;
-        memcpy(LUmat->nnzValues, m->nnzValues, sizeof(double) * bs * bs * m->nnzbs);
-
-        if (verbosity >= 3){
-            std::ostringstream out;
-            out << "BILU0 memcpy: " << t_copy.stop() << " s";
-            OpmLog::info(out.str());
-        }
-
-#if CHOW_PATEL
-        chowPatelIlu.decomposition(queue, context,
-            LUmat.get(), Lmat.get(), Umat.get(),
-            invDiagVals, diagIndex,
-            s.diagIndex, s.invDiagVals,
-            s.Lvals, s.Lcols, s.Lrows,
-            s.Uvals, s.Ucols, s.Urows);
-#else
-        Timer t_copyToGpu;
-
-        events.resize(1);
-        queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]);
-
-        std::call_once(pattern_uploaded, [&](){
-            // find the positions of each diagonal block
-            // must be done after reordering
-            for (int row = 0; row < Nb; ++row) {
-                int rowStart = LUmat->rowPointers[row];
-                int rowEnd = LUmat->rowPointers[row+1];
-
-                auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row);
-                assert(candidate != LUmat->colIndices + rowEnd);
-                diagIndex[row] = candidate - LUmat->colIndices;
-            }
-            events.resize(4);
-            queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
-            queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
-            queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
-        });
-
-        cl::WaitForEvents(events);
-        events.clear();
-        if (err != CL_SUCCESS) {
-            // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
-            OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
-        }
+    if (opencl_ilu_reorder != ILUReorder::NONE) {
+        m = rmat.get();
+        Timer t_reorder;
+        reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rmat.get());

        if (verbosity >= 3) {
            std::ostringstream out;
-            out << "BILU0 copy to GPU: " << t_copyToGpu.stop() << " s";
-            OpmLog::info(out.str());
-        }
-
-        Timer t_decomposition;
-        std::ostringstream out;
-        cl::Event event;
-        for (int color = 0; color < numColors; ++color) {
-            const unsigned int firstRow = rowsPerColorPrefix[color];
-            const unsigned int lastRow = rowsPerColorPrefix[color+1];
-            if (verbosity >= 4) {
-                out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
-            }
-            OpenclKernels::ILU_decomp(firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, Nb, block_size);
-        }
-
-        if (verbosity >= 3) {
-            out << "BILU0 decomposition: " << t_decomposition.stop() << " s";
-            OpmLog::info(out.str());
-        }
-#endif // CHOW_PATEL
-
-        return true;
-    } // end create_preconditioner()
-
-    // kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
-    // however, if individual kernel calls are timed, waiting for events is needed
-    // behavior on other GPUs is untested
-    template <unsigned int block_size>
-    void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
-    {
-        const double relaxation = 0.9;
-        cl::Event event;
-        Timer t_apply;
-
-        for(int color = 0; color < numColors; ++color){
-#if CHOW_PATEL
-            OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size);
-#else
-            OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size);
-#endif
-        }
-
-        for(int color = numColors-1; color >= 0; --color){
-#if CHOW_PATEL
-            OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size);
-#else
-            OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size);
-#endif
-        }
-
-        // apply relaxation
-        OpenclKernels::scale(x, relaxation, N);
-
-        if (verbosity >= 4) {
-            std::ostringstream out;
-            out << "BILU0 apply: " << t_apply.stop() << " s";
+            out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
            OpmLog::info(out.str());
        }
    }

+    // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
+    // this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy
+    Timer t_copy;
+    memcpy(LUmat->nnzValues, m->nnzValues, sizeof(double) * bs * bs * m->nnzbs);

+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "BILU0 memcpy: " << t_copy.stop() << " s";
+        OpmLog::info(out.str());
+    }
+
+#if CHOW_PATEL
+    chowPatelIlu.decomposition(queue, context,
+                               LUmat.get(), Lmat.get(), Umat.get(),
+                               invDiagVals.data(), diagIndex,
+                               s.diagIndex, s.invDiagVals,
+                               s.Lvals, s.Lcols, s.Lrows,
+                               s.Uvals, s.Ucols, s.Urows);
+#else
+    Timer t_copyToGpu;
+
+    events.resize(1);
+    err = queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]);
+
+    std::call_once(pattern_uploaded, [&]() {
+        // find the positions of each diagonal block
+        // must be done after reordering
+        for (int row = 0; row < Nb; ++row) {
+            int rowStart = LUmat->rowPointers[row];
+            int rowEnd = LUmat->rowPointers[row + 1];
+
+            auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row);
+            assert(candidate != LUmat->colIndices + rowEnd);
+            diagIndex[row] = candidate - LUmat->colIndices;
+        }
+        events.resize(4);
+        err |= queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
+        err |= queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
+        err |= queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
+    });
+
+    cl::WaitForEvents(events);
+    events.clear();
+    if (err != CL_SUCCESS) {
+        // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
+        OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
+    }
+
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "BILU0 copy to GPU: " << t_copyToGpu.stop() << " s";
+        OpmLog::info(out.str());
+    }
+
+    Timer t_decomposition;
+    std::ostringstream out;
+    cl::Event event;
+    for (int color = 0; color < numColors; ++color) {
+        const unsigned int firstRow = rowsPerColorPrefix[color];
+        const unsigned int lastRow = rowsPerColorPrefix[color + 1];
+        if (verbosity >= 4) {
+            out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
+        }
+        OpenclKernels::ILU_decomp(firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, Nb, block_size);
+    }
+
+    if (verbosity >= 3) {
+        out << "BILU0 decomposition: " << t_decomposition.stop() << " s";
+        OpmLog::info(out.str());
+    }
+#endif // CHOW_PATEL
+
+    return true;
+} // end create_preconditioner()
+
+
+// kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
+// however, if individual kernel calls are timed, waiting for events is needed
+// behavior on other GPUs is untested
 template <unsigned int block_size>
-void BILU0<block_size>::setOpenCLContext(cl::Context *context_) {
-    this->context = context_;
-}
-template <unsigned int block_size>
-void BILU0<block_size>::setOpenCLQueue(cl::CommandQueue *queue_) {
-    this->queue = queue_;
+void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
+{
+    const double relaxation = 0.9;
+    cl::Event event;
+    Timer t_apply;
+
+    for (int color = 0; color < numColors; ++color) {
+#if CHOW_PATEL
+        OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size);
+#else
+        OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size);
+#endif
+    }
+
+    for (int color = numColors - 1; color >= 0; --color) {
+#if CHOW_PATEL
+        OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size);
+#else
+        OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size);
+#endif
+    }
+
+    // apply relaxation
+    OpenclKernels::scale(x, relaxation, N);
+
+    if (verbosity >= 4) {
+        std::ostringstream out;
+        out << "BILU0 apply: " << t_apply.stop() << " s";
+        OpmLog::info(out.str());
+    }
 }


-#define INSTANTIATE_BDA_FUNCTIONS(n)                              \
-template BILU0<n>::BILU0(ILUReorder, int);                        \
-template BILU0<n>::~BILU0();                                      \
-template bool BILU0<n>::init(BlockedMatrix<n>*);                  \
-template bool BILU0<n>::create_preconditioner(BlockedMatrix<n>*); \
-template void BILU0<n>::apply(const cl::Buffer&, cl::Buffer&);    \
-template void BILU0<n>::setOpenCLContext(cl::Context*);           \
-template void BILU0<n>::setOpenCLQueue(cl::CommandQueue*);
+
+#define INSTANTIATE_BDA_FUNCTIONS(n) \
+template class BILU0<n>;


 INSTANTIATE_BDA_FUNCTIONS(1);
--- a/opm/simulators/linalg/bda/BILU0.hpp
+++ b/opm/simulators/linalg/bda/BILU0.hpp
@@ -27,6 +27,7 @@

 #include <opm/simulators/linalg/bda/opencl.hpp>
 #include <opm/simulators/linalg/bda/openclKernels.hpp>
+#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
 #include <opm/simulators/linalg/bda/ChowPatelIlu.hpp>


@@ -35,89 +36,86 @@ namespace Opm
 namespace Accelerator
 {

-    /// This class implementa a Blocked ILU0 preconditioner
-    /// The decomposition is done on CPU, and reorders the rows of the matrix
-    template <unsigned int block_size>
-    class BILU0
-    {
+/// This class implements a Blocked ILU0 preconditioner
+/// The decomposition is done on CPU, and reorders the rows of the matrix
+template <unsigned int block_size>
+class BILU0 : public Preconditioner<block_size>
+{
+    typedef Preconditioner<block_size> Base;

-    private:
-        int N;       // number of rows of the matrix
-        int Nb;      // number of blockrows of the matrix
-        int nnz;     // number of nonzeroes of the matrix (scalar)
-        int nnzbs;   // number of blocks of the matrix
-        std::unique_ptr<BlockedMatrix<block_size> > LUmat = nullptr;
-        std::shared_ptr<BlockedMatrix<block_size> > rmat = nullptr; // only used with PAR_SIM
+    using Base::N;
+    using Base::Nb;
+    using Base::nnz;
+    using Base::nnzb;
+    using Base::verbosity;
+    using Base::context;
+    using Base::queue;
+    using Base::events;
+    using Base::err;
+
+private:
+    std::unique_ptr<BlockedMatrix> LUmat = nullptr;
+    std::shared_ptr<BlockedMatrix> rmat = nullptr; // only used with PAR_SIM
 #if CHOW_PATEL
-        std::unique_ptr<BlockedMatrix<block_size> > Lmat = nullptr, Umat = nullptr;
+    std::unique_ptr<BlockedMatrix> Lmat = nullptr, Umat = nullptr;
 #endif
-        double *invDiagVals = nullptr;
-        std::vector<int> diagIndex;
-        std::vector<int> rowsPerColor;  // color i contains rowsPerColor[i] rows, which are processed in parallel
-        std::vector<int> rowsPerColorPrefix;  // the prefix sum of rowsPerColor
-        std::vector<int> toOrder, fromOrder;
-        int numColors;
-        int verbosity;
-        std::once_flag pattern_uploaded;
+    std::vector<double> invDiagVals;
+    std::vector<int> diagIndex;
+    std::vector<int> rowsPerColor;  // color i contains rowsPerColor[i] rows, which are processed in parallel
+    std::vector<int> rowsPerColorPrefix;  // the prefix sum of rowsPerColor
+    std::vector<int> toOrder, fromOrder;
+    int numColors;
+    std::once_flag pattern_uploaded;

-        ILUReorder opencl_ilu_reorder;
+    ILUReorder opencl_ilu_reorder;

-        typedef struct {
-            cl::Buffer invDiagVals;
-            cl::Buffer diagIndex;
-            cl::Buffer rowsPerColor;
+    typedef struct {
+        cl::Buffer invDiagVals;
+        cl::Buffer diagIndex;
+        cl::Buffer rowsPerColor;
 #if CHOW_PATEL
-            cl::Buffer Lvals, Lcols, Lrows;
-            cl::Buffer Uvals, Ucols, Urows;
+        cl::Buffer Lvals, Lcols, Lrows;
+        cl::Buffer Uvals, Ucols, Urows;
 #else
-            cl::Buffer LUvals, LUcols, LUrows;
+        cl::Buffer LUvals, LUcols, LUrows;
 #endif
-        } GPU_storage;
+    } GPU_storage;

-        GPU_storage s;
-        cl::Context *context;
-        cl::CommandQueue *queue;
-        std::vector<cl::Event> events;
-        cl_int err;
+    GPU_storage s;

 #if CHOW_PATEL
-        ChowPatelIlu<block_size> chowPatelIlu;
+    ChowPatelIlu<block_size> chowPatelIlu;
 #endif

-    public:
+public:

-        BILU0(ILUReorder opencl_ilu_reorder, int verbosity);
+    BILU0(ILUReorder opencl_ilu_reorder, int verbosity);

-        ~BILU0();
+    // analysis, find reordering if specified
+    bool analyze_matrix(BlockedMatrix *mat) override;

-        // analysis
-        bool init(BlockedMatrix<block_size> *mat);
+    // ilu_decomposition
+    bool create_preconditioner(BlockedMatrix *mat) override;

-        // ilu_decomposition
-        bool create_preconditioner(BlockedMatrix<block_size> *mat);
+    // apply preconditioner, x = prec(y)
+    void apply(const cl::Buffer& y, cl::Buffer& x) override;

-        // apply preconditioner, x = prec(y)
-        void apply(const cl::Buffer& y, cl::Buffer& x);
+    int* getToOrder() override
+    {
+        return toOrder.data();
+    }

-        void setOpenCLContext(cl::Context *context);
-        void setOpenCLQueue(cl::CommandQueue *queue);
+    int* getFromOrder() override
+    {
+        return fromOrder.data();
+    }

-        int* getToOrder()
-        {
-            return toOrder.data();
-        }
+    BlockedMatrix* getRMat() override
+    {
+        return rmat.get();
+    }

-        int* getFromOrder()
-        {
-            return fromOrder.data();
-        }
-
-        BlockedMatrix<block_size>* getRMat()
-        {
-            return rmat.get();
-        }
-
-    };
+};

 } // namespace Accelerator
 } // namespace Opm
--- a/opm/simulators/linalg/bda/BdaBridge.cpp
+++ b/opm/simulators/linalg/bda/BdaBridge.cpp
@@ -63,7 +63,8 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
                                                             double tolerance,
                                                             [[maybe_unused]] unsigned int platformID,
                                                             unsigned int deviceID,
-                                                             [[maybe_unused]] std::string opencl_ilu_reorder)
+                                                             [[maybe_unused]] std::string opencl_ilu_reorder,
+                                                             [[maybe_unused]] std::string linsolver)
 : verbosity(linear_solver_verbosity), accelerator_mode(accelerator_mode_)
 {
    if (accelerator_mode.compare("cusparse") == 0) {
@@ -88,7 +89,7 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
        } else {
            OPM_THROW(std::logic_error, "Error invalid argument for --opencl-ilu-reorder, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring]'");
        }
-        backend.reset(new Opm::Accelerator::openclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID, ilu_reorder));
+        backend.reset(new Opm::Accelerator::openclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID, ilu_reorder, linsolver));
 #else
        OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
 #endif
@@ -168,29 +169,26 @@ int checkZeroDiagonal(BridgeMatrix& mat) {
 // iterate sparsity pattern from Matrix and put colIndices and rowPointers in arrays
 // sparsity pattern should stay the same
 // this could be removed if Dune::BCRSMatrix features an API call that returns colIndices and rowPointers
-template <class BridgeMatrix>
-void getSparsityPattern(BridgeMatrix& mat, std::vector<int> &h_rows, std::vector<int> &h_cols) {
-    int sum_nnzs = 0;
+template <class BridgeMatrix, class BridgeVector, int block_size>
+void BdaBridge<BridgeMatrix, BridgeVector, block_size>::copySparsityPatternFromISTL(const BridgeMatrix& mat, std::vector<int> &h_rows, std::vector<int> &h_cols) {
+
+    h_rows.clear();
+    h_cols.clear();

    // convert colIndices and rowPointers
-    if (h_rows.empty()) {
-        h_rows.emplace_back(0);
-            for (typename BridgeMatrix::const_iterator r = mat.begin(); r != mat.end(); ++r) {
-                int size_row = 0;
-                for (auto c = r->begin(); c != r->end(); ++c) {
-                    h_cols.emplace_back(c.index());
-                    size_row++;
-                }
-                sum_nnzs += size_row;
-                h_rows.emplace_back(sum_nnzs);
-            }
-
-        // h_rows and h_cols could be changed to 'unsigned int', but cusparse expects 'int'
-        if (static_cast<unsigned int>(h_rows[mat.N()]) != mat.nonzeroes()) {
-            OPM_THROW(std::logic_error, "Error size of rows do not sum to number of nonzeroes in BdaBridge::getSparsityPattern()");
+    h_rows.emplace_back(0);
+    for (typename BridgeMatrix::const_iterator r = mat.begin(); r != mat.end(); ++r) {
+        for (auto c = r->begin(); c != r->end(); ++c) {
+            h_cols.emplace_back(c.index());
        }
+        h_rows.emplace_back(h_cols.size());
    }
-} // end getSparsityPattern()
+
+    // h_rows and h_cols could be changed to 'unsigned int', but cusparse expects 'int'
+    if (static_cast<unsigned int>(h_rows[mat.N()]) != mat.nonzeroes()) {
+        OPM_THROW(std::logic_error, "Error size of rows do not sum to number of nonzeroes in BdaBridge::copySparsityPatternFromISTL()");
+    }
+}


 template <class BridgeMatrix, class BridgeVector, int block_size>
@@ -220,7 +218,7 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system([[maybe_unu
        if (h_rows.capacity() == 0) {
            h_rows.reserve(Nb+1);
            h_cols.reserve(nnzb);
-            getSparsityPattern(*mat, h_rows, h_cols);
+            copySparsityPatternFromISTL(*mat, h_rows, h_cols);
        }

        Dune::Timer t_zeros;
--- a/opm/simulators/linalg/bda/BdaBridge.hpp
+++ b/opm/simulators/linalg/bda/BdaBridge.hpp
@@ -54,7 +54,9 @@ public:
    /// \param[in] platformID                 the OpenCL platform ID to be used
    /// \param[in] deviceID                   the device ID to be used by the cusparse- and openclSolvers, too high values could cause runtime errors
    /// \param[in] opencl_ilu_reorder         select either level_scheduling or graph_coloring, see ILUReorder.hpp for explanation
-    BdaBridge(std::string accelerator_mode, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder);
+    /// \param[in] linsolver                  copy of cmdline argument --linsolver
+    BdaBridge(std::string accelerator_mode, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance,
+        unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder, std::string linsolver);


    /// Solve linear system, A*x = b
@@ -75,6 +77,12 @@ public:
        return use_gpu;
    }

+    /// Store sparsity pattern into vectors
+    /// \param[in] mat       input matrix, probably BCRSMatrix
+    /// \param[out] h_rows   rowpointers
+    /// \param[out] h_cols   columnindices
+    static void copySparsityPatternFromISTL(const BridgeMatrix& mat, std::vector<int>& h_rows, std::vector<int>& h_cols);
+
    /// Initialize the WellContributions object with opencl context and queue
    /// those must be set before calling BlackOilWellModel::getWellContributions() in ISTL
    /// \param[in] wellContribs   container to hold all WellContributions
--- a/opm/simulators/linalg/bda/BdaSolver.hpp
+++ b/opm/simulators/linalg/bda/BdaSolver.hpp
@@ -76,6 +76,7 @@ namespace Accelerator {
        /// \param[in] tolerance                  required relative tolerance for solver
        /// \param[in] platformID                 the OpenCL platform to be used, only used in openclSolver
        /// \param[in] deviceID                   the device to be used
+        BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_) {};
        BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), deviceID(deviceID_) {};
        BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), platformID(platformID_), deviceID(deviceID_) {};
        BdaSolver(std::string fpga_bitstream, int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), bitstream(fpga_bitstream) {};
--- a/opm/simulators/linalg/bda/BlockedMatrix.cpp
+++ b/opm/simulators/linalg/bda/BlockedMatrix.cpp
@@ -26,7 +26,9 @@
 #include <opm/common/ErrorMacros.hpp>

 #include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
+#include <opm/simulators/linalg/bda/Matrix.hpp>
 #include <opm/simulators/linalg/bda/FPGAUtils.hpp>
+#include <opm/simulators/linalg/bda/Matrix.hpp>

 namespace Opm
 {
@@ -37,8 +39,7 @@ using Opm::OpmLog;

 /*Sort a row of matrix elements from a blocked CSR-format.*/

-template <unsigned int block_size>
-void sortBlockedRow(int *colIndices, double *data, int left, int right) {
+void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned block_size) {
    const unsigned int bs = block_size;
    int l = left;
    int r = right;
@@ -63,17 +64,16 @@ void sortBlockedRow(int *colIndices, double *data, int left, int right) {
    } while (l < r);

    if (left < r)
-        sortBlockedRow<bs>(colIndices, data, left, r);
+        sortBlockedRow(colIndices, data, left, r, bs);

    if (right > l)
-        sortBlockedRow<bs>(colIndices, data, l, right);
+        sortBlockedRow(colIndices, data, l, right, bs);
 }


 // LUMat->nnzValues[ik] = LUMat->nnzValues[ik] - (pivot * LUMat->nnzValues[jk]) in ilu decomposition
 // a = a - (b * c)
-template <unsigned int block_size>
-void blockMultSub(double *a, double *b, double *c)
+void blockMultSub(double *a, double *b, double *c, unsigned int block_size)
 {
    for (unsigned int row = 0; row < block_size; row++) {
        for (unsigned int col = 0; col < block_size; col++) {
@@ -88,8 +88,7 @@ void blockMultSub(double *a, double *b, double *c)

 /*Perform a 3x3 matrix-matrix multiplicationj on two blocks*/

-template <unsigned int block_size>
-void blockMult(double *mat1, double *mat2, double *resMat) {
+void blockMult(double *mat1, double *mat2, double *resMat, unsigned int block_size) {
    for (unsigned int row = 0; row < block_size; row++) {
        for (unsigned int col = 0; col < block_size; col++) {
            double temp = 0;
@@ -104,8 +103,7 @@ void blockMult(double *mat1, double *mat2, double *resMat) {
 #if HAVE_FPGA

 /*Subtract two blocks from one another element by element*/
-template <unsigned int block_size>
-void blockSub(double *mat1, double *mat2, double *resMat) {
+void blockSub(double *mat1, double *mat2, double *resMat, unsigned int block_size) {
    for (unsigned int row = 0; row < block_size; row++) {
        for (unsigned int col = 0; col < block_size; col++) {
            resMat[row * block_size + col] = mat1[row * block_size + col] - mat2[row * block_size + col];
@@ -114,8 +112,7 @@ void blockSub(double *mat1, double *mat2, double *resMat) {
 }

 /*Multiply a block with a vector block, and add the result, scaled by a constant, to the result vector*/
-template <unsigned int block_size>
-void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes) {
+void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes, unsigned int block_size) {
    for (unsigned int row = 0; row < block_size; row++) {
        if (resetRes) {
            resVect[row] = 0.0;
@@ -128,8 +125,7 @@ void blockVectMult(double *mat, double *vect, double scale, double *resVect, boo



-template <unsigned int block_size>
-int BlockedMatrix<block_size>::countUnblockedNnzs() {
+int BlockedMatrix::countUnblockedNnzs() {
    int numNnzsOverThreshold = 0;
    int totalNnzs = rowPointers[Nb];
    for (unsigned int idx = 0; idx < totalNnzs * block_size * block_size; idx++) {
@@ -144,8 +140,7 @@ int BlockedMatrix<block_size>::countUnblockedNnzs() {
 * Unblock the blocked matrix. Input the blocked matrix and output a CSR matrix without blocks.
 * If unblocking the U matrix, the rows in all blocks need to written to the new matrix in reverse order.
 */
-template <unsigned int block_size>
-void BlockedMatrix<block_size>::unblock(Matrix *mat, bool isUMatrix) {
+void BlockedMatrix::unblock(Matrix *mat, bool isUMatrix) {
    const unsigned int bs = block_size;
    int valIndex = 0, nnzsPerRow;

@@ -183,8 +178,7 @@ void BlockedMatrix<block_size>::unblock(Matrix *mat, bool isUMatrix) {

 /*Optimized version*/
 // ub* prefixes indicate unblocked data
-template <unsigned int block_size>
-int BlockedMatrix<block_size>::toRDF(int numColors, int *nodesPerColor, bool isUMatrix,
+int BlockedMatrix::toRDF(int numColors, int *nodesPerColor, bool isUMatrix,
                                     std::vector<std::vector<int> >& colIndicesInColor, int nnzsPerRowLimit, int *nnzValsSizes,
                                     std::vector<std::vector<double> >& ubNnzValues, short int *ubColIndices, unsigned char *NROffsets, int *colorSizes, int *valSize)
 {
@@ -223,8 +217,7 @@ int BlockedMatrix<block_size>::toRDF(int numColors, int *nodesPerColor, bool isU
 // PIndicesAddr: contiguously for each color: indices of x in global x vector, unblocked
 //               if color 0 has A unique colAccesses, PIndicesAddr[0 - A] are for color 0
 //               then PIndicesAddr[A - A+B] are for color 1. Directly copied to FPGA
-template <unsigned int block_size>
-int BlockedMatrix<block_size>::findPartitionColumns(int numColors, int *nodesPerColor,
+int BlockedMatrix::findPartitionColumns(int numColors, int *nodesPerColor,
        int rowsPerColorLimit, int columnsPerColorLimit,
        std::vector<std::vector<int> >& colIndicesInColor, int *PIndicesAddr, int *colorSizes,
        std::vector<std::vector<int> >& LColIndicesInColor, int *LPIndicesAddr, int *LColorSizes,
@@ -470,41 +463,5 @@ void blockedDiagtoRDF(double *blockedDiagVals, int rowSize, int numColors, std::
 #endif // HAVE_FPGA


-
-#define INSTANTIATE_BDA_FUNCTIONS(n)                                        \
-template void sortBlockedRow<n>(int *, double *, int, int);                 \
-template void blockMultSub<n>(double *, double *, double *);                \
-template void blockMult<n>(double *, double *, double *);                   \
-
-INSTANTIATE_BDA_FUNCTIONS(1);
-INSTANTIATE_BDA_FUNCTIONS(2);
-INSTANTIATE_BDA_FUNCTIONS(3);
-INSTANTIATE_BDA_FUNCTIONS(4);
-INSTANTIATE_BDA_FUNCTIONS(5);
-INSTANTIATE_BDA_FUNCTIONS(6);
-
-#undef INSTANTIATE_BDA_FUNCTIONS
-
-#if HAVE_FPGA
-#define INSTANTIATE_BDA_FPGA_FUNCTIONS(n)                                             \
-template void blockSub<n>(double *, double *, double *);                              \
-template void blockVectMult<n>(double *, double *, double, double *, bool);           \
-template int BlockedMatrix<n>::toRDF(int, int *, bool,                                \
-    std::vector<std::vector<int> >& , int, int *,                                     \
-    std::vector<std::vector<double> >&, short int *, unsigned char *, int *,  int *); \
-template int BlockedMatrix<n>::findPartitionColumns(int, int *,                       \
-        int, int,                                                                     \
-        std::vector<std::vector<int> >& , int *, int *,                               \
-        std::vector<std::vector<int> >& , int *, int *,                               \
-        std::vector<std::vector<int> >& , int *, int *);
-
-INSTANTIATE_BDA_FPGA_FUNCTIONS(1);
-INSTANTIATE_BDA_FPGA_FUNCTIONS(2);
-INSTANTIATE_BDA_FPGA_FUNCTIONS(3);
-INSTANTIATE_BDA_FPGA_FUNCTIONS(4);
-
-#undef INSTANTIATE_BDA_FPGA_FUNCTIONS
-#endif // HAVE_FPGA
-
 } // namespace Accelerator
 } // namespace Opm
--- a/opm/simulators/linalg/bda/BlockedMatrix.hpp
+++ b/opm/simulators/linalg/bda/BlockedMatrix.hpp
@@ -17,14 +17,14 @@
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef BLOCKED_MATRIX_HPP
-#define BLOCKED_MATRIX_HPP
+#ifndef OPM_BLOCKED_MATRIX_HPP
+#define OPM_BLOCKED_MATRIX_HPP

 #if HAVE_FPGA
 #include <vector>
+#include <opm/simulators/linalg/bda/Matrix.hpp>
 #endif

-#include <opm/simulators/linalg/bda/FPGAMatrix.hpp>

 namespace Opm
 {
@@ -33,7 +33,6 @@ namespace Accelerator

 /// This struct resembles a blocked csr matrix, like Dune::BCRSMatrix.
 /// The data is stored in contiguous memory, such that they can be copied to a device in one transfer.
-template<unsigned int block_size>
 class BlockedMatrix
 {

@@ -42,12 +41,14 @@ public:
    /// Allocate BlockedMatrix and data arrays with given sizes
    /// \param[in] Nb               number of blockrows
    /// \param[in] nnzbs            number of nonzero blocks
-    BlockedMatrix(int Nb_, int nnzbs_)
-    : nnzValues(new double[nnzbs_*block_size*block_size]),
-      colIndices(new int[nnzbs_*block_size*block_size]),
+    /// \param[in] block_size       the number of rows and columns for each block
+    BlockedMatrix(int Nb_, int nnzbs_, unsigned int block_size_)
+    : nnzValues(new double[nnzbs_*block_size_*block_size_]),
+      colIndices(new int[nnzbs_*block_size_*block_size_]),
      rowPointers(new int[Nb_+1]),
      Nb(Nb_),
      nnzbs(nnzbs_),
+      block_size(block_size_),
      deleteNnzs(true),
      deleteSparsity(true)
    {}
@@ -55,11 +56,12 @@ public:
    /// Allocate BlockedMatrix, but copy sparsity pattern instead of allocating new memory
    /// \param[in] M              matrix to be copied
    BlockedMatrix(const BlockedMatrix& M)
-    : nnzValues(new double[M.nnzbs*block_size*block_size]),
+    : nnzValues(new double[M.nnzbs*M.block_size*M.block_size]),
      colIndices(M.colIndices),
      rowPointers(M.rowPointers),
      Nb(M.Nb),
      nnzbs(M.nnzbs),
+      block_size(M.block_size),
      deleteNnzs(true),
      deleteSparsity(false)
    {}
@@ -67,15 +69,17 @@ public:
    /// Allocate BlockedMatrix, but let data arrays point to existing arrays
    /// \param[in] Nb             number of blockrows
    /// \param[in] nnzbs          number of nonzero blocks
+    /// \param[in] block_size     the number of rows and columns for each block
    /// \param[in] nnzValues      array of nonzero values, contains nnzb*block_size*block_size scalars
    /// \param[in] colIndices     array of column indices, contains nnzb entries
    /// \param[in] rowPointers    array of row pointers, contains Nb+1 entries
-    BlockedMatrix(int Nb_, int nnzbs_, double *nnzValues_, int *colIndices_, int *rowPointers_)
+    BlockedMatrix(int Nb_, int nnzbs_, unsigned int block_size_, double *nnzValues_, int *colIndices_, int *rowPointers_)
    : nnzValues(nnzValues_),
      colIndices(colIndices_),
      rowPointers(rowPointers_),
      Nb(Nb_),
      nnzbs(nnzbs_),
+      block_size(block_size_),
      deleteNnzs(false),
      deleteSparsity(false)
    {}
@@ -117,6 +121,7 @@ public:
    int *rowPointers;
    int Nb;
    int nnzbs;
+    unsigned int block_size;
    bool deleteNnzs;
    bool deleteSparsity;
 };
@@ -127,31 +132,45 @@ public:
 /// \param[inout] data           
 /// \param[in] left              lower index of data of row
 /// \param[in] right             upper index of data of row
-template <unsigned int block_size>
-void sortBlockedRow(int *colIndices, double *data, int left, int right);
+/// \param[in] block_size        size of blocks in the row
+void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned block_size);

 /// Multiply and subtract blocks
 /// a = a - (b * c)
 /// \param[inout] a              block to be subtracted from
 /// \param[in] b                 input block
 /// \param[in] c                 input block
-template <unsigned int block_size>
-void blockMultSub(double *a, double *b, double *c);
+/// \param[in] block_size        size of block
+void blockMultSub(double *a, double *b, double *c, unsigned int block_size);

-/// Perform a 3x3 matrix-matrix multiplication on two blocks
+/// Perform a matrix-matrix multiplication on two blocks
+/// resMat = mat1 * mat2
 /// \param[in] mat1              input block 1
 /// \param[in] mat2              input block 2
-/// \param[inout] resMat         output block
-template <unsigned int block_size>
-void blockMult(double *mat1, double *mat2, double *resMat);
+/// \param[out] resMat           output block
+/// \param[in] block_size        size of block
+void blockMult(double *mat1, double *mat2, double *resMat, unsigned int block_size);


 #if HAVE_FPGA
-template <unsigned int block_size>
-void blockSub(double *mat1, double *mat2, double *resMat);
+/// Perform a matrix-matrix subtraction on two blocks, element-wise
+/// resMat = mat1 - mat2
+/// \param[in] mat1              input block 1
+/// \param[in] mat2              input block 2
+/// \param[out] resMat           output block
+/// \param[in] block_size        size of block
+void blockSub(double *mat1, double *mat2, double *resMat, unsigned int block_size);

-template <unsigned int block_size>
-void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes);
+/// Perform a matrix-vector multiplication
+/// resVect = mat * vect
+/// resVect += mat * vect
+/// \param[in] mat               input matrix
+/// \param[in] vect              input vector
+/// \param[in] scale             multiply output with this factor
+/// \param[inout] resVect        output vector
+/// \param[in] resetRes          if true, overwrite resVect, otherwise add to it
+/// \param[in] block_size        size of block
+void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes, unsigned int block_size);

 /// Convert a blocked inverse diagonal to the FPGA format.
 /// This is the only blocked structure on the FPGA, since it needs blocked matrix-vector multiplication after the backwards substitution of U.
--- a/opm/simulators/linalg/bda/CPR.cpp
+++ b/opm/simulators/linalg/bda/CPR.cpp
@@ -0,0 +1,535 @@
+/*
+  Copyright 2021 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+#include <dune/common/timer.hh>
+
+#include <dune/common/shared_ptr.hh>
+
+#include <opm/simulators/linalg/PreconditionerFactory.hpp>
+#include <opm/simulators/linalg/PropertyTree.hpp>
+
+#include <opm/simulators/linalg/bda/BdaBridge.hpp>
+#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
+#include <opm/simulators/linalg/bda/CPR.hpp>
+#include <opm/simulators/linalg/bda/OpenclMatrix.hpp>
+
+
+namespace Opm
+{
+namespace Accelerator
+{
+
+using Opm::OpmLog;
+using Dune::Timer;
+
+template <unsigned int block_size>
+CPR<block_size>::CPR(int verbosity_, ILUReorder opencl_ilu_reorder_) :
+    Preconditioner<block_size>(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_)
+{
+    bilu0 = std::make_unique<BILU0<block_size> >(opencl_ilu_reorder, verbosity_);
+    diagIndices.resize(1);
+}
+
+
+template <unsigned int block_size>
+void CPR<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
+
+    context = context_;
+    queue = queue_;
+
+    bilu0->setOpencl(context, queue);
+}
+
+
+template <unsigned int block_size>
+bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_) {
+
+    this->Nb = mat_->Nb;
+    this->nnzb = mat_->nnzbs;
+    this->N = Nb * block_size;
+    this->nnz = nnzb * block_size * block_size;
+
+    bool success = bilu0->analyze_matrix(mat_);
+
+    if (opencl_ilu_reorder == ILUReorder::NONE) {
+        mat = mat_;
+    } else {
+        mat = bilu0->getRMat();
+    }
+    return success;
+}
+
+
+template <unsigned int block_size>
+bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_) {
+    Dune::Timer t_bilu0;
+    bool result = bilu0->create_preconditioner(mat_);
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "CPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
+        OpmLog::info(out.str());
+    }
+
+    Dune::Timer t_amg;
+    create_preconditioner_amg(mat); // already points to bilu0::rmat if needed
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "CPR create_preconditioner_amg(): " << t_amg.stop() << " s";
+        OpmLog::info(out.str());
+    }
+    return result;
+}
+
+// return the absolute value of the N elements for which the absolute value is highest
+double get_absmax(const double *data, const int N) {
+    return std::abs(*std::max_element(data, data + N, [](double a, double b){return std::fabs(a) < std::fabs(b);}));
+}
+
+
+// solve A^T * x = b
+void solve_transposed_3x3(const double *A, const double *b, double *x) {
+    const int B = 3;
+    // from dune-common/densematrix.hh, but transposed, so replace [r*B+c] with [r+c*B]
+    double t4  = A[0+0*B] * A[1+1*B];
+    double t6  = A[0+0*B] * A[1+2*B];
+    double t8  = A[0+1*B] * A[1+0*B];
+    double t10 = A[0+2*B] * A[1+0*B];
+    double t12 = A[0+1*B] * A[2+0*B];
+    double t14 = A[0+2*B] * A[2+0*B];
+
+    double d = (t4*A[2+2*B]-t6*A[2+1*B]-t8*A[2+2*B]+
+          t10*A[2+1*B]+t12*A[1+2*B]-t14*A[1+1*B]); //determinant
+
+    x[0] = (b[0]*A[1+1*B]*A[2+2*B] - b[0]*A[2+1*B]*A[1+2*B]
+          - b[1] *A[0+1*B]*A[2+2*B] + b[1]*A[2+1*B]*A[0+2*B]
+          + b[2] *A[0+1*B]*A[1+2*B] - b[2]*A[1+1*B]*A[0+2*B]) / d;
+
+    x[1] = (A[0+0*B]*b[1]*A[2+2*B] - A[0+0*B]*b[2]*A[1+2*B]
+          - A[1+0*B] *b[0]*A[2+2*B] + A[1+0*B]*b[2]*A[0+2*B]
+          + A[2+0*B] *b[0]*A[1+2*B] - A[2+0*B]*b[1]*A[0+2*B]) / d;
+
+    x[2] = (A[0+0*B]*A[1+1*B]*b[2] - A[0+0*B]*A[2+1*B]*b[1]
+          - A[1+0*B] *A[0+1*B]*b[2] + A[1+0*B]*A[2+1*B]*b[0]
+          + A[2+0*B] *A[0+1*B]*b[1] - A[2+0*B]*A[1+1*B]*b[0]) / d;
+}
+
+
+template <unsigned int block_size>
+void CPR<block_size>::init_opencl_buffers() {
+    d_Amatrices.reserve(num_levels);
+    d_Rmatrices.reserve(num_levels - 1);
+    d_invDiags.reserve(num_levels - 1);
+    for (Matrix& m : Amatrices) {
+        d_Amatrices.emplace_back(context.get(), m.N, m.M, m.nnzs, 1);
+    }
+    for (Matrix& m : Rmatrices) {
+        d_Rmatrices.emplace_back(context.get(), m.N, m.M, m.nnzs, 1);
+        d_f.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.N);
+        d_u.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.N);
+
+        d_PcolIndices.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(int) * m.M);
+        d_invDiags.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.M); // create a cl::Buffer
+        d_t.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.M);
+    }
+    d_weights = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+    d_rs = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+    d_mat = std::make_unique<OpenclMatrix>(context.get(), Nb, Nb, nnzb, block_size);
+    d_coarse_y = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * Nb);
+    d_coarse_x = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * Nb);
+}
+
+
+template <unsigned int block_size>
+void CPR<block_size>::opencl_upload() {
+    d_mat->upload(queue.get(), mat);
+
+    err = CL_SUCCESS;
+    events.resize(2 * Rmatrices.size() + 1);
+    err |= queue->enqueueWriteBuffer(*d_weights, CL_FALSE, 0, sizeof(double) * N, weights.data(), nullptr, &events[0]);
+    for (unsigned int i = 0; i < Rmatrices.size(); ++i) {
+        d_Amatrices[i].upload(queue.get(), &Amatrices[i]);
+
+        err |= queue->enqueueWriteBuffer(d_invDiags[i], CL_FALSE, 0, sizeof(double) * Amatrices[i].N, invDiags[i].data(), nullptr, &events[2*i+1]);
+        err |= queue->enqueueWriteBuffer(d_PcolIndices[i], CL_FALSE, 0, sizeof(int) * Amatrices[i].N, PcolIndices[i].data(), nullptr, &events[2*i+2]);
+    }
+    cl::WaitForEvents(events);
+    events.clear();
+    if (err != CL_SUCCESS) {
+        // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
+        OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
+    }
+    for (unsigned int i = 0; i < Rmatrices.size(); ++i) {
+        d_Rmatrices[i].upload(queue.get(), &Rmatrices[i]);
+    }
+}
+
+
+template <unsigned int block_size>
+void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
+    this->mat = mat_;
+
+    coarse_vals.resize(nnzb);
+    coarse_x.resize(Nb);
+    coarse_y.resize(Nb);
+    weights.resize(N);
+
+    try{
+        double rhs[] = {0, 0, 0};
+        rhs[pressure_idx] = 1;
+
+        // find diagonal index for each row
+        if (diagIndices[0].empty()) {
+            diagIndices[0].resize(Nb);
+            for (int row = 0; row < Nb; ++row) {
+                int start = mat->rowPointers[row];
+                int end = mat->rowPointers[row + 1];
+                auto candidate = std::find(mat->colIndices + start, mat->colIndices + end, row);
+                assert(candidate != mat->colIndices + end);
+                diagIndices[0][row] = candidate - mat->colIndices;
+            }
+        }
+
+        // calculate weights for each row
+        for (int row = 0; row < Nb; ++row) {
+            // solve to find weights
+            double *row_weights = weights.data() + block_size * row; // weights for this row
+            solve_transposed_3x3(mat->nnzValues + block_size * block_size * diagIndices[0][row], rhs, row_weights);
+
+            // normalize weights for this row
+            double abs_max = get_absmax(row_weights, block_size);
+            for(unsigned int i = 0; i < block_size; i++){
+                row_weights[i] /= abs_max;
+            }
+        }
+
+        // extract pressure
+        // transform blocks to scalars to create scalar linear system
+        for (int row = 0; row < Nb; ++row) {
+            int start = mat->rowPointers[row];
+            int end = mat->rowPointers[row + 1];
+            for (int idx = start; idx < end; ++idx) {
+                double *block = mat->nnzValues + idx * block_size * block_size;
+                double *row_weights = weights.data() + block_size * row;
+                double value = 0.0;
+                for (unsigned int i = 0; i < block_size; ++i) {
+                    value += block[block_size * i + pressure_idx] * row_weights[i];
+                }
+                coarse_vals[idx] = value;
+            }
+        }
+
+        using Communication = Dune::OwnerOverlapCopyCommunication<int, int>;
+        using OverlapFlags = Dune::NegateSet<Communication::OwnerSet>;
+        if (recalculate_aggregates) {
+            dune_coarse = std::make_unique<DuneMat>(Nb, Nb, nnzb, DuneMat::row_wise);
+
+            typedef DuneMat::CreateIterator Iter;
+
+            // setup sparsity pattern
+            for(Iter row = dune_coarse->createbegin(); row != dune_coarse->createend(); ++row){
+                int start = mat->rowPointers[row.index()];
+                int end = mat->rowPointers[row.index() + 1];
+                for (int idx = start; idx < end; ++idx) {
+                    int col = mat->colIndices[idx];
+                    row.insert(col);
+                }
+            }
+
+            // set values
+            for (int row = 0; row < Nb; ++row) {
+                int start = mat->rowPointers[row];
+                int end = mat->rowPointers[row + 1];
+                for (int idx = start; idx < end; ++idx) {
+                    int col = mat->colIndices[idx];
+                    (*dune_coarse)[row][col] = coarse_vals[idx];
+                }
+            }
+
+            dune_op = std::make_shared<MatrixOperator>(*dune_coarse);
+            Dune::Amg::SequentialInformation seqinfo;
+            dune_amg = std::make_unique<DuneAmg>(dune_op, Dune::stackobject_to_shared_ptr(seqinfo));
+
+            Opm::PropertyTree property_tree;
+            property_tree.put("alpha", 0.333333333333);
+
+            // The matrix has a symmetric sparsity pattern, but the values are not symmetric
+            // Yet a SymmetricDependency is used in AMGCPR
+            // An UnSymmetricCriterion is also available
+            // using Criterion = Dune::Amg::CoarsenCriterion<Dune::Amg::UnSymmetricCriterion<DuneMat, Dune::Amg::FirstDiagonal> >;
+            using CriterionBase = Dune::Amg::AggregationCriterion<Dune::Amg::SymmetricDependency<DuneMat, Dune::Amg::FirstDiagonal>>;
+            using Criterion = Dune::Amg::CoarsenCriterion<CriterionBase>;
+            const Criterion c = Opm::PreconditionerFactory<MatrixOperator, Dune::Amg::SequentialInformation>::amgCriterion(property_tree);
+            num_pre_smooth_steps = c.getNoPreSmoothSteps();
+            num_post_smooth_steps = c.getNoPostSmoothSteps();
+
+            dune_amg->build<OverlapFlags>(c);
+
+            analyzeHierarchy();
+            analyzeAggregateMaps();
+
+            recalculate_aggregates = false;
+        } else {
+            // update values of coarsest level in AMG
+            // this works because that level is actually a reference to the DuneMat held by dune_coarse
+            for (int row = 0; row < Nb; ++row) {
+                int start = mat->rowPointers[row];
+                int end = mat->rowPointers[row + 1];
+                for (int idx = start; idx < end; ++idx) {
+                    int col = mat->colIndices[idx];
+                    (*dune_coarse)[row][col] = coarse_vals[idx];
+                }
+            }
+
+            // update the rest of the AMG hierarchy
+            dune_amg->recalculateGalerkin(OverlapFlags());
+            analyzeHierarchy();
+        }
+
+        // initialize OpenclMatrices and Buffers if needed
+        auto init_func = std::bind(&CPR::init_opencl_buffers, this);
+        std::call_once(opencl_buffers_allocated, init_func);
+
+        // upload matrices and vectors to GPU
+        opencl_upload();
+
+    } catch (const std::exception& ex) {
+        std::cerr << "Caught exception: " << ex.what() << std::endl;
+        throw ex;
+    }
+}
+
+
+template <unsigned int block_size>
+void CPR<block_size>::analyzeHierarchy() {
+    const DuneAmg::ParallelMatrixHierarchy& matrixHierarchy = dune_amg->matrices();
+
+    // store coarsest AMG level in umfpack format, also performs LU decomposition
+    umfpack.setMatrix((*matrixHierarchy.coarsest()).getmat());
+
+    num_levels = dune_amg->levels();
+    level_sizes.resize(num_levels);
+    diagIndices.resize(num_levels);
+
+    Amatrices.reserve(num_levels);
+    Rmatrices.reserve(num_levels - 1);  // coarsest level does not need one
+    invDiags.reserve(num_levels);
+
+    Amatrices.clear();
+    invDiags.clear();
+
+    // matrixIter.dereference() returns MatrixAdapter
+    // matrixIter.dereference().getmat() returns BCRSMat
+    DuneAmg::ParallelMatrixHierarchy::ConstIterator matrixIter = matrixHierarchy.finest();
+    for(int level = 0; level < num_levels; ++matrixIter, ++level) {
+        const auto& A = matrixIter.dereference().getmat();
+        level_sizes[level] = A.N();
+        diagIndices[level].reserve(A.N());
+
+        // extract matrix A
+        Amatrices.emplace_back(A.N(), A.nonzeroes());
+        // contiguous copy is not possible
+        // std::copy(&(A[0][0][0][0]), &(A[0][0][0][0]) + A.nonzeroes(), Amatrices.back().nnzValues.data());
+        // also update diagonal indices if needed, level 0 is already filled in create_preconditioner()
+        int nnz_idx = 0;
+        const bool fillDiagIndices = diagIndices[level].empty();
+        for (typename DuneMat::const_iterator r = A.begin(); r != A.end(); ++r) {
+            for (auto c = r->begin(); c != r->end(); ++c) {
+                Amatrices.back().nnzValues[nnz_idx] = A[r.index()][c.index()];
+                if (fillDiagIndices && r.index() == c.index()) {
+                    diagIndices[level].emplace_back(nnz_idx);
+                }
+                nnz_idx++;
+            }
+        }
+
+        Opm::BdaBridge<DuneMat, DuneVec, 1>::copySparsityPatternFromISTL(A, Amatrices.back().rowPointers, Amatrices.back().colIndices);
+
+        // compute inverse diagonal values for current level
+        invDiags.emplace_back(A.N());
+        for (unsigned int row = 0; row < A.N(); ++row) {
+            invDiags.back()[row] = 1 / Amatrices.back().nnzValues[diagIndices[level][row]];
+        }
+    }
+}
+
+
+template <unsigned int block_size>
+void CPR<block_size>::analyzeAggregateMaps() {
+
+    PcolIndices.resize(num_levels - 1);
+    Rmatrices.clear();
+
+    const DuneAmg::AggregatesMapList& aggregatesMaps = dune_amg->aggregatesMaps();
+
+    DuneAmg::AggregatesMapList::const_iterator mapIter = aggregatesMaps.begin();
+    for(int level = 0; level < num_levels - 1; ++mapIter, ++level) {
+        DuneAmg::AggregatesMap *map = *mapIter;
+
+        Rmatrices.emplace_back(level_sizes[level+1], level_sizes[level], level_sizes[level]);
+        std::fill(Rmatrices.back().nnzValues.begin(), Rmatrices.back().nnzValues.end(), 1.0);
+
+        // get indices for each row of P and R
+        std::vector<std::vector<unsigned> > indicesR(level_sizes[level+1]);
+        PcolIndices[level].resize(level_sizes[level]);
+
+        using AggregateIterator = DuneAmg::AggregatesMap::const_iterator;
+        for(AggregateIterator ai = map->begin(); ai != map->end(); ++ai){
+            if (*ai != DuneAmg::AggregatesMap::ISOLATED) {
+                const long int diff = ai - map->begin();
+                PcolIndices[level][diff] = *ai;
+                indicesR[*ai].emplace_back(diff);
+            }
+        }
+
+        int col_idx = 0;
+        // set sparsity pattern of R
+        Rmatrices.back().rowPointers[0] = 0;
+        for (unsigned int i = 0; i < indicesR.size(); ++i) {
+            Rmatrices.back().rowPointers[i + 1] = Rmatrices.back().rowPointers[i] + indicesR[i].size();
+            for (auto it = indicesR[i].begin(); it != indicesR[i].end(); ++it) {
+                Rmatrices.back().colIndices[col_idx++] = *it;
+            }
+        }
+    }
+}
+
+
+template <unsigned int block_size>
+void CPR<block_size>::amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &x) {
+    OpenclMatrix *A = &d_Amatrices[level];
+    OpenclMatrix *R = &d_Rmatrices[level];
+    int Ncur = A->Nb;
+
+    if (level == num_levels - 1) {
+        // solve coarsest level
+        std::vector<double> h_y(Ncur), h_x(Ncur, 0);
+
+        events.resize(1);
+        err = queue->enqueueReadBuffer(y, CL_FALSE, 0, sizeof(double) * Ncur, h_y.data(), nullptr, &events[0]);
+        cl::WaitForEvents(events);
+        events.clear();
+        if (err != CL_SUCCESS) {
+            // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
+            OPM_THROW(std::logic_error, "CPR OpenCL enqueueReadBuffer error");
+        }
+
+        // solve coarsest level using umfpack
+        umfpack.apply(h_x.data(), h_y.data());
+
+        events.resize(1);
+        err = queue->enqueueWriteBuffer(x, CL_FALSE, 0, sizeof(double) * Ncur, h_x.data(), nullptr, &events[0]);
+        cl::WaitForEvents(events);
+        events.clear();
+        if (err != CL_SUCCESS) {
+            // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
+            OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
+        }
+        return;
+    }
+    int Nnext = d_Amatrices[level+1].Nb;
+
+    cl::Buffer& t = d_t[level];
+    cl::Buffer& f = d_f[level];
+    cl::Buffer& u = d_u[level]; // u was 0-initialized earlier
+
+    // presmooth
+    double jacobi_damping = 0.65; // default value in amgcl: 0.72
+    for (unsigned i = 0; i < num_pre_smooth_steps; ++i){
+        OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
+        OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
+    }
+
+    // move to coarser level
+    OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
+    OpenclKernels::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true);
+    amg_cycle_gpu(level + 1, f, u);
+    OpenclKernels::prolongate_vector(u, x, d_PcolIndices[level], Ncur);
+
+    // postsmooth
+    for (unsigned i = 0; i < num_post_smooth_steps; ++i){
+        OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
+        OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
+    }
+}
+
+
+// x = prec(y)
+template <unsigned int block_size>
+void CPR<block_size>::apply_amg(const cl::Buffer& y, cl::Buffer& x) {
+    // 0-initialize u and x vectors
+    events.resize(d_u.size() + 1);
+    err = queue->enqueueFillBuffer(*d_coarse_x, 0, 0, sizeof(double) * Nb, nullptr, &events[0]);
+    for (unsigned int i = 0; i < d_u.size(); ++i) {
+        err |= queue->enqueueFillBuffer(d_u[i], 0, 0, sizeof(double) * Rmatrices[i].N, nullptr, &events[i + 1]);
+    }
+    cl::WaitForEvents(events);
+    events.clear();
+    if (err != CL_SUCCESS) {
+        // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
+        OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
+    }
+
+    OpenclKernels::residual(d_mat->nnzValues, d_mat->colIndices, d_mat->rowPointers, x, y, *d_rs, Nb, block_size);
+    OpenclKernels::full_to_pressure_restriction(*d_rs, *d_weights, *d_coarse_y, Nb);
+
+    amg_cycle_gpu(0, *d_coarse_y, *d_coarse_x);
+
+    OpenclKernels::add_coarse_pressure_correction(*d_coarse_x, x, pressure_idx, Nb);
+}
+
+template <unsigned int block_size>
+void CPR<block_size>::apply(const cl::Buffer& y, cl::Buffer& x) {
+    Dune::Timer t_bilu0;
+    bilu0->apply(y, x);
+    if (verbosity >= 4) {
+        std::ostringstream out;
+        out << "CPR apply bilu0(): " << t_bilu0.stop() << " s";
+        OpmLog::info(out.str());
+    }
+
+    Dune::Timer t_amg;
+    apply_amg(y, x);
+    if (verbosity >= 4) {
+        std::ostringstream out;
+        out << "CPR apply amg(): " << t_amg.stop() << " s";
+        OpmLog::info(out.str());
+    }
+}
+
+
+#define INSTANTIATE_BDA_FUNCTIONS(n)  \
+template class CPR<n>;
+
+INSTANTIATE_BDA_FUNCTIONS(1);
+INSTANTIATE_BDA_FUNCTIONS(2);
+INSTANTIATE_BDA_FUNCTIONS(3);
+INSTANTIATE_BDA_FUNCTIONS(4);
+INSTANTIATE_BDA_FUNCTIONS(5);
+INSTANTIATE_BDA_FUNCTIONS(6);
+
+#undef INSTANTIATE_BDA_FUNCTIONS
+
+} // namespace Accelerator
+} // namespace Opm
+
+
--- a/opm/simulators/linalg/bda/CPR.hpp
+++ b/opm/simulators/linalg/bda/CPR.hpp
@@ -0,0 +1,163 @@
+/*
+  Copyright 2021 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef OPM_CPR_HPP
+#define OPM_CPR_HPP
+
+#include <mutex>
+
+#include <dune/istl/paamg/matrixhierarchy.hh>
+#include <dune/istl/umfpack.hh>
+
+#include <opm/simulators/linalg/bda/BILU0.hpp>
+#include <opm/simulators/linalg/bda/opencl.hpp>
+#include <opm/simulators/linalg/bda/Matrix.hpp>
+#include <opm/simulators/linalg/bda/OpenclMatrix.hpp>
+#include <opm/simulators/linalg/bda/ILUReorder.hpp>
+#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
+
+#include <opm/simulators/linalg/bda/openclKernels.hpp>
+#include <opm/simulators/linalg/bda/ChowPatelIlu.hpp>
+#include <opm/simulators/linalg/bda/openclSolverBackend.hpp>
+
+namespace Opm
+{
+namespace Accelerator
+{
+
+template <unsigned int block_size>
+class openclSolverBackend;
+
+class BlockedMatrix;
+
+/// This class implements a Constrained Pressure Residual (CPR) preconditioner
+template <unsigned int block_size>
+class CPR : public Preconditioner<block_size>
+{
+    typedef Preconditioner<block_size> Base;
+
+    using Base::N;
+    using Base::Nb;
+    using Base::nnz;
+    using Base::nnzb;
+    using Base::verbosity;
+    using Base::context;
+    using Base::queue;
+    using Base::events;
+    using Base::err;
+
+private:
+    int num_levels;
+    std::vector<double> weights, coarse_vals, coarse_x, coarse_y;
+    std::vector<Matrix> Amatrices, Rmatrices; // scalar matrices that represent the AMG hierarchy
+    std::vector<OpenclMatrix> d_Amatrices, d_Rmatrices; // scalar matrices that represent the AMG hierarchy
+    std::vector<std::vector<int> > PcolIndices; // prolongation does not need a full matrix, only store colIndices
+    std::vector<cl::Buffer> d_PcolIndices;
+    std::vector<std::vector<double> > invDiags; // inverse of diagonal of Amatrices
+    std::vector<cl::Buffer> d_invDiags;
+    std::vector<cl::Buffer> d_t, d_f, d_u; // intermediate vectors used during amg cycle
+    std::unique_ptr<cl::Buffer> d_rs;      // use before extracting the pressure
+    std::unique_ptr<cl::Buffer> d_weights; // the quasiimpes weights, used to extract pressure
+    std::unique_ptr<OpenclMatrix> d_mat;   // stores blocked matrix
+    std::unique_ptr<cl::Buffer> d_coarse_y, d_coarse_x; // stores the scalar vectors
+    std::once_flag opencl_buffers_allocated;  // only allocate OpenCL Buffers once
+
+    std::unique_ptr<BILU0<block_size> > bilu0;                    // Blocked ILU0 preconditioner
+    BlockedMatrix *mat = nullptr;    // input matrix, blocked
+
+    using DuneMat = Dune::BCRSMatrix<Dune::FieldMatrix<double, 1, 1> >;
+    using DuneVec = Dune::BlockVector<Dune::FieldVector<double, 1> >;
+    using MatrixOperator = Dune::MatrixAdapter<DuneMat, DuneVec, DuneVec>;
+    using DuneAmg = Dune::Amg::MatrixHierarchy<MatrixOperator, Dune::Amg::SequentialInformation>;
+    std::unique_ptr<DuneAmg> dune_amg;
+    std::unique_ptr<DuneMat> dune_coarse;       // extracted pressure matrix, finest level in AMG hierarchy
+    std::shared_ptr<MatrixOperator> dune_op;    // operator, input to Dune AMG
+    std::vector<int> level_sizes;               // size of each level in the AMG hierarchy
+    std::vector<std::vector<int> > diagIndices; // index of diagonal value for each level
+    Dune::UMFPack<DuneMat> umfpack;             // dune/istl/umfpack object used to solve the coarsest level of AMG
+    bool always_recalculate_aggregates = false; // OPM always reuses the aggregates by default
+    bool recalculate_aggregates = true;         // only rerecalculate if true
+    const int pressure_idx = 1;                 // hardcoded to mimic OPM
+    unsigned num_pre_smooth_steps;              // number of Jacobi smooth steps before restriction
+    unsigned num_post_smooth_steps;             // number of Jacobi smooth steps after prolongation
+
+    std::unique_ptr<openclSolverBackend<1> > coarse_solver; // coarse solver is scalar
+    ILUReorder opencl_ilu_reorder;                          // reordering strategy for ILU0 in coarse solver
+
+    // Analyze the AMG hierarchy build by Dune
+    void analyzeHierarchy();
+
+    // Analyze the aggregateMaps from the AMG hierarchy
+    // These can be reused, so only use when recalculate_aggregates is true
+    void analyzeAggregateMaps();
+
+    // Initialize and allocate matrices and vectors
+    void init_opencl_buffers();
+
+    // Copy matrices and vectors to GPU
+    void opencl_upload();
+
+    // apply pressure correction to vector
+    void apply_amg(const cl::Buffer& y, cl::Buffer& x);
+
+    void amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &x);
+
+    void create_preconditioner_amg(BlockedMatrix *mat);
+
+public:
+
+    CPR(int verbosity, ILUReorder opencl_ilu_reorder);
+
+    bool analyze_matrix(BlockedMatrix *mat) override;
+
+    // set own Opencl variables, but also that of the bilu0 preconditioner
+    void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue) override;
+
+    // applies blocked ilu0
+    // also applies amg for pressure component
+    void apply(const cl::Buffer& y, cl::Buffer& x) override;
+
+    bool create_preconditioner(BlockedMatrix *mat) override;
+
+    int* getToOrder() override
+    {
+        return bilu0->getToOrder();
+    }
+
+    int* getFromOrder() override
+    {
+        return bilu0->getFromOrder();
+    }
+
+    BlockedMatrix* getRMat() override
+    {
+        return bilu0->getRMat();
+    }
+};
+
+// solve A^T * x = b
+// A should represent a 3x3 matrix
+// x and b are vectors with 3 elements
+void solve_transposed_3x3(const double *A, const double *b, double *x);
+
+} // namespace Accelerator
+} // namespace Opm
+
+#endif
+
--- a/opm/simulators/linalg/bda/ChowPatelIlu.cpp
+++ b/opm/simulators/linalg/bda/ChowPatelIlu.cpp
@@ -482,7 +482,7 @@ __kernel void chow_patel_ilu_sweep(
 template <unsigned int block_size>
 void ChowPatelIlu<block_size>::decomposition(
    cl::CommandQueue *queue, [[maybe_unused]] cl::Context *context,
-    BlockedMatrix<block_size> *LUmat, BlockedMatrix<block_size> *Lmat, BlockedMatrix<block_size> *Umat,
+    BlockedMatrix *LUmat, BlockedMatrix *Lmat, BlockedMatrix *Umat,
    double *invDiagVals, std::vector<int>& diagIndex,
    cl::Buffer& d_diagIndex, cl::Buffer& d_invDiagVals,
    cl::Buffer& d_Lvals, cl::Buffer& d_Lcols, cl::Buffer& d_Lrows,
@@ -569,7 +569,7 @@ void ChowPatelIlu<block_size>::decomposition(
            } else {
                Lmat->colIndices[num_blocks_L] = j;
                // multiply block of L with corresponding diag block
-                blockMult<bs>(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, Lmat->nnzValues + num_blocks_L * bs * bs);
+                blockMult(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, Lmat->nnzValues + num_blocks_L * bs * bs, bs);
                num_blocks_L++;
            }
        }
@@ -671,7 +671,7 @@ void ChowPatelIlu<block_size>::decomposition(
                        }
                        if (colL == rowU2) {
                            // Aij -= (Lik * Ukj)
-                            blockMultSub<bs>(&aij[0], Lmat->nnzValues + jk * bs * bs, Ut->nnzValues + k * bs * bs);
+                            blockMultSub(&aij[0], Lmat->nnzValues + jk * bs * bs, Ut->nnzValues + k * bs * bs, bs);
                        }
                    }
                }
@@ -713,7 +713,7 @@ void ChowPatelIlu<block_size>::decomposition(

                    if (rowU == colL) {
                        // Aij -= (Lik * Ukj)
-                        blockMultSub<bs>(&aij[0], Lmat->nnzValues + k * bs * bs , Ut->nnzValues + jk * bs * bs);
+                        blockMultSub(&aij[0], Lmat->nnzValues + k * bs * bs , Ut->nnzValues + jk * bs * bs, bs);
                    }
                }

@@ -721,7 +721,7 @@ void ChowPatelIlu<block_size>::decomposition(
                double ujj[bs*bs];
                inverter(Ut->nnzValues + (Ut->rowPointers[j+1] - 1) * bs * bs, &ujj[0]);
                // Lij_new = (Aij - sum) / Ujj
-                blockMult<bs>(&aij[0], &ujj[0], Ltmp + ij * bs * bs);
+                blockMult(&aij[0], &ujj[0], Ltmp + ij * bs * bs, bs);
            }
        }
        // 1st sweep writes to Ltmp
--- a/opm/simulators/linalg/bda/ChowPatelIlu.hpp
+++ b/opm/simulators/linalg/bda/ChowPatelIlu.hpp
@@ -82,7 +82,7 @@ public:
    /// This function calls gpu_decomposition() if CHOW_PATEL_GPU is set
    void decomposition(
        cl::CommandQueue *queue, cl::Context *context,
-        BlockedMatrix<block_size> *LUmat, BlockedMatrix<block_size> *Lmat, BlockedMatrix<block_size> *Umat,
+        BlockedMatrix *LUmat, BlockedMatrix *Lmat, BlockedMatrix *Umat,
        double *invDiagVals, std::vector<int>& diagIndex,
        cl::Buffer& d_diagIndex, cl::Buffer& d_invDiagVals,
        cl::Buffer& d_Lvals, cl::Buffer& d_Lcols, cl::Buffer& d_Lrows,
--- a/opm/simulators/linalg/bda/FPGABILU0.cpp
+++ b/opm/simulators/linalg/bda/FPGABILU0.cpp
@@ -59,7 +59,7 @@ FPGABILU0<block_size>::~FPGABILU0()


 template <unsigned int block_size>
-bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)
+bool FPGABILU0<block_size>::init(BlockedMatrix *mat)
 {
    const unsigned int bs = block_size;

@@ -91,8 +91,8 @@ bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)
    }

    Timer t_analysis;
-    rMat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
-    LUMat = std::make_unique<BlockedMatrix<block_size> >(*rMat);
+    rMat = std::make_shared<BlockedMatrix>(mat->Nb, mat->nnzbs, block_size);
+    LUMat = std::make_unique<BlockedMatrix>(*rMat);
    std::ostringstream out;
    if (level_scheduling) {
        out << "FPGABILU0 reordering strategy: " << "level_scheduling\n";
@@ -117,7 +117,7 @@ bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)
    int NROffsetSize = 0, LNROffsetSize = 0, UNROffsetSize = 0;
    int blockDiagSize = 0;
    // This reordering is needed here only to te result can be used to calculate worst-case scenario array sizes
-    reorderBlockedMatrixByPattern<bs>(mat, toOrder.data(), fromOrder.data(), rMat.get());
+    reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rMat.get());
    int doneRows = 0;
    for (int c = 0; c < numColors; c++) {
        for (int i = doneRows; i < doneRows + rowsPerColor[c]; i++) {
@@ -187,8 +187,8 @@ bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)

    diagIndex.resize(mat->Nb, 0);
    invDiagVals = new double[mat->Nb * bs * bs];
-    LMat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
-    UMat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
+    LMat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
+    UMat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
    resultPointers[0] = (void *) colorSizes.data();
    resultPointers[1] = (void *) PIndicesAddr.data();
    resultPointers[2] = (void *) nnzValues.data();
@@ -232,11 +232,11 @@ bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)


 template <unsigned int block_size>
-bool FPGABILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
+bool FPGABILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
 {
    const unsigned int bs = block_size;
    Timer t_reorder;
-    reorderBlockedMatrixByPattern<bs>(mat, toOrder.data(), fromOrder.data(), rMat.get());
+    reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rMat.get());

    if (verbosity >= 3) {
        std::ostringstream out;
@@ -286,7 +286,7 @@ bool FPGABILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat

            LSize++;
            // calculate the pivot of this row
-            blockMult<bs>(LUMat->nnzValues + ij * bs * bs, invDiagVals + j * bs * bs, &pivot[0]);
+            blockMult(LUMat->nnzValues + ij * bs * bs, invDiagVals + j * bs * bs, &pivot[0], bs);

            memcpy(LUMat->nnzValues + ij * bs * bs, &pivot[0], sizeof(double) * bs * bs);

@@ -296,7 +296,7 @@ bool FPGABILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat
            // subtract that row scaled by the pivot from this row.
            while (ik < iRowEnd && jk < jRowEnd) {
                if (LUMat->colIndices[ik] == LUMat->colIndices[jk]) {
-                    blockMultSub<bs>(LUMat->nnzValues + ik * bs * bs, pivot, LUMat->nnzValues + jk * bs * bs);
+                    blockMultSub(LUMat->nnzValues + ik * bs * bs, pivot, LUMat->nnzValues + jk * bs * bs, bs);
                    ik++;
                    jk++;
                } else {
@@ -402,8 +402,8 @@ bool FPGABILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat
 #define INSTANTIATE_BDA_FUNCTIONS(n)                                    \
 template FPGABILU0<n>::FPGABILU0(ILUReorder, int, int, int, int, int);  \
 template FPGABILU0<n>::~FPGABILU0();                                    \
-template bool FPGABILU0<n>::init(BlockedMatrix<n> *);                   \
-template bool FPGABILU0<n>::create_preconditioner(BlockedMatrix<n> *);  \
+template bool FPGABILU0<n>::init(BlockedMatrix*);                       \
+template bool FPGABILU0<n>::create_preconditioner(BlockedMatrix *);

 INSTANTIATE_BDA_FUNCTIONS(1);
 INSTANTIATE_BDA_FUNCTIONS(2);
--- a/opm/simulators/linalg/bda/FPGABILU0.hpp
+++ b/opm/simulators/linalg/bda/FPGABILU0.hpp
@@ -45,8 +45,8 @@ private:
    int Nb;      // number of blockrows of the matrix
    int nnz;     // number of nonzeroes of the matrix (scalar)
    int nnzbs;   // number of blocks of the matrix
-    std::unique_ptr<BlockedMatrix<block_size> > LMat = nullptr, UMat = nullptr, LUMat = nullptr;
-    std::shared_ptr<BlockedMatrix<block_size> > rMat = nullptr; // reordered mat
+    std::unique_ptr<BlockedMatrix> LMat = nullptr, UMat = nullptr, LUMat = nullptr;
+    std::shared_ptr<BlockedMatrix> rMat = nullptr; // reordered mat
    double *invDiagVals = nullptr;
    std::vector<int> diagIndex;
    std::vector<int> toOrder, fromOrder;
@@ -82,10 +82,10 @@ public:
    ~FPGABILU0();

    // analysis (optional)
-    bool init(BlockedMatrix<block_size> *mat);
+    bool init(BlockedMatrix *mat);

    // ilu_decomposition
-    bool create_preconditioner(BlockedMatrix<block_size> *mat);
+    bool create_preconditioner(BlockedMatrix *mat);

    int* getToOrder()
    {
@@ -97,7 +97,7 @@ public:
        return fromOrder.data();
    }

-    BlockedMatrix<block_size>* getRMat()
+    BlockedMatrix* getRMat()
    {
        return rMat.get();
    }
--- a/opm/simulators/linalg/bda/FPGASolverBackend.cpp
+++ b/opm/simulators/linalg/bda/FPGASolverBackend.cpp
@@ -20,6 +20,7 @@
 #include <config.h>

 #include <cmath>
+#include <cstring>

 #include <opm/common/OpmLog/OpmLog.hpp>
 #include <opm/material/common/Unused.hpp>
@@ -261,7 +262,7 @@ void FpgaSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double

    // allocate host memory for matrices and vectors
    // actual data for mat points to std::vector.data() in ISTLSolverEbos, so no alloc/free here
-    mat.reset(new BlockedMatrix<block_size>(N_ / block_size, nnz_ / block_size / block_size, vals, cols, rows));
+    mat.reset(new BlockedMatrix(N_ / block_size, nnz_ / block_size / block_size, block_size, vals, cols, rows));

    std::ostringstream oss;
    oss << "Initializing FPGA data, matrix size: " << this->N << " blocks, nnz: " << this->nnzb << " blocks, " << \
--- a/opm/simulators/linalg/bda/FPGASolverBackend.hpp
+++ b/opm/simulators/linalg/bda/FPGASolverBackend.hpp
@@ -56,8 +56,8 @@ private:
    bool level_scheduling = false;

    // LUMat will shallow copy rowPointers and colIndices of mat/rMat
-    std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr;
-    BlockedMatrix<block_size> *rMat = nullptr;
+    std::unique_ptr<BlockedMatrix> mat = nullptr;
+    BlockedMatrix *rMat = nullptr;
    std::unique_ptr<Preconditioner> prec = nullptr;

    // vectors with data processed by the preconditioner (input to the kernel)
--- a/opm/simulators/linalg/bda/FPGAMatrix.cpp
+++ b/opm/simulators/linalg/bda/FPGAMatrix.cpp
@@ -17,10 +17,13 @@
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */

+#include <config.h>
+
 #include <opm/common/OpmLog/OpmLog.hpp>
 #include <opm/common/ErrorMacros.hpp>

-#include <opm/simulators/linalg/bda/FPGAMatrix.hpp>
+#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
+#include <opm/simulators/linalg/bda/Matrix.hpp>
 #include <opm/simulators/linalg/bda/FPGAUtils.hpp>

 namespace Opm
@@ -57,6 +60,7 @@ void sortRow(int *colIndices, double *data, int left, int right) {

 }

+#if HAVE_FPGA
 /*
 * Write all data used by the VHDL testbenches to raw data arrays. The arrays are as follows:
 * - The "colorSizes" array, which first contains the number of rows, columns, non-zero values
@@ -247,6 +251,8 @@ int Matrix::toRDF(int numColors, std::vector<int>& nodesPerColor,

    return 0;
 }
+#endif
+

 } // namespace Accelerator
 } // namespace Opm
--- a/opm/simulators/linalg/bda/FPGAMatrix.hpp
+++ b/opm/simulators/linalg/bda/FPGAMatrix.hpp
@@ -17,8 +17,8 @@
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef FPGA_MATRIX_HEADER_INCLUDED
-#define FPGA_MATRIX_HEADER_INCLUDED
+#ifndef OPM_MATRIX_HEADER_INCLUDED
+#define OPM_MATRIX_HEADER_INCLUDED

 #include <vector>

@@ -33,25 +33,30 @@ class Matrix {

 public:

-    /// Allocate Matrix and data arrays with given sizes
+    /// Allocate square Matrix and data arrays with given sizes
    /// \param[in] N               number of rows
    /// \param[in] nnzs            number of nonzeros
    Matrix(int N_, int nnzs_)
-    : nnzValues(new double[nnzs_]),
-      colIndices(new int[nnzs_]),
-      rowPointers(new int[N_+1]),
-      N(N_),
+    : N(N_),
+      M(N_),
      nnzs(nnzs_)
-    {}
-
-    /// All constructors allocate new memory, so always delete here
-    ~Matrix(){
-        delete[] nnzValues;
-        delete[] colIndices;
-        delete[] rowPointers;
+    {
+        nnzValues.resize(nnzs);
+        colIndices.resize(nnzs);
+        rowPointers.resize(N+1);
    }

+    /// Allocate rectangular Matrix and data arrays with given sizes
+    /// \param[in] N               number of rows
+    /// \param[in] M               number of columns
+    /// \param[in] nnzs            number of nonzeros
+    Matrix(int N_, int M_, int nnzs_)
+    : Matrix(N_, nnzs_)
+    {
+        M = M_;
+    }

+#if HAVE_FPGA
    /// Converts this matrix to the dataformat used by the FPGA.
    /// The FPGA uses a new data format called CSRO (Compressed Sparse Row Offset).
    /// The purpose of this format is to allow the data to be streamable.
@@ -71,11 +76,12 @@ public:
    int toRDF(int numColors, std::vector<int>& nodesPerColor,
        std::vector<std::vector<int> >& colIndicesInColor, int nnzsPerRowLimit, 
        std::vector<std::vector<double> >& ubNnzValues, short int *ubColIndices, int *nnzValsSizes, unsigned char *NROffsets, int *colorSizes);
+#endif

-    double *nnzValues;
-    int *colIndices;
-    int *rowPointers;
-    int N;
+    std::vector<double> nnzValues;
+    std::vector<int> colIndices;
+    std::vector<int> rowPointers;
+    int N, M;
    int nnzs;
 };

@@ -84,4 +90,4 @@ void sortRow(int *colIndices, double *data, int left, int right);
 } // namespace Accelerator
 } // namespace Opm

-#endif // FPGA_MATRIX_HEADER_INCLUDED
+#endif // OPM_MATRIX_HEADER_INCLUDED
--- a/opm/simulators/linalg/bda/OpenclMatrix.cpp
+++ b/opm/simulators/linalg/bda/OpenclMatrix.cpp
@@ -0,0 +1,66 @@
+/*
+  Copyright 2021 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+
+#include <opm/simulators/linalg/bda/OpenclMatrix.hpp>
+#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
+#include <opm/simulators/linalg/bda/Matrix.hpp>
+
+namespace Opm
+{
+namespace Accelerator
+{
+
+void OpenclMatrix::upload(cl::CommandQueue *queue, double *vals, int *cols, int *rows) {
+    std::vector<cl::Event> events(3);
+
+    cl_int err = queue->enqueueWriteBuffer(nnzValues, CL_FALSE, 0, sizeof(double) * block_size * block_size * nnzbs, vals, nullptr, &events[0]);
+    err |= queue->enqueueWriteBuffer(colIndices, CL_FALSE, 0, sizeof(int) * nnzbs, cols, nullptr, &events[1]);
+    err |= queue->enqueueWriteBuffer(rowPointers, CL_FALSE, 0, sizeof(int) * (Nb + 1), rows, nullptr, &events[2]);
+
+    cl::WaitForEvents(events);
+    events.clear();
+    if (err != CL_SUCCESS) {
+        // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
+        OPM_THROW(std::logic_error, "OpenclMatrix OpenCL enqueueWriteBuffer error");
+    }
+}
+
+void OpenclMatrix::upload(cl::CommandQueue *queue, Matrix *matrix) {
+    if (block_size != 1) {
+        OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to OpenclMatrix with different block_size");
+    }
+
+    upload(queue, matrix->nnzValues.data(), matrix->colIndices.data(), matrix->rowPointers.data());
+}
+
+void OpenclMatrix::upload(cl::CommandQueue *queue, BlockedMatrix *matrix) {
+    if (matrix->block_size != block_size) {
+        OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to OpenclMatrix with different block_size");
+    }
+
+    upload(queue, matrix->nnzValues, matrix->colIndices, matrix->rowPointers);
+}
+
+} // namespace Accelerator
+} // namespace Opm
--- a/opm/simulators/linalg/bda/OpenclMatrix.hpp
+++ b/opm/simulators/linalg/bda/OpenclMatrix.hpp
@@ -0,0 +1,66 @@
+/*
+  Copyright 2021 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef OPM_OPENCLMATRIX_HEADER_INCLUDED
+#define OPM_OPENCLMATRIX_HEADER_INCLUDED
+
+#include <vector>
+
+#include <opm/simulators/linalg/bda/opencl.hpp>
+
+namespace Opm
+{
+namespace Accelerator
+{
+
+class Matrix;
+class BlockedMatrix;
+
+/// This struct resembles a csr matrix, only doubles are supported
+/// The matrix data is stored in OpenCL Buffers
+class OpenclMatrix {
+public:
+
+    OpenclMatrix(cl::Context *context, int Nb_, int Mb_, int nnzbs_, unsigned int block_size_)
+    : Nb(Nb_),
+      Mb(Mb_),
+      nnzbs(nnzbs_),
+      block_size(block_size_)
+    {
+        nnzValues = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * block_size * block_size * nnzbs);
+        colIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzbs);
+        rowPointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
+    }
+
+    void upload(cl::CommandQueue *queue, double *vals, int *cols, int *rows);
+    void upload(cl::CommandQueue *queue, Matrix *matrix);
+    void upload(cl::CommandQueue *queue, BlockedMatrix *matrix);
+
+    cl::Buffer nnzValues;
+    cl::Buffer colIndices;
+    cl::Buffer rowPointers;
+    int Nb, Mb;
+    int nnzbs;
+    unsigned int block_size;
+};
+
+} // namespace Accelerator
+} // namespace Opm
+
+#endif // OPM_OPENCLMATRIX_HEADER_INCLUDED
--- a/opm/simulators/linalg/bda/Reorder.cpp
+++ b/opm/simulators/linalg/bda/Reorder.cpp
@@ -177,10 +177,10 @@ int colorBlockedNodes(int rows, const int *CSRRowPointers, const int *CSRColIndi
 /* Reorder a matrix by a specified input order.
 * Both a to order array, which contains for every node from the old matrix where it will move in the new matrix,
 * and the from order, which contains for every node in the new matrix where it came from in the old matrix.*/
+void reorderBlockedMatrixByPattern(BlockedMatrix *mat, int *toOrder, int *fromOrder, BlockedMatrix *rmat) {
+    assert(mat->block_size == rmat->block_size);

-template <unsigned int block_size>
-void reorderBlockedMatrixByPattern(BlockedMatrix<block_size> *mat, int *toOrder, int *fromOrder, BlockedMatrix<block_size> *rmat) {
-    const unsigned int bs = block_size;
+    const unsigned int bs = mat->block_size;
    int rIndex = 0;
    int i, k;
    unsigned int j;
@@ -204,7 +204,7 @@ void reorderBlockedMatrixByPattern(BlockedMatrix<block_size> *mat, int *toOrder,
    }
    // re-sort the column indices of every row.
    for (i = 0; i < mat->Nb; i++) {
-        sortBlockedRow<bs>(rmat->colIndices, rmat->nnzValues, rmat->rowPointers[i], rmat->rowPointers[i + 1] - 1);
+        sortBlockedRow(rmat->colIndices, rmat->nnzValues, rmat->rowPointers[i], rmat->rowPointers[i + 1] - 1, bs);
    }
 }

@@ -370,7 +370,6 @@ void csrPatternToCsc(int *CSRColIndices, int *CSRRowPointers, int *CSCRowIndices

 #define INSTANTIATE_BDA_FUNCTIONS(n)                                                                                                            \
 template int colorBlockedNodes<n>(int, const int *, const int *, const int *, const int *, std::vector<int>&, int, int);                        \
-template void reorderBlockedMatrixByPattern<n>(BlockedMatrix<n> *, int *, int *, BlockedMatrix<n> *);                                           \
 template void reorderBlockedVectorByPattern<n>(int, double*, int*, double*);                                                                    \
 template void findGraphColoring<n>(const int *, const int *, const int *, const int *, int, int, int, int *, int *, int *, std::vector<int>&);  \

--- a/opm/simulators/linalg/bda/Reorder.hpp
+++ b/opm/simulators/linalg/bda/Reorder.hpp
@@ -52,9 +52,8 @@ int colorBlockedNodes(int rows, const int *CSRRowPointers, const int *CSRColIndi
 /// \param[in] mat           matrix to be reordered
 /// \param[in] toOrder       reorder pattern that lists for each index in the original order, to which index in the new order it should be moved
 /// \param[in] fromOrder     reorder pattern that lists for each index in the new order, from which index in the original order it was moved
-/// \param[inout] rMat       reordered Matrix 
-template <unsigned int block_size>
-void reorderBlockedMatrixByPattern(BlockedMatrix<block_size> *mat, int *toOrder, int *fromOrder, BlockedMatrix<block_size> *rmat);
+/// \param[inout] rMat       reordered Matrix
+void reorderBlockedMatrixByPattern(BlockedMatrix *mat, int *toOrder, int *fromOrder, BlockedMatrix *rmat);

 /// Compute reorder mapping from the color that each node has received
 /// The toOrder, fromOrder and iters arrays must be allocated already
--- a/opm/simulators/linalg/bda/opencl.cpp
+++ b/opm/simulators/linalg/bda/opencl.cpp
@@ -21,6 +21,11 @@
 #include <opm/simulators/linalg/bda/opencl.hpp>
 #include <string>

+namespace Opm
+{
+namespace Accelerator
+{
+
 /// Translate OpenCL error codes to strings
 /// Integer - String combinations are defined in CL/cl.h
 /// \param[in] error     error code
@@ -95,3 +100,6 @@ std::string getErrorString(cl_int error)
        default: return "UNKNOWN_CL_CODE";
    }
 }
+
+} // namespace Accelerator
+} // namespace Opm
--- a/opm/simulators/linalg/bda/opencl.hpp
+++ b/opm/simulators/linalg/bda/opencl.hpp
@@ -33,7 +33,16 @@

 #include <string>

+
+namespace Opm
+{
+namespace Accelerator
+{
+
 /// Translate OpenCL error codes to strings
 /// Integer - String combinations are defined in CL/cl.h
 /// \param[in] error     error code
 std::string getErrorString(cl_int error);
+
+} // namespace Accelerator
+} // namespace Opm
--- a/opm/simulators/linalg/bda/opencl/Preconditioner.cpp
+++ b/opm/simulators/linalg/bda/opencl/Preconditioner.cpp
@@ -0,0 +1,68 @@
+/*
+  Copyright 2021 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+#include <memory>
+
+#include <opm/common/ErrorMacros.hpp>
+
+#include <opm/simulators/linalg/bda/BILU0.hpp>
+#include <opm/simulators/linalg/bda/CPR.hpp>
+#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
+
+namespace Opm
+{
+namespace Accelerator
+{
+
+
+template <unsigned int block_size>
+void Preconditioner<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
+    context = context_;
+    queue = queue_;
+}
+
+template <unsigned int block_size>
+std::unique_ptr<Preconditioner<block_size> > Preconditioner<block_size>::create(PreconditionerType type, int verbosity, ILUReorder opencl_ilu_reorder) {
+    if (type == PreconditionerType::BILU0) {
+        return std::make_unique<Opm::Accelerator::BILU0<block_size> >(opencl_ilu_reorder, verbosity);
+    } else if (type == PreconditionerType::CPR) {
+        return std::make_unique<Opm::Accelerator::CPR<block_size> >(verbosity, opencl_ilu_reorder);
+    } else {
+        OPM_THROW(std::logic_error, "Invalid PreconditionerType");
+    }
+}
+
+
+#define INSTANTIATE_BDA_FUNCTIONS(n)  \
+template std::unique_ptr<Preconditioner<n> > Preconditioner<n>::create(PreconditionerType, int, ILUReorder);  \
+template void Preconditioner<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&);
+
+INSTANTIATE_BDA_FUNCTIONS(1);
+INSTANTIATE_BDA_FUNCTIONS(2);
+INSTANTIATE_BDA_FUNCTIONS(3);
+INSTANTIATE_BDA_FUNCTIONS(4);
+INSTANTIATE_BDA_FUNCTIONS(5);
+INSTANTIATE_BDA_FUNCTIONS(6);
+
+#undef INSTANTIATE_BDA_FUNCTIONS
+
+} //namespace Accelerator
+} //namespace Opm
+
--- a/opm/simulators/linalg/bda/opencl/Preconditioner.hpp
+++ b/opm/simulators/linalg/bda/opencl/Preconditioner.hpp
@@ -0,0 +1,86 @@
+/*
+  Copyright 2021 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef OPM_PRECONDITIONER_HEADER_INCLUDED
+#define OPM_PRECONDITIONER_HEADER_INCLUDED
+
+#include <opm/simulators/linalg/bda/opencl.hpp>
+#include <opm/simulators/linalg/bda/ILUReorder.hpp>
+
+namespace Opm
+{
+namespace Accelerator
+{
+
+
+class BlockedMatrix;
+
+template <unsigned int block_size>
+class Preconditioner
+{
+
+protected:
+    int N = 0;       // number of rows of the matrix
+    int Nb = 0;      // number of blockrows of the matrix
+    int nnz = 0;     // number of nonzeroes of the matrix (scalar)
+    int nnzb = 0;    // number of blocks of the matrix
+    int verbosity = 0;
+
+    std::shared_ptr<cl::Context> context;
+    std::shared_ptr<cl::CommandQueue> queue;
+    std::vector<cl::Event> events;
+    cl_int err;
+
+    Preconditioner(int verbosity_) :
+    verbosity(verbosity_)
+    {};
+
+public:
+    enum PreconditionerType {
+        BILU0,
+        CPR
+    };
+
+    static std::unique_ptr<Preconditioner> create(PreconditionerType type, int verbosity, ILUReorder opencl_ilu_reorder);
+
+    // nested Preconditioners might need to override this
+    virtual void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue);
+
+    // apply preconditioner, x = prec(y)
+    virtual void apply(const cl::Buffer& y, cl::Buffer& x) = 0;
+
+    // analyze matrix, e.g. the sparsity pattern
+    // probably only called once
+    virtual bool analyze_matrix(BlockedMatrix *mat) = 0;
+
+    // create/update preconditioner, probably used every linear solve
+    virtual bool create_preconditioner(BlockedMatrix *mat) = 0;
+
+    // get reordering mappings
+    virtual int* getToOrder() = 0;
+    virtual int* getFromOrder() = 0;
+
+    // get reordered matrix
+    virtual BlockedMatrix* getRMat() = 0;
+};
+
+} //namespace Accelerator
+} //namespace Opm
+
+#endif
--- a/opm/simulators/linalg/bda/openclKernels.cpp
+++ b/opm/simulators/linalg/bda/openclKernels.cpp
@@ -22,6 +22,7 @@
 #include <sstream>

 #include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
 #include <dune/common/timer.hh>

 #include <opm/simulators/linalg/bda/openclKernels.hpp>
@@ -45,8 +46,16 @@ std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const u
 std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::norm_k;
 std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > OpenclKernels::axpy_k;
 std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > OpenclKernels::scale_k;
+std::unique_ptr<cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels::vmul_k;
 std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > OpenclKernels::custom_k;
-std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_blocked_k;
+std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels::full_to_pressure_restriction_k;
+std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > OpenclKernels::add_coarse_pressure_correction_k;
+std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > OpenclKernels::prolongate_vector_k;
+std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels::spmv_blocked_k;
+std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_k;
+std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_noreset_k;
+std::unique_ptr<residual_blocked_kernel_type> OpenclKernels::residual_blocked_k;
+std::unique_ptr<residual_kernel_type> OpenclKernels::residual_k;
 std::unique_ptr<ilu_apply1_kernel_type> OpenclKernels::ILU_apply1_k;
 std::unique_ptr<ilu_apply2_kernel_type> OpenclKernels::ILU_apply2_k;
 std::unique_ptr<stdwell_apply_kernel_type> OpenclKernels::stdwell_apply_k;
@@ -60,7 +69,7 @@ unsigned int ceilDivision(const unsigned int A, const unsigned int B)
    return A / B + (A % B > 0);
 }

-void add_kernel_string(cl::Program::Sources &sources, const std::string &source) {
+void add_kernel_source(cl::Program::Sources &sources, const std::string &source) {
    sources.emplace_back(source);
 }

@@ -75,33 +84,49 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
    verbosity = verbosity_;

    cl::Program::Sources sources;
-    const std::string& axpy_s = get_axpy_string();
-    add_kernel_string(sources, axpy_s);
-    const std::string& scale_s = get_scale_string();
-    add_kernel_string(sources, scale_s);
-    const std::string& dot_1_s = get_dot_1_string();
-    add_kernel_string(sources, dot_1_s);
-    const std::string& norm_s = get_norm_string();
-    add_kernel_string(sources, norm_s);
-    const std::string& custom_s = get_custom_string();
-    add_kernel_string(sources, custom_s);
-    const std::string& spmv_blocked_s = get_spmv_blocked_string();
-    add_kernel_string(sources, spmv_blocked_s);
+    const std::string& axpy_s = get_axpy_source();
+    add_kernel_source(sources, axpy_s);
+    const std::string& scale_s = get_scale_source();
+    add_kernel_source(sources, scale_s);
+    const std::string& vmul_s = get_vmul_source();
+    add_kernel_source(sources, vmul_s);
+    const std::string& dot_1_s = get_dot_1_source();
+    add_kernel_source(sources, dot_1_s);
+    const std::string& norm_s = get_norm_source();
+    add_kernel_source(sources, norm_s);
+    const std::string& custom_s = get_custom_source();
+    add_kernel_source(sources, custom_s);
+    const std::string& full_to_pressure_restriction_s = get_full_to_pressure_restriction_source();
+    add_kernel_source(sources, full_to_pressure_restriction_s);
+    const std::string& add_coarse_pressure_correction_s = get_add_coarse_pressure_correction_source();
+    add_kernel_source(sources, add_coarse_pressure_correction_s);
+    const std::string& prolongate_vector_s = get_prolongate_vector_source();
+    add_kernel_source(sources, prolongate_vector_s);
+    const std::string& spmv_blocked_s = get_blocked_matrix_operation_source(matrix_operation::spmv_op);
+    add_kernel_source(sources, spmv_blocked_s);
+    const std::string& spmv_s = get_matrix_operation_source(matrix_operation::spmv_op, true);
+    add_kernel_source(sources, spmv_s);
+    const std::string& spmv_noreset_s = get_matrix_operation_source(matrix_operation::spmv_op, false);
+    add_kernel_source(sources, spmv_noreset_s);
+    const std::string& residual_blocked_s = get_blocked_matrix_operation_source(matrix_operation::residual_op);
+    add_kernel_source(sources, residual_blocked_s);
+    const std::string& residual_s = get_matrix_operation_source(matrix_operation::residual_op);
+    add_kernel_source(sources, residual_s);
 #if CHOW_PATEL
    bool ilu_operate_on_full_matrix = false;
 #else
    bool ilu_operate_on_full_matrix = true;
 #endif
-    const std::string& ILU_apply1_s = get_ILU_apply1_string(ilu_operate_on_full_matrix);
-    add_kernel_string(sources, ILU_apply1_s);
-    const std::string& ILU_apply2_s = get_ILU_apply2_string(ilu_operate_on_full_matrix);
-    add_kernel_string(sources, ILU_apply2_s);
-    const std::string& stdwell_apply_s = get_stdwell_apply_string(true);
-    add_kernel_string(sources, stdwell_apply_s);
-    const std::string& stdwell_apply_no_reorder_s = get_stdwell_apply_string(false);
-    add_kernel_string(sources, stdwell_apply_no_reorder_s);
-    const std::string& ilu_decomp_s = get_ilu_decomp_string();
-    add_kernel_string(sources, ilu_decomp_s);
+    const std::string& ILU_apply1_s = get_ILU_apply1_source(ilu_operate_on_full_matrix);
+    add_kernel_source(sources, ILU_apply1_s);
+    const std::string& ILU_apply2_s = get_ILU_apply2_source(ilu_operate_on_full_matrix);
+    add_kernel_source(sources, ILU_apply2_s);
+    const std::string& stdwell_apply_s = get_stdwell_apply_source(true);
+    add_kernel_source(sources, stdwell_apply_s);
+    const std::string& stdwell_apply_no_reorder_s = get_stdwell_apply_source(false);
+    add_kernel_source(sources, stdwell_apply_no_reorder_s);
+    const std::string& ilu_decomp_s = get_ilu_decomp_source();
+    add_kernel_source(sources, ilu_decomp_s);

    cl::Program program = cl::Program(*context, sources);
    program.build(devices);
@@ -114,8 +139,16 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
    norm_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
    axpy_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
    scale_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, const unsigned int>(cl::Kernel(program, "scale")));
+    vmul_k.reset(new cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "vmul")));
    custom_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
-    spmv_blocked_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_blocked")));
+    full_to_pressure_restriction_k.reset(new cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "full_to_pressure_restriction")));
+    add_coarse_pressure_correction_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int>(cl::Kernel(program, "add_coarse_pressure_correction")));
+    prolongate_vector_k.reset(new cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int>(cl::Kernel(program, "prolongate_vector")));
+    spmv_blocked_k.reset(new spmv_blocked_kernel_type(cl::Kernel(program, "spmv_blocked")));
+    spmv_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv")));
+    spmv_noreset_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_noreset")));
+    residual_blocked_k.reset(new residual_blocked_kernel_type(cl::Kernel(program, "residual_blocked")));
+    residual_k.reset(new residual_kernel_type(cl::Kernel(program, "residual")));
    ILU_apply1_k.reset(new ilu_apply1_kernel_type(cl::Kernel(program, "ILU_apply1")));
    ILU_apply2_k.reset(new ilu_apply2_kernel_type(cl::Kernel(program, "ILU_apply2")));
    stdwell_apply_k.reset(new stdwell_apply_kernel_type(cl::Kernel(program, "stdwell_apply")));
@@ -219,6 +252,23 @@ void OpenclKernels::scale(cl::Buffer& in, const double a, int N)
    }
 }

+void OpenclKernels::vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
+{
+    const unsigned int work_group_size = 32;
+    const unsigned int num_work_groups = ceilDivision(N, work_group_size);
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+    Timer t_vmul;
+
+    cl::Event event = (*vmul_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), alpha, in1, in2, out, N);
+
+    if (verbosity >= 4) {
+        event.wait();
+        std::ostringstream oss;
+        oss << std::scientific << "OpenclKernels vmul() time: " << t_vmul.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
 void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N)
 {
    const unsigned int work_group_size = 32;
@@ -236,15 +286,75 @@ void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const do
    }
 }

-void OpenclKernels::spmv_blocked(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size)
+void OpenclKernels::full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb)
+{
+    const unsigned int work_group_size = 32;
+    const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+    Timer t;
+
+    cl::Event event = (*full_to_pressure_restriction_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), fine_y, weights, coarse_y, Nb);
+
+    if (verbosity >= 4) {
+        event.wait();
+        std::ostringstream oss;
+        oss << std::scientific << "OpenclKernels full_to_pressure_restriction() time: " << t.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
+void OpenclKernels::add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb)
+{
+    const unsigned int work_group_size = 32;
+    const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+    Timer t;
+
+    cl::Event event = (*add_coarse_pressure_correction_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), coarse_x, fine_x, pressure_idx, Nb);
+
+    if (verbosity >= 4) {
+        event.wait();
+        std::ostringstream oss;
+        oss << std::scientific << "OpenclKernels add_coarse_pressure_correction() time: " << t.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
+void OpenclKernels::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N)
+{
+    const unsigned int work_group_size = 32;
+    const unsigned int num_work_groups = ceilDivision(N, work_group_size);
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+    Timer t;
+
+    cl::Event event = (*prolongate_vector_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, cols, N);
+
+    if (verbosity >= 4) {
+        event.wait();
+        std::ostringstream oss;
+        oss << std::scientific << "OpenclKernels prolongate_vector() time: " << t.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
+void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset)
 {
    const unsigned int work_group_size = 32;
    const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
    const unsigned int total_work_items = num_work_groups * work_group_size;
    const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
    Timer t_spmv;
+    cl::Event event;

-    cl::Event event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
+    if (block_size > 1) {
+        event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
+    } else {
+        if (reset) {
+            event = (*spmv_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
+        } else {
+            event = (*spmv_noreset_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
+        }
+    }

    if (verbosity >= 4) {
        event.wait();
@@ -254,6 +364,29 @@ void OpenclKernels::spmv_blocked(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer&
    }
 }

+void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, const cl::Buffer& rhs, cl::Buffer& out, int Nb, unsigned int block_size)
+{
+    const unsigned int work_group_size = 32;
+    const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+    const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
+    Timer t_residual;
+    cl::Event event;
+
+    if (block_size > 1) {
+        event = (*residual_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, rhs, out, block_size, cl::Local(lmem_per_work_group));
+    } else {
+        event = (*residual_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, rhs, out, cl::Local(lmem_per_work_group));
+    }
+
+    if (verbosity >= 4) {
+        event.wait();
+        std::ostringstream oss;
+        oss << std::scientific << "OpenclKernels residual_blocked() time: " << t_residual.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+

 void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size)
 {
@@ -356,7 +489,7 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
 }


-    std::string OpenclKernels::get_axpy_string() {
+    std::string OpenclKernels::get_axpy_source() {
        return R"(
        __kernel void axpy(
            __global double *in,
@@ -377,7 +510,7 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe


    // scale vector with scalar
-    std::string OpenclKernels::get_scale_string() {
+    std::string OpenclKernels::get_scale_source() {
        return R"(
        __kernel void scale(
            __global double *vec,
@@ -395,9 +528,31 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
        )";
    }

+    // multiply vector with another vector and a scalar, element-wise
+    // add result to a third vector
+    std::string OpenclKernels::get_vmul_source() {
+        return R"(
+        __kernel void vmul(
+            const double alpha,
+            __global double const *in1,
+            __global double const *in2,
+            __global double *out,
+            const int N)
+        {
+            unsigned int NUM_THREADS = get_global_size(0);
+            int idx = get_global_id(0);
+
+            while(idx < N){
+                out[idx] += alpha * in1[idx] * in2[idx];
+                idx += NUM_THREADS;
+            }
+        }
+        )";
+    }
+

    // returns partial sums, instead of the final dot product
-    std::string OpenclKernels::get_dot_1_string() {
+    std::string OpenclKernels::get_dot_1_source() {
        return R"(
        __kernel void dot_1(
            __global double *in1,
@@ -440,7 +595,7 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe

    // returns partial sums, instead of the final norm
    // the square root must be computed on CPU
-    std::string OpenclKernels::get_norm_string() {
+    std::string OpenclKernels::get_norm_source() {
        return R"(
        __kernel void norm(
            __global double *in,
@@ -481,7 +636,7 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe


    // p = (p - omega * v) * beta + r
-    std::string OpenclKernels::get_custom_string() {
+    std::string OpenclKernels::get_custom_source() {
        return R"(
        __kernel void custom(
            __global double *p,
@@ -506,15 +661,98 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
        )";
    }

-    std::string OpenclKernels::get_spmv_blocked_string() {
+
+    // transform blocked vector to scalar vector using pressure-weights
+    // every workitem handles one blockrow
+    std::string OpenclKernels::get_full_to_pressure_restriction_source() {
        return R"(
-        __kernel void spmv_blocked(
-            __global const double *vals,
+        __kernel void full_to_pressure_restriction(
+            __global const double *fine_y,
+            __global const double *weights,
+            __global double *coarse_y,
+            const unsigned int Nb)
+        {
+            const unsigned int NUM_THREADS = get_global_size(0);
+            const unsigned int block_size = 3;
+            unsigned int target_block_row = get_global_id(0);
+
+            while(target_block_row < Nb){
+                double sum = 0.0;
+                unsigned int idx = block_size * target_block_row;
+                for (unsigned int i = 0; i < block_size; ++i) {
+                    sum += fine_y[idx + i] * weights[idx + i];
+                }
+                coarse_y[target_block_row] = sum;
+                target_block_row += NUM_THREADS;
+            }
+        }
+        )";
+    }
+
+    // add the coarse pressure solution back to the finer, complete solution
+    // every workitem handles one blockrow
+    std::string OpenclKernels::get_add_coarse_pressure_correction_source() {
+        return R"(
+        __kernel void add_coarse_pressure_correction(
+            __global const double *coarse_x,
+            __global double *fine_x,
+            const unsigned int pressure_idx,
+            const unsigned int Nb)
+        {
+            const unsigned int NUM_THREADS = get_global_size(0);
+            const unsigned int block_size = 3;
+            unsigned int target_block_row = get_global_id(0);
+
+            while(target_block_row < Nb){
+                fine_x[target_block_row * block_size + pressure_idx] += coarse_x[target_block_row];
+                target_block_row += NUM_THREADS;
+            }
+        }
+        )";
+    }
+
+    // prolongate vector during amg cycle
+    // every workitem handles one row
+    std::string OpenclKernels::get_prolongate_vector_source() {
+        return R"(
+        __kernel void prolongate_vector(
+            __global const double *in,
+            __global double *out,
+            __global const int *cols,
+            const unsigned int N)
+        {
+            const unsigned int NUM_THREADS = get_global_size(0);
+            unsigned int row = get_global_id(0);
+
+            while(row < N){
+                out[row] += in[cols[row]];
+                row += NUM_THREADS;
+            }
+        }
+        )";
+    }
+
+
+/// either b = mat * x
+/// or res = rhs - mat * x
+std::string OpenclKernels::get_blocked_matrix_operation_source(matrix_operation op) {
+    std::string s;
+    if (op == matrix_operation::spmv_op) {
+        s += "__kernel void spmv_blocked(";
+    } else {
+        s += "__kernel void residual_blocked(";
+    }
+        s += R"(__global const double *vals,
            __global const int *cols,
            __global const int *rows,
            const int Nb,
            __global const double *x,
-            __global double *b,
+            )";
+    if (op == matrix_operation::residual_op) {
+        s += "__global const double *rhs,";
+    }
+        s += R"(
+            __global double *out,
            const unsigned int block_size,
            __local double *tmp)
        {
@@ -566,17 +804,99 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe

                if(lane < bs){
                    unsigned int row = target_block_row*bs + lane;
-                    b[row] = tmp[lane];
+              )";
+    if (op == matrix_operation::spmv_op) {
+        s += "      out[row] = tmp[lane];";
+    } else {
+        s += "      out[row] = rhs[row] - tmp[lane];";
+    }
+        s += R"(
                }
                target_block_row += num_warps_in_grid;
            }
        }
-        )";
+    )";
+    return s;
+}
+
+
+/// either b = mat * x
+/// or res = rhs - mat * x
+std::string OpenclKernels::get_matrix_operation_source(matrix_operation op, bool spmv_reset) {
+    std::string s;
+    if (op == matrix_operation::spmv_op) {
+        if (spmv_reset) {
+            s += "__kernel void spmv(";
+        } else {
+            s += "__kernel void spmv_noreset(";
+        }
+    } else {
+        s += "__kernel void residual(";
    }
+    s += R"(__global const double *vals,
+            __global const int *cols,
+            __global const int *rows,
+            const int N,
+            __global const double *x,
+            )";
+    if (op == matrix_operation::residual_op) {
+        s += "__global const double *rhs,";
+    }
+        s += R"(
+            __global double *out,
+            __local double *tmp)
+        {
+            const unsigned int bsize = get_local_size(0);
+            const unsigned int idx_b = get_global_id(0) / bsize;
+            const unsigned int idx_t = get_local_id(0);
+            const unsigned int num_workgroups = get_num_groups(0);
+
+            int row = idx_b;
+
+            while (row < N) {
+                int rowStart = rows[row];
+                int rowEnd = rows[row+1];
+                int rowLength = rowEnd - rowStart;
+                double local_sum = 0.0;
+                for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
+                    int col = cols[j];
+                    local_sum += vals[j] * x[col];
+                }
+
+                tmp[idx_t] = local_sum;
+                barrier(CLK_LOCAL_MEM_FENCE);
+
+                int offset = bsize / 2;
+                while(offset > 0) {
+                    if (idx_t < offset) {
+                        tmp[idx_t] += tmp[idx_t + offset];
+                    }
+                    barrier(CLK_LOCAL_MEM_FENCE);
+                    offset = offset / 2;
+                }
+
+                if (idx_t == 0) {
+              )";
+    if (op == matrix_operation::spmv_op) {
+        if (spmv_reset) {
+            s += "  out[row] = tmp[idx_t];";
+        } else {
+            s += "  out[row] += tmp[idx_t];";
+        }
+    } else {
+        s += "      out[row] = rhs[row] - tmp[idx_t];";
+    }
+        s += R"(
+                }
+                row += num_workgroups;
+            }
+        }
+        )";
+    return s;
+}


-
-    std::string OpenclKernels::get_ILU_apply1_string(bool full_matrix) {
+    std::string OpenclKernels::get_ILU_apply1_source(bool full_matrix) {
        std::string s = R"(
            __kernel void ILU_apply1(
                __global const double *LUvals,
@@ -652,7 +972,7 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
    }


-    std::string OpenclKernels::get_ILU_apply2_string(bool full_matrix) {
+    std::string OpenclKernels::get_ILU_apply2_source(bool full_matrix) {
        std::string s = R"(
            __kernel void ILU_apply2(
                __global const double *LUvals,
@@ -735,7 +1055,7 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
        return s;
    }

-    std::string OpenclKernels::get_stdwell_apply_string(bool reorder) {
+    std::string OpenclKernels::get_stdwell_apply_source(bool reorder) {
        std::string kernel_name = reorder ? "stdwell_apply" : "stdwell_apply_no_reorder";
        std::string s = "__kernel void " + kernel_name + R"((
                        __global const double *Cnnzs,
@@ -830,7 +1150,7 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
    }


-    std::string OpenclKernels::get_ilu_decomp_string() {
+    std::string OpenclKernels::get_ilu_decomp_source() {
        return R"(

        // a = a - (b * c)
--- a/opm/simulators/linalg/bda/openclKernels.hpp
+++ b/opm/simulators/linalg/bda/openclKernels.hpp
@@ -30,8 +30,14 @@ namespace Opm
 namespace Accelerator
 {

-using spmv_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
+using spmv_blocked_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
                                         cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
+using spmv_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
+                                         cl::Buffer&, cl::Buffer&, cl::LocalSpaceArg>;
+using residual_blocked_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
+                                         cl::Buffer&, const cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
+using residual_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
+                                         cl::Buffer&, const cl::Buffer&, cl::Buffer&, cl::LocalSpaceArg>;
 using ilu_apply1_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const cl::Buffer&,
                                               cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>;
 using ilu_apply2_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
@@ -55,12 +61,25 @@ private:
    static std::vector<double> tmp;     // used as tmp CPU buffer for dot() and norm()
    static bool initialized;

+    enum matrix_operation {
+        spmv_op,
+        residual_op
+    };
+
    static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
    static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
    static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
    static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > scale_k;
+    static std::unique_ptr<cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > vmul_k;
    static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
-    static std::unique_ptr<spmv_kernel_type> spmv_blocked_k;
+    static std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > full_to_pressure_restriction_k;
+    static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > add_coarse_pressure_correction_k;
+    static std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > prolongate_vector_k;
+    static std::unique_ptr<spmv_blocked_kernel_type> spmv_blocked_k;
+    static std::unique_ptr<spmv_kernel_type> spmv_k;
+    static std::unique_ptr<spmv_kernel_type> spmv_noreset_k;
+    static std::unique_ptr<residual_blocked_kernel_type> residual_blocked_k;
+    static std::unique_ptr<residual_kernel_type> residual_k;
    static std::unique_ptr<ilu_apply1_kernel_type> ILU_apply1_k;
    static std::unique_ptr<ilu_apply2_kernel_type> ILU_apply2_k;
    static std::unique_ptr<stdwell_apply_kernel_type> stdwell_apply_k;
@@ -69,54 +88,70 @@ private:

    /// Generate string with axpy kernel
    /// a = a + alpha * b
-    static std::string get_axpy_string();
+    static std::string get_axpy_source();

    /// Generate string with scale kernel
    /// a = a * alpha
-    static std::string get_scale_string();
+    static std::string get_scale_source();
+
+    /// multiply vector with another vector and a scalar, element-wise
+    /// add result to a third vector
+    static std::string get_vmul_source();

    /// returns partial sums, instead of the final dot product
    /// partial sums are added on CPU
-    static std::string get_dot_1_string();
+    static std::string get_dot_1_source();

    /// returns partial sums, instead of the final norm
    /// the square root must be computed on CPU
-    static std::string get_norm_string();
+    static std::string get_norm_source();

    /// Generate string with custom kernel
    /// This kernel combines some ilubicgstab vector operations into 1
    /// p = (p - omega * v) * beta + r
-    static std::string get_custom_string();
+    static std::string get_custom_source();
+
+    /// Transform blocked vector to scalar vector using pressure-weights
+    static std::string get_full_to_pressure_restriction_source();
+
+    /// Add the coarse pressure solution back to the finer, complete solution
+    static std::string get_add_coarse_pressure_correction_source();
+
+    /// Prolongate a vector during the AMG cycle
+    static std::string get_prolongate_vector_source();

    /// b = mat * x
    /// algorithm based on:
    /// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
    /// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
-    static std::string get_spmv_blocked_string();
+    /// or
+    /// res = rhs - (mat * x)
+    static std::string get_blocked_matrix_operation_source(matrix_operation op);
+    static std::string get_matrix_operation_source(matrix_operation op, bool spmv_reset = true);

    /// ILU apply part 1: forward substitution
    /// solves L*x=y where L is a lower triangular sparse blocked matrix
    /// this L can be it's own BSR matrix (if full_matrix is false),
    /// or it can be inside a normal, square matrix, in that case diagIndex indicates where the rows of L end
    /// \param[in] full_matrix   whether the kernel should operate on a full (square) matrix or not
-    static std::string get_ILU_apply1_string(bool full_matrix);
+    static std::string get_ILU_apply1_source(bool full_matrix);

    /// ILU apply part 2: backward substitution
    /// solves U*x=y where U is an upper triangular sparse blocked matrix
    /// this U can be it's own BSR matrix (if full_matrix is false),
    /// or it can be inside a normal, square matrix, in that case diagIndex indicates where the rows of U start
    /// \param[in] full_matrix   whether the kernel should operate on a full (square) matrix or not
-    static std::string get_ILU_apply2_string(bool full_matrix);
+    static std::string get_ILU_apply2_source(bool full_matrix);

    /// Generate string with the stdwell_apply kernels
    /// If reorder is true, the B/Ccols do not correspond with the x/y vector
    /// the x/y vector is reordered, use toOrder to address that
    /// \param[in] reorder   whether the matrix is reordered or not
-    static std::string get_stdwell_apply_string(bool reorder);
+    static std::string get_stdwell_apply_source(bool reorder);

    /// Generate string with the exact ilu decomposition kernel
    /// The kernel takes a full BSR matrix and performs inplace ILU decomposition
-    static std::string get_ilu_decomp_string();
+    static std::string get_ilu_decomp_source();

    OpenclKernels(){}; // disable instantiation

@@ -127,8 +162,14 @@ public:
    static double norm(cl::Buffer& in, cl::Buffer& out, int N);
    static void axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N);
    static void scale(cl::Buffer& in, const double a, int N);
+    static void vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
    static void custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N);
-    static void spmv_blocked(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size);
+    static void full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb);
+    static void add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb);
+    static void prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N);
+    static void spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset = true);
+    static void residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, const cl::Buffer& rhs, cl::Buffer& out, int Nb, unsigned int block_size);
+
    static void ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex,
        const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size);

--- a/opm/simulators/linalg/bda/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp
@@ -45,8 +45,25 @@ using Opm::OpmLog;
 using Dune::Timer;

 template <unsigned int block_size>
-openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_reorder(opencl_ilu_reorder_) {
-    prec = new Preconditioner(opencl_ilu_reorder, verbosity_);
+openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder_, std::string linsolver) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_reorder(opencl_ilu_reorder_) {
+
+    bool use_cpr;
+    if (linsolver.compare("ilu0") == 0) {
+        use_cpr = false;
+    } else if (linsolver.compare("cpr_quasiimpes") == 0) {
+        use_cpr = true;
+    } else if (linsolver.compare("cpr_trueimpes") == 0) {
+        OPM_THROW(std::logic_error, "Error openclSolver does not support --linsolver=cpr_trueimpes");
+    } else {
+        OPM_THROW(std::logic_error, "Error unknown value for argument --linsolver, " + linsolver);
+    }
+
+    using PreconditionerType = typename Preconditioner<block_size>::PreconditionerType;
+    if (use_cpr) {
+        prec = Preconditioner<block_size>::create(PreconditionerType::CPR, verbosity, opencl_ilu_reorder);
+    } else {
+        prec = Preconditioner<block_size>::create(PreconditionerType::BILU0, verbosity, opencl_ilu_reorder);
+    }

    std::ostringstream out;
    try {
@@ -191,6 +208,19 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
    }
 }

+template <unsigned int block_size>
+openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, ILUReorder opencl_ilu_reorder_) :
+    BdaSolver<block_size>(verbosity_, maxit_, tolerance_), opencl_ilu_reorder(opencl_ilu_reorder_)
+{
+    // prec = std::make_unique<BILU0<block_size> >(opencl_ilu_reorder, verbosity_);
+    // cpr = std::make_unique<CPR<block_size> >(verbosity_, opencl_ilu_reorder, /*use_amg=*/false);
+}
+
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
+    context = context_;
+    queue = queue_;
+}

 template <unsigned int block_size>
 openclSolverBackend<block_size>::~openclSolverBackend() {
@@ -257,7 +287,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr

        // v = A * pw
        t_spmv.start();
-        OpenclKernels::spmv_blocked(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
+        OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
        t_spmv.stop();

        // apply wellContributions
@@ -288,7 +318,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr

        // t = A * s
        t_spmv.start();
-        OpenclKernels::spmv_blocked(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
+        OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
        t_spmv.stop();

        // apply wellContributions
@@ -332,7 +362,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
    }
    if (verbosity >= 4) {
        std::ostringstream out;
-        out << "openclSolver::ilu_apply:   " << t_prec.elapsed() << " s\n";
+        out << "openclSolver::prec_apply:  " << t_prec.elapsed() << " s\n";
        out << "wellContributions::apply:  " << t_well.elapsed() << " s\n";
        out << "openclSolver::spmv:        " << t_spmv.elapsed() << " s\n";
        out << "openclSolver::rest:        " << t_rest.elapsed() << " s\n";
@@ -358,13 +388,12 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
    out.clear();

    try {
-        prec->setOpenCLContext(context.get());
-        prec->setOpenCLQueue(queue.get());
+        prec->setOpencl(context, queue);

 #if COPY_ROW_BY_ROW
-        vals_contiguous = new double[N];
+        vals_contiguous.resize(nnz);
 #endif
-        mat.reset(new BlockedMatrix<block_size>(Nb, nnzb, vals, cols, rows));
+        mat.reset(new BlockedMatrix(Nb, nnzb, block_size, vals, cols, rows));

        d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
        d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
@@ -408,10 +437,6 @@ void openclSolverBackend<block_size>::finalize() {
    if (opencl_ilu_reorder != ILUReorder::NONE) {
        delete[] rb;
    }
-#if COPY_ROW_BY_ROW
-    delete[] vals_contiguous;
-#endif
-    delete prec;
 } // end finalize()

 template <unsigned int block_size>
@@ -423,10 +448,10 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
    int sum = 0;
    for (int i = 0; i < Nb; ++i) {
        int size_row = rmat->rowPointers[i + 1] - rmat->rowPointers[i];
-        memcpy(vals_contiguous + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
+        memcpy(vals_contiguous.data() + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
        sum += size_row * block_size * block_size;
    }
-    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous, nullptr, &events[0]);
+    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous.data(), nullptr, &events[0]);
 #else
    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]);
 #endif
@@ -463,10 +488,10 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
    int sum = 0;
    for (int i = 0; i < Nb; ++i) {
        int size_row = rmat->rowPointers[i + 1] - rmat->rowPointers[i];
-        memcpy(vals_contiguous + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
+        memcpy(vals_contiguous.data() + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
        sum += size_row * block_size * block_size;
    }
-    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous, nullptr, &events[0]);
+    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous.data(), nullptr, &events[0]);
 #else
    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]);
 #endif
@@ -489,29 +514,34 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {


 template <unsigned int block_size>
-bool openclSolverBackend<block_size>::analyse_matrix() {
+bool openclSolverBackend<block_size>::analyze_matrix() {
    Timer t;

-    bool success = prec->init(mat.get());
+    // bool success = bilu0->init(mat.get());
+    bool success = prec->analyze_matrix(mat.get());

    if (opencl_ilu_reorder == ILUReorder::NONE) {
        rmat = mat.get();
    } else {
+        // toOrder = bilu0->getToOrder();
+        // fromOrder = bilu0->getFromOrder();
+        // rmat = bilu0->getRMat();
        toOrder = prec->getToOrder();
        fromOrder = prec->getFromOrder();
        rmat = prec->getRMat();
    }

+
    if (verbosity > 2) {
        std::ostringstream out;
-        out << "openclSolver::analyse_matrix(): " << t.stop() << " s";
+        out << "openclSolver::analyze_matrix(): " << t.stop() << " s";
        OpmLog::info(out.str());
    }

    analysis_done = true;

    return success;
-} // end analyse_matrix()
+} // end analyze_matrix()


 template <unsigned int block_size>
@@ -603,7 +633,7 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
    if (initialized == false) {
        initialize(N_, nnz_,  dim, vals, rows, cols);
        if (analysis_done == false) {
-            if (!analyse_matrix()) {
+            if (!analyze_matrix()) {
                return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
            }
        }
@@ -624,8 +654,11 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
 }


-#define INSTANTIATE_BDA_FUNCTIONS(n)                                                                              \
-template openclSolverBackend<n>::openclSolverBackend(int, int, double, unsigned int, unsigned int, ILUReorder);   \
+#define INSTANTIATE_BDA_FUNCTIONS(n)                                        \
+template openclSolverBackend<n>::openclSolverBackend(                       \
+    int, int, double, unsigned int, unsigned int, ILUReorder, std::string); \
+template openclSolverBackend<n>::openclSolverBackend(int, int, double, ILUReorder); \
+template void openclSolverBackend<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&);

 INSTANTIATE_BDA_FUNCTIONS(1);
 INSTANTIATE_BDA_FUNCTIONS(2);
--- a/opm/simulators/linalg/bda/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp
@@ -28,6 +28,8 @@
 #include <opm/simulators/linalg/bda/WellContributions.hpp>
 #include <opm/simulators/linalg/bda/BILU0.hpp>

+#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
+
 #include <tuple>

 namespace Opm
@@ -35,12 +37,14 @@ namespace Opm
 namespace Accelerator
 {

+template <unsigned int block_size>
+class CPR;
+
 /// This class implements a opencl-based ilu0-bicgstab solver on GPU
 template <unsigned int block_size>
 class openclSolverBackend : public BdaSolver<block_size>
 {
    typedef BdaSolver<block_size> Base;
-    typedef BILU0<block_size> Preconditioner;

    using Base::N;
    using Base::Nb;
@@ -55,7 +59,7 @@ class openclSolverBackend : public BdaSolver<block_size>

 private:
    double *rb = nullptr;                 // reordered b vector, if the matrix is reordered, rb is newly allocated, otherwise it just points to b
-    double *vals_contiguous = nullptr;    // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
+    std::vector<double> vals_contiguous;  // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp

    // OpenCL variables must be reusable, they are initialized in initialize()
    cl::Buffer d_Avals, d_Acols, d_Arows;        // (reordered) matrix in BSR format on GPU
@@ -66,11 +70,13 @@ private:

    std::vector<cl::Device> devices;

-    Preconditioner *prec = nullptr;                               // only supported preconditioner is BILU0
+    std::unique_ptr<Preconditioner<block_size> > prec;
+                                                                  // can perform blocked ILU0 and AMG on pressure component
+    bool is_root;                                                 // allow for nested solvers, the root solver is called by BdaBridge
    int *toOrder = nullptr, *fromOrder = nullptr;                 // BILU0 reorders rows of the matrix via these mappings
    bool analysis_done = false;
-    std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr;    // original matrix 
-    BlockedMatrix<block_size> *rmat = nullptr;                    // reordered matrix (or original if no reordering), used for spmv
+    std::unique_ptr<BlockedMatrix> mat = nullptr;                 // original matrix
+    BlockedMatrix *rmat = nullptr;                                // reordered matrix (or original if no reordering), used for spmv
    ILUReorder opencl_ilu_reorder;                                // reordering strategy
    std::vector<cl::Event> events;
    cl_int err;
@@ -154,9 +160,9 @@ private:
    /// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
    void update_system_on_gpu();

-    /// Analyse sparsity pattern to extract parallelism
+    /// Analyze sparsity pattern to extract parallelism
    /// \return true iff analysis was successful
-    bool analyse_matrix();
+    bool analyze_matrix();

    /// Perform ilu0-decomposition
    /// \return true iff decomposition was successful
@@ -178,7 +184,13 @@ public:
    /// \param[in] platformID                 the OpenCL platform to be used
    /// \param[in] deviceID                   the device to be used
    /// \param[in] opencl_ilu_reorder         select either level_scheduling or graph_coloring, see BILU0.hpp for explanation
-    openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, ILUReorder opencl_ilu_reorder);
+    /// \param[in] linsolver                  indicating the preconditioner, equal to the --linsolver cmdline argument
+    ///                                       only ilu0 and cpr_quasiimpes are supported
+    openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID,
+        ILUReorder opencl_ilu_reorder, std::string linsolver);
+
+    /// For the CPR coarse solver
+    openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder);

    /// Destroy a openclSolver, and free memory
    ~openclSolverBackend();
@@ -200,10 +212,20 @@ public:
    /// \return                   status code
    SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;

+    /// Solve scalar linear system, for example a coarse system of an AMG preconditioner
+    /// Data is already on the GPU
+    // SolverStatus solve_system(BdaResult &res);
+
    /// Get result after linear solve, and peform postprocessing if necessary
    /// \param[inout] x          resulting x vector, caller must guarantee that x points to a valid array
    void get_result(double *x) override;

+    /// Set OpenCL objects
+    /// This class either creates them based on platformID and deviceID or receives them through this function
+    /// \param[in] context   the opencl context to be used
+    /// \param[in] queue     the opencl queue to be used
+    void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue);
+
 }; // end class openclSolverBackend

 } // namespace Accelerator
--- a/tests/test_cusparseSolver.cpp
+++ b/tests/test_cusparseSolver.cpp
@@ -76,15 +76,16 @@ testCusparseSolver(const boost::property_tree::ptree& prm, const std::string& ma
    const std::string opencl_ilu_reorder("none"); // unused
    const int platformID = 0;                     // unused
    const int deviceID = 0;
-    const std::string gpu_mode("cusparse");
+    const std::string accelerator_mode("cusparse");
    const std::string fpga_bitstream("empty");    // unused
+    const std::string linsolver("ilu0");
    Dune::InverseOperatorResult result;

    Vector x(rhs.size());
    auto wellContribs = Opm::WellContributions::create("cusparse", false);
    std::unique_ptr<Opm::BdaBridge<Matrix, Vector, bz> > bridge;
    try {
-        bridge = std::make_unique<Opm::BdaBridge<Matrix, Vector, bz> >(gpu_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder);
+        bridge = std::make_unique<Opm::BdaBridge<Matrix, Vector, bz> >(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder, linsolver);

        bridge->solve_system(&matrix, rhs, *wellContribs, result);
        bridge->get_result(x);
--- a/tests/test_openclSolver.cpp
+++ b/tests/test_openclSolver.cpp
@@ -75,15 +75,16 @@ testOpenclSolver(const boost::property_tree::ptree& prm, const std::string& matr
    const std::string opencl_ilu_reorder("none");
    const int platformID = 0;
    const int deviceID = 0;
-    const std::string gpu_mode("opencl");
+    const std::string accelerator_mode("opencl");
    const std::string fpga_bitstream("empty");    // unused
+    const std::string linsolver("ilu0");
    Dune::InverseOperatorResult result;

    Vector x(rhs.size());
    auto wellContribs = Opm::WellContributions::create("opencl", false);
    std::unique_ptr<Opm::BdaBridge<Matrix, Vector, bz> > bridge;
    try {
-        bridge = std::make_unique<Opm::BdaBridge<Matrix, Vector, bz> >(gpu_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder);
+        bridge = std::make_unique<Opm::BdaBridge<Matrix, Vector, bz> >(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder, linsolver);
    } catch (const std::logic_error& error) {
        BOOST_WARN_MESSAGE(true, error.what());
        throw PlatformInitException(error.what());
@@ -100,8 +101,8 @@ void test3(const pt::ptree& prm)
 {
    const int bz = 3;
    auto sol = testOpenclSolver<bz>(prm, "matr33.txt", "rhs3.txt");
-    Dune::BlockVector<Dune::FieldVector<double, bz>> expected {{-1.30307e-2, -3.58263e-6, 1.13836e-9},
-            {-1.25425e-3, -1.4167e-4, -3.2213e-3},
+    Dune::BlockVector<Dune::FieldVector<double, bz>> expected {{-0.0131626, -3.58263e-6, 1.13836e-9},
+            {-1.25425e-3, -1.4167e-4, -0.0029366},
                {-4.5436e-4, 1.28682e-5, 4.7644e-6}};
    BOOST_REQUIRE_EQUAL(sol.size(), expected.size());
    for (size_t i = 0; i < sol.size(); ++i) {
--- a/tests/test_solvetransposed3x3.cpp
+++ b/tests/test_solvetransposed3x3.cpp
@@ -0,0 +1,82 @@
+/*
+  Copyright 2021 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+
+#define BOOST_TEST_MODULE SolveTransposed3x3
+#include <boost/test/unit_test.hpp>
+#include <boost/test/floating_point_comparison.hpp>
+
+#include <dune/istl/bcrsmatrix.hh>
+
+#include <opm/simulators/linalg/bda/CPR.hpp>
+
+BOOST_AUTO_TEST_CASE(testsolvetransposed3x3)
+{
+    const unsigned numTests = 3;
+    const unsigned blockSize = 3;
+
+    std::vector<std::vector<double> > A = {{4, 2, 1,
+                                            3, 4, 2,
+                                            2, 4, 3},
+                                           {1, 2, 4,
+                                            1, 3, 2,
+                                            2, 4, 2},
+                                           {1, 2, 2,
+                                            1, 3, 5,
+                                            3, 2, 4}};
+
+    std::vector<std::vector<double> > b = {{0, 1, 0},
+                                           {1, 3, 5},
+                                           {2, 4, 5}};
+
+    std::vector<std::vector<double> > x_expected = {{-0.5,   1, -0.5},
+                                                    {   1,   1, -0.5},
+                                                    { 1.3, 0.4,  0.1}};
+
+    for (unsigned testId = 0; testId < numTests; ++testId) {
+        std::vector<double> x(blockSize);
+        Opm::Accelerator::solve_transposed_3x3(A[testId].data(), b[testId].data(), x.data());
+
+        for (unsigned i = 0; i < blockSize; ++i) {
+            BOOST_CHECK_CLOSE(x[i], x_expected[testId][i], 1e-6);
+        }
+    }
+
+    // retest cases using Dune methods
+    using Mat3 = Dune::FieldMatrix<double, 3, 3>;
+    using Vec3 = Dune::FieldVector<double, 3>;
+    for (unsigned testId = 0; testId < numTests; ++testId) {
+        Mat3 a3 = {{ A[testId][0], A[testId][1], A[testId][2] },
+                   { A[testId][3], A[testId][4], A[testId][5] },
+                   { A[testId][6], A[testId][7], A[testId][8] } };
+        Vec3 y({b[testId][0], b[testId][1], b[testId][2]});
+        Mat3 b3 = a3;
+
+        // b3 = inv(a3^T)
+        Dune::FMatrixHelp::invertMatrix_retTransposed(a3, b3);
+
+        // x = b3 * y
+        Vec3 x = Dune::FMatrixHelp::mult(b3, y);
+
+        for (unsigned i = 0; i < blockSize; ++i) {
+            BOOST_CHECK_CLOSE(x[i], x_expected[testId][i], 1e-6);
+        }
+    }
+}