Merge pull request #3619 from Tongdongq/opencl-refactor

Opencl refactor
Markus Blatt 2021-11-08 16:45:57 +01:00 committed by GitHub
commit ad44564f27
33 changed files with 1124 additions and 829 deletions

View File

@ -26,6 +26,9 @@ option(BUILD_FLOW_POLY_GRID "Build flow blackoil with polyhedral grid" OFF)
option(OPM_ENABLE_PYTHON "Enable python bindings?" OFF)
option(OPM_ENABLE_PYTHON_TESTS "Enable tests for the python bindings?" ON)
option(ENABLE_FPGA "Enable FPGA kernels integration?" OFF)
option(USE_CHOW_PATEL_ILU "Use the iterative ILU by Chow and Patel?" OFF)
option(USE_CHOW_PATEL_ILU_GPU "Run iterative ILU decomposition on GPU? Requires USE_CHOW_PATEL_ILU" OFF)
option(USE_CHOW_PATEL_ILU_GPU_PARALLEL "Try to use more parallelism on the GPU during the iterative ILU decomposition? Requires USE_CHOW_PATEL_ILU_GPU" OFF)
if(SIBLING_SEARCH AND NOT opm-common_DIR)
# guess the sibling dir
@ -211,6 +214,24 @@ if(OpenCL_FOUND)
set(OpenCL_FOUND OFF)
set(OPENCL_FOUND OFF)
endif()
if(USE_CHOW_PATEL_ILU)
add_compile_options(-DCHOW_PATEL=1)
if(USE_CHOW_PATEL_ILU_GPU)
add_compile_options(-DCHOW_PATEL_GPU=1)
if(USE_CHOW_PATEL_ILU_GPU_PARALLEL)
add_compile_options(-DCHOW_PATEL_GPU_PARALLEL=1)
else()
add_compile_options(-DCHOW_PATEL_GPU_PARALLEL=0)
endif()
else()
add_compile_options(-DCHOW_PATEL_GPU=0)
add_compile_options(-DCHOW_PATEL_GPU_PARALLEL=0)
endif()
endif()
else()
if(USE_CHOW_PATEL_ILU)
message(FATAL_ERROR "CHOW_PATEL_ILU only works with openclSolver, but OpenCL was not found")
endif()
endif()
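These options gate the preconditioner code paths at compile time. A hedged sketch of the consuming code, mirroring the call site in BILU0::create_preconditioner() later in this diff (arguments elided):
#if CHOW_PATEL
    // iterative FGPILU; the sweeps run on the GPU when CHOW_PATEL_GPU=1
    chowPatelIlu.decomposition(/* ... */);
#else
    // exact ILU0 decomposition on the GPU
    OpenclKernels::ILU_decomp(/* ... */);
#endif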
find_package(amgcl)
@ -242,6 +263,7 @@ if(OpenCL_FOUND)
endif()
endif()
# read the list of components from this file (in the project directory);
# it should set various lists with the names of the files to include
include (CMakeLists_files.cmake)

View File

@ -18,10 +18,10 @@
*/
#include <config.h>
#include <algorithm>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/MatrixBlock.hpp>
#include <dune/common/timer.hh>
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
@ -30,7 +30,9 @@
#include <opm/simulators/linalg/bda/Reorder.hpp>
namespace bda
namespace Opm
{
namespace Accelerator
{
using Opm::OpmLog;
@ -39,7 +41,11 @@ using Dune::Timer;
template <unsigned int block_size>
BILU0<block_size>::BILU0(ILUReorder opencl_ilu_reorder_, int verbosity_) :
verbosity(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_)
{}
{
#if CHOW_PATEL
chowPatelIlu.setVerbosity(verbosity);
#endif
}
template <unsigned int block_size>
BILU0<block_size>::~BILU0()
@ -158,350 +164,6 @@ BILU0<block_size>::~BILU0()
} // end init()
// implements the Fine-Grained Parallel ILU algorithm (FGPILU) of Chow and Patel, 2015
template <unsigned int block_size>
void BILU0<block_size>::chow_patel_decomposition()
{
#if CHOW_PATEL
const unsigned int bs = block_size;
int num_sweeps = 6;
// split matrix into L and U
// also convert U into BSC format (Ut)
// Ut stores diagonal for now
// original matrix LUmat is assumed to be symmetric
#ifndef NDEBUG
// verify that matrix is symmetric
for (int i = 0; i < Nb; ++i){
int iRowStart = LUmat->rowPointers[i];
int iRowEnd = LUmat->rowPointers[i + 1];
// for every block (i, j) in this row, check if (j, i) also exists
for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = LUmat->colIndices[ij];
int jRowStart = LUmat->rowPointers[j];
int jRowEnd = LUmat->rowPointers[j + 1];
bool blockFound = false;
// check all blocks on row j
// binary search is possible
for (int ji = jRowStart; ji < jRowEnd; ji++) {
int row = LUmat->colIndices[ji];
if (i == row) {
blockFound = true;
break;
}
}
if (false == blockFound) {
OPM_THROW(std::logic_error, "Error sparsity pattern must be symmetric when using chow_patel_decomposition()");
}
}
}
#endif
Timer t_total, t_preprocessing;
// Ut is actually BSC format
std::unique_ptr<BlockedMatrix<bs> > Ut = std::make_unique<BlockedMatrix<bs> >(Nb, (nnzbs + Nb) / 2);
Lmat->rowPointers[0] = 0;
for (int i = 0; i < Nb+1; i++) {
Ut->rowPointers[i] = 0;
}
Opm::Detail::Inverter<bs> inverter;
// store inverted diagonal
for (int i = 0; i < Nb; i++) {
int iRowStart = LUmat->rowPointers[i];
int iRowEnd = LUmat->rowPointers[i + 1];
// for every block in this row
for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = LUmat->colIndices[ij];
if (i == j) {
inverter(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs);
}
}
}
// initialize initial guess for L: L_A * D
// L_A is strictly lower triangular part of A
// D is inv(diag(A))
int num_blocks_L = 0;
for (int i = 0; i < Nb; i++) {
int iRowStart = LUmat->rowPointers[i];
int iRowEnd = LUmat->rowPointers[i + 1];
// for every block in this row
for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = LUmat->colIndices[ij];
if (i <= j) {
Ut->rowPointers[j+1]++; // actually colPointers, now simply indicates how many blocks this col holds
} else {
Lmat->colIndices[num_blocks_L] = j;
// multiply block of L with corresponding diag block
blockMult<bs>(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, Lmat->nnzValues + num_blocks_L * bs * bs);
num_blocks_L++;
}
}
// TODO: copy all blocks for L at once, instead of copying each block individually
Lmat->rowPointers[i+1] = num_blocks_L;
}
// prefix sum to sum rowsizes into colpointers
std::partial_sum(Ut->rowPointers, Ut->rowPointers+Nb+1, Ut->rowPointers);
// initialize initial guess for U
for (int i = 0; i < Nb; i++) {
int iRowStart = LUmat->rowPointers[i];
int iRowEnd = LUmat->rowPointers[i + 1];
// for every block in this row
for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = LUmat->colIndices[ij];
if (i <= j){
int idx = Ut->rowPointers[j]++; // rowPointers[i] is (mis)used as the write offset of the current row i
Ut->colIndices[idx] = i; // actually rowIndices
memcpy(Ut->nnzValues + idx * bs * bs, LUmat->nnzValues + ij * bs * bs, sizeof(double) * bs * bs);
}
}
}
// rotate
// the Ut->rowPointers were increased in the last loop
// now Ut->rowPointers[i+1] holds the value that Ut->rowPointers[i] should have for a CRS matrix; shift back to restore the expected values
for (int i = Nb; i > 0; --i) {
Ut->rowPointers[i] = Ut->rowPointers[i-1];
}
Ut->rowPointers[0] = 0;
// Utmp is needed for both CPU and GPU decomposition, because U is transposed and reversed after decomposition
// U will be reversed because it is used for backward substitution, which consumes the last row first
// Ltmp is only needed for the CPU decomposition; the GPU path creates its own buffer for Ltmp
double *Utmp = new double[Ut->nnzbs * block_size * block_size];
if (verbosity >= 3) {
std::ostringstream out;
out << "BILU0 ChowPatel preprocessing: " << t_preprocessing.stop() << " s";
OpmLog::info(out.str());
}
// actual ILU decomposition
Timer t_decomposition;
#if CHOW_PATEL_GPU
chowPatelIlu.decomposition(queue, context,
Ut->rowPointers, Ut->colIndices, Ut->nnzValues, Ut->nnzbs,
Lmat->rowPointers, Lmat->colIndices, Lmat->nnzValues, Lmat->nnzbs,
LUmat->rowPointers, LUmat->colIndices, LUmat->nnzValues, LUmat->nnzbs,
Nb, num_sweeps, verbosity);
#else
double *Ltmp = new double[Lmat->nnzbs * block_size * block_size];
for (int sweep = 0; sweep < num_sweeps; ++sweep) {
// algorithm
// for every block in A (LUmat):
// if i > j:
// Lij = (Aij - sum k=1 to j-1 {Lik*Ukj}) / Ujj
// else:
// Uij = (Aij - sum k=1 to i-1 {Lik*Ukj})
// for every row
for (int row = 0; row < Nb; row++) {
// update U
// Uij = (Aij - sum k=1 to i-1 {Lik*Ukj})
int jColStart = Ut->rowPointers[row];
int jColEnd = Ut->rowPointers[row + 1];
int colU = row; // rename for clarity, next row in Ut means next col in U
// for every block in this row
for (int ij = jColStart; ij < jColEnd; ij++) {
int rowU1 = Ut->colIndices[ij]; // actually rowIndices for U
// refine Uij element (or diagonal)
int i1 = LUmat->rowPointers[rowU1];
int i2 = LUmat->rowPointers[rowU1+1];
// search on row rowU1, find blockIndex in LUmat of block with same col (colU) as Uij
// LUmat->nnzValues[kk] is block Aij
auto candidate = std::find(LUmat->colIndices + i1, LUmat->colIndices + i2, colU);
assert(candidate != LUmat->colIndices + i2);
auto kk = candidate - LUmat->colIndices;
double aij[bs*bs];
// copy block to Aij so operations can be done on it without affecting LUmat
memcpy(&aij[0], LUmat->nnzValues + kk * bs * bs, sizeof(double) * bs * bs);
int jk = Lmat->rowPointers[rowU1]; // points to row rowU1 in L
// if row rowU1 is empty, skip row
if (jk < Lmat->rowPointers[rowU1+1]) {
int colL = Lmat->colIndices[jk];
// only check until block U(i,j) is reached
for (int k = jColStart; k < ij; ++k) {
int rowU2 = Ut->colIndices[k];
while (colL < rowU2) {
++jk; // check next block on row rowU1 of L
colL = Lmat->colIndices[jk];
}
if (colL == rowU2) {
// Aij -= (Lik * Ukj)
blockMultSub<bs>(&aij[0], Lmat->nnzValues + jk * bs * bs, Ut->nnzValues + k * bs * bs);
}
}
}
// Uij_new = Aij - sum
memcpy(Utmp + ij * bs * bs, &aij[0], sizeof(double) * bs * bs);
}
// update L
// Lij = (Aij - sum k=1 to j-1 {Lik*Ukj}) / Ujj
int iRowStart = Lmat->rowPointers[row];
int iRowEnd = Lmat->rowPointers[row + 1];
for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = Lmat->colIndices[ij];
// refine Lij element
// search on row 'row', find blockIndex in LUmat of block with same col (j) as Lij
// LUmat->nnzValues[kk] is block Aij
int i1 = LUmat->rowPointers[row];
int i2 = LUmat->rowPointers[row+1];
auto candidate = std::find(LUmat->colIndices + i1, LUmat->colIndices + i2, j);
assert(candidate != LUmat->colIndices + i2);
auto kk = candidate - LUmat->colIndices;
double aij[bs*bs];
// copy block to Aij so operations can be done on it without affecting LUmat
memcpy(&aij[0], LUmat->nnzValues + kk * bs * bs, sizeof(double) * bs * bs);
int jk = Ut->rowPointers[j]; // actually colPointers, jk points to col j in U
int rowU = Ut->colIndices[jk]; // actually rowIndices, rowU is the row of block jk
// only check until block L(i,j) is reached
for (int k = iRowStart; k < ij; ++k) {
int colL = Lmat->colIndices[k];
while (rowU < colL) {
++jk; // check next block on col j of U
rowU = Ut->colIndices[jk];
}
if (rowU == colL) {
// Aij -= (Lik * Ukj)
blockMultSub<bs>(&aij[0], Lmat->nnzValues + k * bs * bs , Ut->nnzValues + jk * bs * bs);
}
}
// calculate (Aij - sum) / Ujj
double ujj[bs*bs];
inverter(Ut->nnzValues + (Ut->rowPointers[j+1] - 1) * bs * bs, &ujj[0]);
// Lij_new = (Aij - sum) / Ujj
blockMult<bs>(&aij[0], &ujj[0], Ltmp + ij * bs * bs);
}
}
// 1st sweep writes to Ltmp
// 2nd sweep writes to Lmat->nnzValues
std::swap(Lmat->nnzValues, Ltmp);
std::swap(Ut->nnzValues, Utmp);
} // end sweep
// if number of sweeps is even, swap again so data is in Lmat->nnzValues
if (num_sweeps % 2 == 0) {
std::swap(Lmat->nnzValues, Ltmp);
std::swap(Ut->nnzValues, Utmp);
}
delete[] Ltmp;
#endif
if (verbosity >= 3){
std::ostringstream out;
out << "BILU0 ChowPatel decomposition: " << t_decomposition.stop() << " s";
OpmLog::info(out.str());
}
Timer t_postprocessing;
// convert Ut to BSR
// diagonal stored separately
std::vector<int> ptr(Nb+1, 0);
std::vector<int> cols(Ut->rowPointers[Nb]);
// count blocks per row for U (BSR)
// store diagonal in invDiagVals
for(int i = 0; i < Nb; ++i) {
for(int k = Ut->rowPointers[i]; k < Ut->rowPointers[i+1]; ++k) {
int j = Ut->colIndices[k];
if (j == i) {
inverter(Ut->nnzValues + k * bs * bs, invDiagVals + i * bs * bs);
} else {
++ptr[j+1];
}
}
}
// prefix sum
std::partial_sum(ptr.begin(), ptr.end(), ptr.begin());
// actually copy nonzero values for U
for(int i = 0; i < Nb; ++i) {
for(int k = Ut->rowPointers[i]; k < Ut->rowPointers[i+1]; ++k) {
int j = Ut->colIndices[k];
if (j != i) {
int head = ptr[j]++;
cols[head] = i;
memcpy(Utmp + head * bs * bs, Ut->nnzValues + k * bs * bs, sizeof(double) * bs * bs);
}
}
}
// the ptr[] entries were increased in the last loop; rotate to restore the row pointers
std::rotate(ptr.begin(), ptr.end() - 1, ptr.end());
ptr.front() = 0;
if (verbosity >= 3){
std::ostringstream out;
out << "BILU0 ChowPatel postprocessing: " << t_postprocessing.stop() << " s";
OpmLog::info(out.str());
}
Timer t_copyToGpu;
events.resize(3);
queue->enqueueWriteBuffer(s.Lvals, CL_FALSE, 0, Lmat->nnzbs * bs * bs * sizeof(double), Lmat->nnzValues, nullptr, &events[0]);
queue->enqueueWriteBuffer(s.Uvals, CL_FALSE, 0, Umat->nnzbs * bs * bs * sizeof(double), Utmp, nullptr, &events[1]);
queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, LUmat->Nb * bs * bs * sizeof(double), invDiagVals, nullptr, &events[2]);
std::call_once(pattern_uploaded, [&](){
// find the positions of each diagonal block
// must be done after reordering
for (int row = 0; row < Nb; ++row) {
int rowStart = LUmat->rowPointers[row];
int rowEnd = LUmat->rowPointers[row+1];
auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row);
assert(candidate != LUmat->colIndices + rowEnd);
diagIndex[row] = candidate - LUmat->colIndices;
}
events.resize(8);
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[3]);
queue->enqueueWriteBuffer(s.Lcols, CL_FALSE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices, nullptr, &events[4]);
queue->enqueueWriteBuffer(s.Lrows, CL_FALSE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers, nullptr, &events[5]);
queue->enqueueWriteBuffer(s.Ucols, CL_FALSE, 0, Umat->nnzbs * sizeof(int), cols.data(), nullptr, &events[6]);
queue->enqueueWriteBuffer(s.Urows, CL_FALSE, 0, (Umat->Nb + 1) * sizeof(int), ptr.data(), nullptr, &events[7]);
});
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueueWriteBuffer returns an error code instead of throwing an exception, so check it explicitly
OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
}
if (verbosity >= 3){
std::ostringstream out;
out << "BILU0 ChowPatel copy to GPU: " << t_copyToGpu.stop() << " s\n";
out << "BILU0 ChowPatel total: " << t_total.stop() << " s";
OpmLog::info(out.str());
}
delete[] Utmp;
#endif // CHOW_PATEL
}
template <unsigned int block_size>
bool BILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
@ -533,7 +195,12 @@ BILU0<block_size>::~BILU0()
}
#if CHOW_PATEL
chow_patel_decomposition();
chowPatelIlu.decomposition(queue, context,
LUmat.get(), Lmat.get(), Umat.get(),
invDiagVals, diagIndex,
s.diagIndex, s.invDiagVals,
s.Lvals, s.Lcols, s.Lrows,
s.Uvals, s.Ucols, s.Urows);
#else
Timer t_copyToGpu;
@ -576,16 +243,10 @@ BILU0<block_size>::~BILU0()
for (int color = 0; color < numColors; ++color) {
const unsigned int firstRow = rowsPerColorPrefix[color];
const unsigned int lastRow = rowsPerColorPrefix[color+1];
const unsigned int work_group_size2 = 128;
const unsigned int num_work_groups2 = 1024;
const unsigned int total_work_items2 = num_work_groups2 * work_group_size2;
const unsigned int num_hwarps_per_group = work_group_size2 / 16;
const unsigned int lmem_per_work_group2 = num_hwarps_per_group * bs * bs * sizeof(double); // each block needs a pivot
if (verbosity >= 4) {
out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
}
event = (*ilu_decomp)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)), firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.invDiagVals, s.diagIndex, LUmat->Nb, cl::Local(lmem_per_work_group2));
event.wait();
OpenclKernels::ILU_decomp(firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, Nb, block_size);
}
if (verbosity >= 3) {
@ -601,7 +262,7 @@ BILU0<block_size>::~BILU0()
// however, if individual kernel calls are timed, waiting for events is needed
// behavior on other GPUs is untested
template <unsigned int block_size>
void BILU0<block_size>::apply(cl::Buffer& y, cl::Buffer& x)
void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
{
const double relaxation = 0.9;
cl::Event event;
@ -609,28 +270,24 @@ BILU0<block_size>::~BILU0()
for(int color = 0; color < numColors; ++color){
#if CHOW_PATEL
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, s.diagIndex, y, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size);
#else
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, y, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size);
#endif
// event.wait();
}
for(int color = numColors-1; color >= 0; --color){
#if CHOW_PATEL
event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size);
#else
event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size);
#endif
// event.wait();
}
// apply relaxation
event = (*scale)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), x, relaxation, N);
event.wait();
OpenclKernels::scale(x, relaxation, N);
if (verbosity >= 4) {
event.wait();
std::ostringstream out;
out << "BILU0 apply: " << t_apply.stop() << " s";
OpmLog::info(out.str());
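The core pattern of this refactor: kernel launch configuration moves out of the callers and into static OpenclKernels wrappers. A hedged sketch of what such a wrapper might look like (hypothetical body, since openclKernels.cpp is not shown in this excerpt; scale_k and queue are assumed to be static members initialized elsewhere):
void OpenclKernels::scale(cl::Buffer& in, const double a, const int N)
{
    const unsigned int work_group_size = 32; // hypothetical default
    const unsigned int num_work_groups = (N + work_group_size - 1) / work_group_size;
    // the same cl::KernelFunctor call the callers previously wrote by hand
    cl::Event event = (*scale_k)(cl::EnqueueArgs(*queue,
                                     cl::NDRange(num_work_groups * work_group_size),
                                     cl::NDRange(work_group_size)),
                                 in, a, N);
    event.wait();
}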
@ -646,42 +303,17 @@ template <unsigned int block_size>
void BILU0<block_size>::setOpenCLQueue(cl::CommandQueue *queue_) {
this->queue = queue_;
}
template <unsigned int block_size>
void BILU0<block_size>::setKernelParameters(const unsigned int work_group_size_, const unsigned int total_work_items_, const unsigned int lmem_per_work_group_) {
this->work_group_size = work_group_size_;
this->total_work_items = total_work_items_;
this->lmem_per_work_group = lmem_per_work_group_;
}
template <unsigned int block_size>
void BILU0<block_size>::setKernels(
cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1_,
cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2_,
cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> *scale_,
cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp_
){
this->ILU_apply1 = ILU_apply1_;
this->ILU_apply2 = ILU_apply2_;
this->scale = scale_;
this->ilu_decomp = ilu_decomp_;
}
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template BILU0<n>::BILU0(ILUReorder, int); \
template BILU0<n>::~BILU0(); \
template bool BILU0<n>::init(BlockedMatrix<n>*); \
template void BILU0<n>::chow_patel_decomposition(); \
template bool BILU0<n>::create_preconditioner(BlockedMatrix<n>*); \
template void BILU0<n>::apply(cl::Buffer& x, cl::Buffer& y); \
template void BILU0<n>::setOpenCLContext(cl::Context*); \
template void BILU0<n>::setOpenCLQueue(cl::CommandQueue*); \
template void BILU0<n>::setKernelParameters(unsigned int, unsigned int, unsigned int); \
template void BILU0<n>::setKernels( \
cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> *, \
cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> * \
);
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template BILU0<n>::BILU0(ILUReorder, int); \
template BILU0<n>::~BILU0(); \
template bool BILU0<n>::init(BlockedMatrix<n>*); \
template bool BILU0<n>::create_preconditioner(BlockedMatrix<n>*); \
template void BILU0<n>::apply(const cl::Buffer&, cl::Buffer&); \
template void BILU0<n>::setOpenCLContext(cl::Context*); \
template void BILU0<n>::setOpenCLQueue(cl::CommandQueue*);
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
@ -692,4 +324,5 @@ INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // end namespace bda
} // namespace Accelerator
} // namespace Opm

View File

@ -29,19 +29,10 @@
#include <opm/simulators/linalg/bda/openclKernels.hpp>
#include <opm/simulators/linalg/bda/ChowPatelIlu.hpp>
// if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU
// if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in:
// FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896
// if CHOW_PATEL_GPU is 0, the decomposition is done on CPU
// if CHOW_PATEL_GPU is 1, the decomposition is done by bda::FGPILU::decomposition() on GPU
// the apply phase of the ChowPatelIlu uses two triangular matrices: L and U
// the exact decomposition uses a full matrix LU which is the superposition of L and U
// ChowPatelIlu could also operate on a full matrix LU when L and U are merged, but it is generally better to keep them split
#define CHOW_PATEL 0
#define CHOW_PATEL_GPU 1
namespace bda
namespace Opm
{
namespace Accelerator
{
/// This class implements a Blocked ILU0 preconditioner
@ -83,25 +74,15 @@ namespace bda
#endif
} GPU_storage;
ilu_apply1_kernel_type *ILU_apply1;
ilu_apply2_kernel_type *ILU_apply2;
cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> *scale;
cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&,
const int, cl::LocalSpaceArg> *ilu_decomp;
GPU_storage s;
cl::Context *context;
cl::CommandQueue *queue;
std::vector<cl::Event> events;
cl_int err;
int work_group_size = 0;
int total_work_items = 0;
int lmem_per_work_group = 0;
ChowPatelIlu chowPatelIlu;
void chow_patel_decomposition();
#if CHOW_PATEL
ChowPatelIlu<block_size> chowPatelIlu;
#endif
public:
@ -116,17 +97,10 @@ namespace bda
bool create_preconditioner(BlockedMatrix<block_size> *mat);
// apply preconditioner, x = prec(y)
void apply(cl::Buffer& y, cl::Buffer& x);
void apply(const cl::Buffer& y, cl::Buffer& x);
void setOpenCLContext(cl::Context *context);
void setOpenCLQueue(cl::CommandQueue *queue);
void setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group);
void setKernels(
cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1,
cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2,
cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> *scale,
cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp
);
int* getToOrder()
{
@ -145,7 +119,8 @@ namespace bda
};
} // end namespace bda
} // namespace Accelerator
} // namespace Opm
#endif

View File

@ -47,10 +47,10 @@ typedef Dune::InverseOperatorResult InverseOperatorResult;
namespace Opm
{
using bda::BdaResult;
using bda::BdaSolver;
using bda::SolverStatus;
using bda::ILUReorder;
using Opm::Accelerator::BdaResult;
using Opm::Accelerator::BdaSolver;
using Opm::Accelerator::SolverStatus;
using Opm::Accelerator::ILUReorder;
template <class BridgeMatrix, class BridgeVector, int block_size>
BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string accelerator_mode_,
@ -65,7 +65,7 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
if (accelerator_mode.compare("cusparse") == 0) {
#if HAVE_CUDA
use_gpu = true;
backend.reset(new bda::cusparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, deviceID));
backend.reset(new Opm::Accelerator::cusparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, deviceID));
#else
OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake");
#endif
@ -74,17 +74,17 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
use_gpu = true;
ILUReorder ilu_reorder;
if (opencl_ilu_reorder == "") {
ilu_reorder = bda::ILUReorder::GRAPH_COLORING; // default when not selected by user
ilu_reorder = Opm::Accelerator::ILUReorder::GRAPH_COLORING; // default when not selected by user
} else if (opencl_ilu_reorder == "level_scheduling") {
ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING;
ilu_reorder = Opm::Accelerator::ILUReorder::LEVEL_SCHEDULING;
} else if (opencl_ilu_reorder == "graph_coloring") {
ilu_reorder = bda::ILUReorder::GRAPH_COLORING;
ilu_reorder = Opm::Accelerator::ILUReorder::GRAPH_COLORING;
} else if (opencl_ilu_reorder == "none") {
ilu_reorder = bda::ILUReorder::NONE;
ilu_reorder = Opm::Accelerator::ILUReorder::NONE;
} else {
OPM_THROW(std::logic_error, "Error invalid argument for --opencl-ilu-reorder, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring]'");
}
backend.reset(new bda::openclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID, ilu_reorder));
backend.reset(new Opm::Accelerator::openclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID, ilu_reorder));
#else
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
#endif
@ -93,22 +93,22 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
use_fpga = true;
ILUReorder ilu_reorder;
if (opencl_ilu_reorder == "") {
ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING; // default when not selected by user
ilu_reorder = Opm::Accelerator::ILUReorder::LEVEL_SCHEDULING; // default when not selected by user
} else if (opencl_ilu_reorder == "level_scheduling") {
ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING;
ilu_reorder = Opm::Accelerator::ILUReorder::LEVEL_SCHEDULING;
} else if (opencl_ilu_reorder == "graph_coloring") {
ilu_reorder = bda::ILUReorder::GRAPH_COLORING;
ilu_reorder = Opm::Accelerator::ILUReorder::GRAPH_COLORING;
} else {
OPM_THROW(std::logic_error, "Error invalid argument for --opencl-ilu-reorder, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring]'");
}
backend.reset(new bda::FpgaSolverBackend<block_size>(fpga_bitstream, linear_solver_verbosity, maxit, tolerance, ilu_reorder));
backend.reset(new Opm::Accelerator::FpgaSolverBackend<block_size>(fpga_bitstream, linear_solver_verbosity, maxit, tolerance, ilu_reorder));
#else
OPM_THROW(std::logic_error, "Error fpgaSolver was chosen, but FPGA was not enabled by CMake");
#endif
} else if (accelerator_mode.compare("amgcl") == 0) {
#if HAVE_AMGCL
use_gpu = true; // should be replaced by a 'use_bridge' boolean
backend.reset(new bda::amgclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID));
backend.reset(new Opm::Accelerator::amgclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID));
#else
OPM_THROW(std::logic_error, "Error amgclSolver was chosen, but amgcl was not found by CMake");
#endif
@ -269,7 +269,7 @@ template <class BridgeMatrix, class BridgeVector, int block_size>
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions([[maybe_unused]] WellContributions& wellContribs) {
if(accelerator_mode.compare("opencl") == 0){
#if HAVE_OPENCL
const auto openclBackend = static_cast<const bda::openclSolverBackend<block_size>*>(backend.get());
const auto openclBackend = static_cast<const Opm::Accelerator::openclSolverBackend<block_size>*>(backend.get());
wellContribs.setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get());
#else
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");

View File

@ -38,7 +38,7 @@ namespace Opm
{
typedef Dune::InverseOperatorResult InverseOperatorResult;
using bda::ILUReorder;
using Opm::Accelerator::ILUReorder;
/// BdaBridge acts as an interface between opm-simulators and the BdaSolvers
template <class BridgeMatrix, class BridgeVector, int block_size>
@ -49,7 +49,7 @@ private:
bool use_gpu = false;
bool use_fpga = false;
std::string accelerator_mode;
std::unique_ptr<bda::BdaSolver<block_size> > backend;
std::unique_ptr<Opm::Accelerator::BdaSolver<block_size> > backend;
public:
/// Construct a BdaBridge

View File

@ -20,7 +20,9 @@
#ifndef BDARESULT_HEADER_INCLUDED
#define BDARESULT_HEADER_INCLUDED
namespace bda
namespace Opm
{
namespace Accelerator
{
/// This class is based on InverseOperatorResult struct from dune/istl/solver.hh
@ -39,6 +41,7 @@ public:
}; // end class BdaResult
}
} // namespace Accelerator
} // namespace Opm
#endif

View File

@ -24,7 +24,9 @@
#include <opm/simulators/linalg/bda/BdaResult.hpp>
#include <opm/simulators/linalg/bda/WellContributions.hpp>
namespace bda
namespace Opm
{
namespace Accelerator
{
using Opm::WellContributions;
@ -91,6 +93,7 @@ namespace bda
}; // end class BdaSolver
} // end namespace bda
} // namespace Accelerator
} // namespace Opm
#endif

View File

@ -28,7 +28,9 @@
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
namespace bda
namespace Opm
{
namespace Accelerator
{
using Opm::OpmLog;
@ -504,5 +506,5 @@ INSTANTIATE_BDA_FPGA_FUNCTIONS(4);
#undef INSTANTIATE_BDA_FPGA_FUNCTIONS
#endif // HAVE_FPGA
} // end namespace bda
} // namespace Accelerator
} // namespace Opm

View File

@ -26,7 +26,9 @@
#include <opm/simulators/linalg/bda/FPGAMatrix.hpp>
namespace bda
namespace Opm
{
namespace Accelerator
{
/// This struct resembles a blocked csr matrix, like Dune::BCRSMatrix.
@ -164,6 +166,7 @@ void blockVectMult(double *mat, double *vect, double scale, double *resVect, boo
void blockedDiagtoRDF(double *blockedDiagVals, int rowSize, int numColors, std::vector<int>& rowsPerColor, double *RDFDiag);
#endif
} // end namespace bda
} // namespace Accelerator
} // namespace Opm
#endif

View File

@ -18,18 +18,25 @@
*/
#include <config.h>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <dune/common/timer.hh>
#include <opm/simulators/linalg/MatrixBlock.hpp>
#include <opm/simulators/linalg/bda/ChowPatelIlu.hpp>
namespace bda
#if CHOW_PATEL
namespace Opm
{
namespace Accelerator
{
using Opm::OpmLog;
using Opm::OpmLog;
using Dune::Timer;
// if PARALLEL is 0:
// if CHOW_PATEL_GPU_PARALLEL is 0:
// Each row gets 1 workgroup, 1 workgroup can do multiple rows sequentially.
// Each block in a row gets 1 workitem, all blocks are expected to be processed simultaneously,
// except when the number of blocks in that row exceeds the number of workitems per workgroup.
@ -41,15 +48,13 @@ namespace bda
// If the number of blocks exceeds the number of warps, some warps will process multiple blocks sequentially.
// Notes:
// PARALLEL 0 should be able to run with any number of workitems per workgroup, but 8 and 16 tend to be quicker than 32.
// PARALLEL 1 should be run with at least 32 workitems per workgroup.
// CHOW_PATEL_GPU_PARALLEL 0 should be able to run with any number of workitems per workgroup, but 8 and 16 tend to be quicker than 32.
// CHOW_PATEL_GPU_PARALLEL 1 should be run with at least 32 workitems per workgroup.
// The recommended number of workgroups for both options is Nb, which gives every row their own workgroup.
// PARALLEL 0 is generally faster, despite not having parallelization.
// CHOW_PATEL_GPU_PARALLEL 0 is generally faster, despite not having parallelization.
// only 3x3 blocks are supported
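// In terms of launch parameters, the notes above amount to:
//   num_work_groups  = Nb   (one workgroup per block row)
//   work_group_size  = 32 with CHOW_PATEL_GPU_PARALLEL, 16 otherwise
//   total_work_items = num_work_groups * work_group_size
// (see gpu_decomposition() further down)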
#define PARALLEL 0
#if PARALLEL
#if CHOW_PATEL_GPU_PARALLEL
inline const char* chow_patel_ilu_sweep_s = R"(
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
@ -265,7 +270,7 @@ __kernel void chow_patel_ilu_sweep(
}
)";
#else
#else // CHOW_PATEL_GPU_PARALLEL
inline const char* chow_patel_ilu_sweep_s = R"(
@ -468,22 +473,378 @@ __kernel void chow_patel_ilu_sweep(
}
)";
#endif // CHOW_PATEL_GPU_PARALLEL
// implements the Fine-Grained Parallel ILU algorithm (FGPILU) of Chow and Patel, 2015
template <unsigned int block_size>
void ChowPatelIlu<block_size>::decomposition(
cl::CommandQueue *queue, [[maybe_unused]] cl::Context *context,
BlockedMatrix<block_size> *LUmat, BlockedMatrix<block_size> *Lmat, BlockedMatrix<block_size> *Umat,
double *invDiagVals, std::vector<int>& diagIndex,
cl::Buffer& d_diagIndex, cl::Buffer& d_invDiagVals,
cl::Buffer& d_Lvals, cl::Buffer& d_Lcols, cl::Buffer& d_Lrows,
cl::Buffer& d_Uvals, cl::Buffer& d_Ucols, cl::Buffer& d_Urows)
{
if (block_size != 3) {
OPM_THROW(std::logic_error, "ChowPatelIlu::decomposition only supports block_size = 3");
}
const unsigned int bs = block_size;
const int Nb = LUmat->Nb;
const int nnzbs = LUmat->nnzbs;
int num_sweeps = 6;
// split matrix into L and U
// also convert U into BSC format (Ut)
// Ut stores diagonal for now
// original matrix LUmat is assumed to be symmetric
#ifndef NDEBUG
// verify that matrix is symmetric
for (int i = 0; i < Nb; ++i){
int iRowStart = LUmat->rowPointers[i];
int iRowEnd = LUmat->rowPointers[i + 1];
// for every block (i, j) in this row, check if (j, i) also exists
for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = LUmat->colIndices[ij];
int jRowStart = LUmat->rowPointers[j];
int jRowEnd = LUmat->rowPointers[j + 1];
bool blockFound = false;
// check all blocks on row j
// binary search is possible
for (int ji = jRowStart; ji < jRowEnd; ji++) {
int row = LUmat->colIndices[ji];
if (i == row) {
blockFound = true;
break;
}
}
if (false == blockFound) {
OPM_THROW(std::logic_error, "Error sparsity pattern must be symmetric when using chow_patel_decomposition()");
}
}
}
#endif
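// Note (hedged): assuming the colIndices within each row are sorted, the
// linear scan in the symmetry check above could use std::binary_search:
//   bool blockFound = std::binary_search(LUmat->colIndices + jRowStart,
//                                        LUmat->colIndices + jRowEnd, i);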
Timer t_total, t_preprocessing;
// Ut is actually BSC format
std::unique_ptr<BlockedMatrix<bs> > Ut = std::make_unique<BlockedMatrix<bs> >(Nb, (nnzbs + Nb) / 2);
Lmat->rowPointers[0] = 0;
for (int i = 0; i < Nb+1; i++) {
Ut->rowPointers[i] = 0;
}
Opm::Detail::Inverter<bs> inverter;
// store inverted diagonal
for (int i = 0; i < Nb; i++) {
int iRowStart = LUmat->rowPointers[i];
int iRowEnd = LUmat->rowPointers[i + 1];
// for every block in this row
for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = LUmat->colIndices[ij];
if (i == j) {
inverter(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs);
}
}
}
// initialize initial guess for L: L_A * D
// L_A is strictly lower triangular part of A
// D is inv(diag(A))
int num_blocks_L = 0;
for (int i = 0; i < Nb; i++) {
int iRowStart = LUmat->rowPointers[i];
int iRowEnd = LUmat->rowPointers[i + 1];
// for every block in this row
for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = LUmat->colIndices[ij];
if (i <= j) {
Ut->rowPointers[j+1]++; // actually colPointers, now simply indicates how many blocks this col holds
} else {
Lmat->colIndices[num_blocks_L] = j;
// multiply block of L with corresponding diag block
blockMult<bs>(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, Lmat->nnzValues + num_blocks_L * bs * bs);
num_blocks_L++;
}
}
// TODO: copy all blocks for L at once, instead of copying each block individually
Lmat->rowPointers[i+1] = num_blocks_L;
}
// prefix sum to sum rowsizes into colpointers
std::partial_sum(Ut->rowPointers, Ut->rowPointers+Nb+1, Ut->rowPointers);
// initialize initial guess for U
for (int i = 0; i < Nb; i++) {
int iRowStart = LUmat->rowPointers[i];
int iRowEnd = LUmat->rowPointers[i + 1];
// for every block in this row
for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = LUmat->colIndices[ij];
if (i <= j){
int idx = Ut->rowPointers[j]++; // rowPointers[i] is (mis)used as the write offset of the current row i
Ut->colIndices[idx] = i; // actually rowIndices
memcpy(Ut->nnzValues + idx * bs * bs, LUmat->nnzValues + ij * bs * bs, sizeof(double) * bs * bs);
}
}
}
// rotate
// the Ut->rowPointers were increased in the last loop
// now Ut->rowPointers[i+1] holds the value that Ut->rowPointers[i] should have for a CRS matrix; shift back to restore the expected values
for (int i = Nb; i > 0; --i) {
Ut->rowPointers[i] = Ut->rowPointers[i-1];
}
Ut->rowPointers[0] = 0;
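// Toy illustration (hedged) of the count / prefix-sum / scatter / shift idiom
// used above, for Nb = 3 and upper-triangular entries with cols = {0, 2, 1, 2}:
//   count:   ptr = {0, 1, 1, 2}   // ptr[c+1]++ per entry
//   prefix:  ptr = {0, 1, 2, 4}   // std::partial_sum
//   scatter: ptr[c]++ is the write offset; afterwards ptr = {1, 2, 4, 4}
//   shift:   ptr = {0, 1, 2, 4}   // each ptr[i] takes the old ptr[i-1]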
// Utmp is needed for both CPU and GPU decomposition, because U is transposed and reversed after decomposition
// U will be reversed because it is used for backward substitution, which consumes the last row first
// Ltmp is only needed for the CPU decomposition; the GPU path creates its own buffer for Ltmp
double *Utmp = new double[Ut->nnzbs * block_size * block_size];
if (verbosity >= 3) {
std::ostringstream out;
out << "BILU0 ChowPatel preprocessing: " << t_preprocessing.stop() << " s";
OpmLog::info(out.str());
}
// actual ILU decomposition
Timer t_decomposition;
#if CHOW_PATEL_GPU
gpu_decomposition(queue, context,
Ut->rowPointers, Ut->colIndices, Ut->nnzValues, Ut->nnzbs,
Lmat->rowPointers, Lmat->colIndices, Lmat->nnzValues, Lmat->nnzbs,
LUmat->rowPointers, LUmat->colIndices, LUmat->nnzValues, LUmat->nnzbs,
Nb, num_sweeps);
#else
double *Ltmp = new double[Lmat->nnzbs * block_size * block_size];
for (int sweep = 0; sweep < num_sweeps; ++sweep) {
// algorithm
// for every block in A (LUmat):
// if i > j:
// Lij = (Aij - sum k=1 to j-1 {Lik*Ukj}) / Ujj
// else:
// Uij = (Aij - sum k=1 to i-1 {Lik*Ukj})
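// each sweep reads L and U as computed by the previous sweep (Jacobi style),
// so the updates within one sweep are independent of each other; a fixed
// point of these equations is the exact ILU0 factorization on this sparsity
// pattern (Chow & Patel 2015)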
// for every row
for (int row = 0; row < Nb; row++) {
// update U
// Uij = (Aij - sum k=1 to i-1 {Lik*Ukj})
int jColStart = Ut->rowPointers[row];
int jColEnd = Ut->rowPointers[row + 1];
int colU = row; // rename for clarity, next row in Ut means next col in U
// for every block in this row
for (int ij = jColStart; ij < jColEnd; ij++) {
int rowU1 = Ut->colIndices[ij]; // actually rowIndices for U
// refine Uij element (or diagonal)
int i1 = LUmat->rowPointers[rowU1];
int i2 = LUmat->rowPointers[rowU1+1];
// search on row rowU1, find blockIndex in LUmat of block with same col (colU) as Uij
// LUmat->nnzValues[kk] is block Aij
auto candidate = std::find(LUmat->colIndices + i1, LUmat->colIndices + i2, colU);
assert(candidate != LUmat->colIndices + i2);
auto kk = candidate - LUmat->colIndices;
double aij[bs*bs];
// copy block to Aij so operations can be done on it without affecting LUmat
memcpy(&aij[0], LUmat->nnzValues + kk * bs * bs, sizeof(double) * bs * bs);
int jk = Lmat->rowPointers[rowU1]; // points to row rowU1 in L
// if row rowU1 is empty, skip row
if (jk < Lmat->rowPointers[rowU1+1]) {
int colL = Lmat->colIndices[jk];
// only check until block U(i,j) is reached
for (int k = jColStart; k < ij; ++k) {
int rowU2 = Ut->colIndices[k];
while (colL < rowU2) {
++jk; // check next block on row rowU1 of L
colL = Lmat->colIndices[jk];
}
if (colL == rowU2) {
// Aij -= (Lik * Ukj)
blockMultSub<bs>(&aij[0], Lmat->nnzValues + jk * bs * bs, Ut->nnzValues + k * bs * bs);
}
}
}
// Uij_new = Aij - sum
memcpy(Utmp + ij * bs * bs, &aij[0], sizeof(double) * bs * bs);
}
// update L
// Lij = (Aij - sum k=1 to j-1 {Lik*Ukj}) / Ujj
int iRowStart = Lmat->rowPointers[row];
int iRowEnd = Lmat->rowPointers[row + 1];
for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = Lmat->colIndices[ij];
// refine Lij element
// search on row 'row', find blockIndex in LUmat of block with same col (j) as Lij
// LUmat->nnzValues[kk] is block Aij
int i1 = LUmat->rowPointers[row];
int i2 = LUmat->rowPointers[row+1];
auto candidate = std::find(LUmat->colIndices + i1, LUmat->colIndices + i2, j);
assert(candidate != LUmat->colIndices + i2);
auto kk = candidate - LUmat->colIndices;
double aij[bs*bs];
// copy block to Aij so operations can be done on it without affecting LUmat
memcpy(&aij[0], LUmat->nnzValues + kk * bs * bs, sizeof(double) * bs * bs);
int jk = Ut->rowPointers[j]; // actually colPointers, jk points to col j in U
int rowU = Ut->colIndices[jk]; // actually rowIndices, rowU is the row of block jk
// only check until block L(i,j) is reached
for (int k = iRowStart; k < ij; ++k) {
int colL = Lmat->colIndices[k];
while (rowU < colL) {
++jk; // check next block on col j of U
rowU = Ut->colIndices[jk];
}
if (rowU == colL) {
// Aij -= (Lik * Ukj)
blockMultSub<bs>(&aij[0], Lmat->nnzValues + k * bs * bs , Ut->nnzValues + jk * bs * bs);
}
}
// calculate (Aij - sum) / Ujj
double ujj[bs*bs];
inverter(Ut->nnzValues + (Ut->rowPointers[j+1] - 1) * bs * bs, &ujj[0]);
// Lij_new = (Aij - sum) / Ujj
blockMult<bs>(&aij[0], &ujj[0], Ltmp + ij * bs * bs);
}
}
// 1st sweep writes to Ltmp
// 2nd sweep writes to Lmat->nnzValues
std::swap(Lmat->nnzValues, Ltmp);
std::swap(Ut->nnzValues, Utmp);
} // end sweep
// if number of sweeps is even, swap again so data is in Lmat->nnzValues
if (num_sweeps % 2 == 0) {
std::swap(Lmat->nnzValues, Ltmp);
std::swap(Ut->nnzValues, Utmp);
}
delete[] Ltmp;
#endif
if (verbosity >= 3){
std::ostringstream out;
out << "BILU0 ChowPatel decomposition: " << t_decomposition.stop() << " s";
OpmLog::info(out.str());
}
Timer t_postprocessing;
// convert Ut to BSR
// diagonal stored separately
std::vector<int> ptr(Nb+1, 0);
std::vector<int> cols(Ut->rowPointers[Nb]);
// count blocks per row for U (BSR)
// store diagonal in invDiagVals
for(int i = 0; i < Nb; ++i) {
for(int k = Ut->rowPointers[i]; k < Ut->rowPointers[i+1]; ++k) {
int j = Ut->colIndices[k];
if (j == i) {
inverter(Ut->nnzValues + k * bs * bs, invDiagVals + i * bs * bs);
} else {
++ptr[j+1];
}
}
}
// prefix sum
std::partial_sum(ptr.begin(), ptr.end(), ptr.begin());
// actually copy nonzero values for U
for(int i = 0; i < Nb; ++i) {
for(int k = Ut->rowPointers[i]; k < Ut->rowPointers[i+1]; ++k) {
int j = Ut->colIndices[k];
if (j != i) {
int head = ptr[j]++;
cols[head] = i;
memcpy(Utmp + head * bs * bs, Ut->nnzValues + k * bs * bs, sizeof(double) * bs * bs);
}
}
}
// the ptr[] entries were increased in the last loop; rotate to restore the row pointers
std::rotate(ptr.begin(), ptr.end() - 1, ptr.end());
ptr.front() = 0;
if (verbosity >= 3){
std::ostringstream out;
out << "BILU0 ChowPatel postprocessing: " << t_postprocessing.stop() << " s";
OpmLog::info(out.str());
}
Timer t_copyToGpu;
events.resize(3);
err = queue->enqueueWriteBuffer(d_Lvals, CL_FALSE, 0, Lmat->nnzbs * bs * bs * sizeof(double), Lmat->nnzValues, nullptr, &events[0]);
err |= queue->enqueueWriteBuffer(d_Uvals, CL_FALSE, 0, Umat->nnzbs * bs * bs * sizeof(double), Utmp, nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(d_invDiagVals, CL_FALSE, 0, LUmat->Nb * bs * bs * sizeof(double), invDiagVals, nullptr, &events[2]);
std::call_once(pattern_uploaded, [&](){
// find the positions of each diagonal block
// must be done after reordering
for (int row = 0; row < Nb; ++row) {
int rowStart = LUmat->rowPointers[row];
int rowEnd = LUmat->rowPointers[row+1];
auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row);
assert(candidate != LUmat->colIndices + rowEnd);
diagIndex[row] = candidate - LUmat->colIndices;
}
events.resize(8);
err |= queue->enqueueWriteBuffer(d_diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[3]);
err |= queue->enqueueWriteBuffer(d_Lcols, CL_FALSE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices, nullptr, &events[4]);
err |= queue->enqueueWriteBuffer(d_Lrows, CL_FALSE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers, nullptr, &events[5]);
err |= queue->enqueueWriteBuffer(d_Ucols, CL_FALSE, 0, Umat->nnzbs * sizeof(int), cols.data(), nullptr, &events[6]);
err |= queue->enqueueWriteBuffer(d_Urows, CL_FALSE, 0, (Umat->Nb + 1) * sizeof(int), ptr.data(), nullptr, &events[7]);
});
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueueWriteBuffer returns an error code instead of throwing an exception, so check it explicitly
OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
}
if (verbosity >= 3){
std::ostringstream out;
out << "BILU0 ChowPatel copy to GPU: " << t_copyToGpu.stop() << " s\n";
out << "BILU0 ChowPatel total: " << t_total.stop() << " s";
OpmLog::info(out.str());
}
delete[] Utmp;
}
void ChowPatelIlu::decomposition(
template <unsigned int block_size>
void ChowPatelIlu<block_size>::gpu_decomposition(
cl::CommandQueue *queue, cl::Context *context,
int *Ut_ptrs, int *Ut_idxs, double *Ut_vals, int Ut_nnzbs,
int *L_rows, int *L_cols, double *L_vals, int L_nnzbs,
int *LU_rows, int *LU_cols, double *LU_vals, int LU_nnzbs,
int Nb, int num_sweeps, int verbosity)
int Nb, int num_sweeps)
{
const int block_size = 3;
if (block_size != 3) {
OPM_THROW(std::logic_error, "ChowPatelIlu::gpu_decomposition only supports block_size = 3");
}
try {
// just put everything in the capture list
@ -535,7 +896,7 @@ void ChowPatelIlu::decomposition(
OpmLog::info(out.str());
}
std::ostringstream out;
out << "ChowPatelIlu PARALLEL: " << PARALLEL;
out << "ChowPatelIlu PARALLEL: " << CHOW_PATEL_GPU_PARALLEL;
OpmLog::info(out.str());
});
@ -569,7 +930,7 @@ void ChowPatelIlu::decomposition(
auto *Uarg1 = (sweep % 2 == 0) ? &d_Ut_vals : &d_Utmp;
auto *Uarg2 = (sweep % 2 == 0) ? &d_Utmp : &d_Ut_vals;
int num_work_groups = Nb;
#if PARALLEL
#if CHOW_PATEL_GPU_PARALLEL
int work_group_size = 32;
#else
int work_group_size = 16;
@ -625,5 +986,26 @@ void ChowPatelIlu::decomposition(
}
} // end namespace bda
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template void ChowPatelIlu<n>::decomposition( \
cl::CommandQueue *queue, cl::Context *context, \
BlockedMatrix<n> *LUmat, BlockedMatrix<n> *Lmat, BlockedMatrix<n> *Umat, \
double *invDiagVals, std::vector<int>& diagIndex, \
cl::Buffer& d_diagIndex, cl::Buffer& d_invDiagVals, \
cl::Buffer& d_Lvals, cl::Buffer& d_Lcols, cl::Buffer& d_Lrows, \
cl::Buffer& d_Uvals, cl::Buffer& d_Ucols, cl::Buffer& d_Urows);
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace Accelerator
} // namespace Opm
#endif // CHOW_PATEL

View File

@ -24,64 +24,106 @@
#include <mutex>
#include <opm/simulators/linalg/bda/opencl.hpp>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
// Variables CHOW_PATEL, CHOW_PATEL_GPU and CHOW_PATEL_GPU_PARALLEL are set by CMake
// Pass -DUSE_CHOW_PATEL_ILU=1 to cmake to define CHOW_PATEL and use the iterative ILU decomposition
// Pass -DUSE_CHOW_PATEL_ILU_GPU=1 to run the ILU decomposition sweeps on the GPU
// Pass -DUSE_CHOW_PATEL_ILU_GPU_PARALLEL=1 to use more parallelisation in the GPU kernel, see ChowPatelIlu.cpp
namespace bda
// if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU
// if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in:
// FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896
// if CHOW_PATEL_GPU is 0, the decomposition is done on CPU
// if CHOW_PATEL_GPU is 1, the decomposition is done by gpu_decomposition() on GPU
// the apply phase of the ChowPatelIlu uses two triangular matrices: L and U
// the exact decomposition uses a full matrix LU which is the superposition of L and U
// ChowPatelIlu could also operate on a full matrix LU when L and U are merged, but it is generally better to keep them split
#if CHOW_PATEL
namespace Opm
{
namespace Accelerator
{
// This class implements a blocked version on GPU of the Fine-Grained Parallel ILU (FGPILU) by Chow and Patel 2015:
// FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896
// only blocksize == 3 is supported
// decomposition() allocates the cl::Buffers on the first call, these are C++ objects that deallocate automatically
class ChowPatelIlu
{
private:
cl::Buffer d_Ut_vals, d_L_vals, d_LU_vals;
cl::Buffer d_Ut_ptrs, d_Ut_idxs;
cl::Buffer d_L_rows, d_L_cols;
cl::Buffer d_LU_rows, d_LU_cols;
cl::Buffer d_Ltmp, d_Utmp;
cl::Event event;
std::vector<cl::Event> events;
cl_int err;
std::once_flag initialize_flag;
// This class implements a blocked version on GPU of the Fine-Grained Parallel ILU (FGPILU) by Chow and Patel 2015:
// FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896
// only blocksize == 3 is supported
// decomposition() allocates the cl::Buffers on the first call, these are C++ objects that deallocate automatically
template <unsigned int block_size>
class ChowPatelIlu
{
private:
cl::Buffer d_Ut_vals, d_L_vals, d_LU_vals;
cl::Buffer d_Ut_ptrs, d_Ut_idxs;
cl::Buffer d_L_rows, d_L_cols;
cl::Buffer d_LU_rows, d_LU_cols;
cl::Buffer d_Ltmp, d_Utmp;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&,
const int, cl::LocalSpaceArg, cl::LocalSpaceArg> > chow_patel_ilu_sweep_k;
cl::Event event;
std::vector<cl::Event> events;
cl_int err;
std::once_flag initialize_flag;
std::once_flag pattern_uploaded;
int verbosity = 0;
public:
/// Executes the ChowPatelIlu sweeps
/// also copies data from CPU to GPU and GPU to CPU
/// \param[in] queue OpenCL commandqueue
/// \param[in] context OpenCL context
/// \param[in] Ut_ptrs BSC columnpointers
/// \param[in] Ut_idxs BSC rowindices
/// \param[inout] Ut_vals actual nonzeros for U
/// \param[in] Ut_nnzbs number of blocks in U
/// \param[in] L_rows BSR rowpointers
/// \param[in] L_cols BSR columnindices
/// \param[inout] L_vals actual nonzeroes for L
/// \param[in] L_nnzbs number of blocks in L
/// \param[in] LU_rows BSR rowpointers
/// \param[in] LU_cols BSR columnindices
/// \param[in] LU_vals actual nonzeroes for LU (original matrix)
/// \param[in] LU_nnzbs number of blocks in LU
/// \param[in] Nb number of blockrows
/// \param[in] num_sweeps number of sweeps to be done
/// \param[in] verbosity print verbosity
void decomposition(
cl::CommandQueue *queue, cl::Context *context,
int *Ut_ptrs, int *Ut_idxs, double *Ut_vals, int Ut_nnzbs,
int *L_rows, int *L_cols, double *L_vals, int L_nnzbs,
int *LU_rows, int *LU_cols, double *LU_vals, int LU_nnzbs,
int Nb, int num_sweeps, int verbosity);
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&,
const int, cl::LocalSpaceArg, cl::LocalSpaceArg> > chow_patel_ilu_sweep_k;
};
public:
/// Transposes the U matrix
/// Executes the ChowPatelIlu sweeps for decomposition on CPU
/// Also uploads the decomposition to the GPU (and sparsity pattern if needed)
/// This function calls gpu_decomposition() if CHOW_PATEL_GPU is set
void decomposition(
cl::CommandQueue *queue, cl::Context *context,
BlockedMatrix<block_size> *LUmat, BlockedMatrix<block_size> *Lmat, BlockedMatrix<block_size> *Umat,
double *invDiagVals, std::vector<int>& diagIndex,
cl::Buffer& d_diagIndex, cl::Buffer& d_invDiagVals,
cl::Buffer& d_Lvals, cl::Buffer& d_Lcols, cl::Buffer& d_Lrows,
cl::Buffer& d_Uvals, cl::Buffer& d_Ucols, cl::Buffer& d_Urows);
} // end namespace bda
/// Executes the ChowPatelIlu sweeps for decomposition on GPU
/// also copies data from CPU to GPU and GPU to CPU
/// \param[in] queue OpenCL commandqueue
/// \param[in] context OpenCL context
/// \param[in] Ut_ptrs BSC columnpointers
/// \param[in] Ut_idxs BSC rowindices
/// \param[inout] Ut_vals actual nonzeros for U
/// \param[in] Ut_nnzbs number of blocks in U
/// \param[in] L_rows BSR rowpointers
/// \param[in] L_cols BSR columnindices
/// \param[inout] L_vals actual nonzeros for L
/// \param[in] L_nnzbs number of blocks in L
/// \param[in] LU_rows BSR rowpointers
/// \param[in] LU_cols BSR columnindices
/// \param[in] LU_vals actual nonzeros for LU (original matrix)
/// \param[in] LU_nnzbs number of blocks in LU
/// \param[in] Nb number of blockrows
/// \param[in] num_sweeps number of sweeps to be done
void gpu_decomposition(
cl::CommandQueue *queue, cl::Context *context,
int *Ut_ptrs, int *Ut_idxs, double *Ut_vals, int Ut_nnzbs,
int *L_rows, int *L_cols, double *L_vals, int L_nnzbs,
int *LU_rows, int *LU_cols, double *LU_vals, int LU_nnzbs,
int Nb, int num_sweeps);
/// Set the verbosity
void setVerbosity(int verbosity_) {
this->verbosity = verbosity_;
}
};
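A hedged usage sketch, mirroring the call site in BILU0::create_preconditioner() shown earlier in this diff (all names are taken from that context):
ChowPatelIlu<block_size> chowPatelIlu;   // member of BILU0
chowPatelIlu.setVerbosity(verbosity);    // done in the BILU0 constructor
chowPatelIlu.decomposition(queue, context,
                           LUmat.get(), Lmat.get(), Umat.get(),
                           invDiagVals, diagIndex,
                           s.diagIndex, s.invDiagVals,
                           s.Lvals, s.Lcols, s.Lrows,
                           s.Uvals, s.Ucols, s.Urows);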
} // namespace Accelerator
} // namespace Opm
#endif // CHOW_PATEL
#endif // CHOW_PATEL_ILU_HEADER_INCLUDED

View File

@ -29,7 +29,9 @@
#include <opm/simulators/linalg/bda/Reorder.hpp>
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
namespace bda
namespace Opm
{
namespace Accelerator
{
using Opm::OpmLog;
@ -412,4 +414,5 @@ INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} //namespace bda
} // namespace Accelerator
} // namespace Opm

View File

@ -25,7 +25,9 @@
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
namespace bda
namespace Opm
{
namespace Accelerator
{
/*
@ -112,6 +114,7 @@ public:
};
} //namespace bda
} // namespace Accelerator
} // namespace Opm
#endif // FPGA_BILU0_HEADER_INCLUDED

View File

@ -23,7 +23,9 @@
#include <opm/simulators/linalg/bda/FPGAMatrix.hpp>
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
namespace bda
namespace Opm
{
namespace Accelerator
{
/* Sort a row of matrix elements from a CSR-format matrix. */
@ -246,4 +248,5 @@ int Matrix::toRDF(int numColors, std::vector<int>& nodesPerColor,
return 0;
}
} // end namespace bda
} // namespace Accelerator
} // namespace Opm

View File

@ -22,7 +22,9 @@
#include <vector>
namespace bda
namespace Opm
{
namespace Accelerator
{
/// This struct resembles a csr matrix, only doubles are supported
@ -79,6 +81,7 @@ public:
void sortRow(int *colIndices, double *data, int left, int right);
} // end namespace bda
} // namespace Accelerator
} // namespace Opm
#endif // FPGA_MATRIX_HEADER_INCLUDED

View File

@ -39,7 +39,9 @@
//#define FPGA_STATISTICS_FILE_ENABLED
#undef FPGA_STATISTICS_FILE_ENABLED
namespace bda
namespace Opm
{
namespace Accelerator
{
using Opm::OpmLog;
@ -731,4 +733,5 @@ INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} //namespace bda
} // namespace Accelerator
} // namespace Opm

View File

@ -27,7 +27,9 @@
#include <linearalgebra/ilu0bicgstab/xilinx/src/sda_app/common/opencl_lib.hpp>
#include <linearalgebra/ilu0bicgstab/xilinx/src/sda_app/common/fpga_functions_bicgstab.hpp>
namespace bda
namespace Opm
{
namespace Accelerator
{
/// This class implements an ilu0-bicgstab solver on FPGA
@ -259,7 +261,8 @@ public:
}; // end class fpgaSolverBackend
} //namespace bda
} // namespace Accelerator
} // namespace Opm
#endif

View File

@ -20,7 +20,9 @@
#include <sys/time.h>
#include <fstream>
namespace bda
namespace Opm
{
namespace Accelerator
{
double second(void)
@ -60,4 +62,5 @@ bool fileExists(const char *filename)
}
}
} //namespace bda
} // namespace Accelerator
} // namespace Opm

View File

@ -20,7 +20,9 @@
#ifndef FPGA_UTILS_HEADER_INCLUDED
#define FPGA_UTILS_HEADER_INCLUDED
namespace bda
namespace Opm
{
namespace Accelerator
{
union double2int
@ -34,6 +36,7 @@ bool even(int n);
int roundUpTo(int i, int n);
bool fileExists(const char *filename);
} // end namespace bda
} // namespace Accelerator
} // namespace Opm
#endif // FPGA_UTILS_HEADER_INCLUDED

View File

@ -20,7 +20,9 @@
#ifndef ILUREORDER_HEADER_INCLUDED
#define ILUREORDER_HEADER_INCLUDED
namespace bda
namespace Opm
{
namespace Accelerator
{
// Level Scheduling respects the dependencies in the original matrix, and behaves like Dune and cusparse
// Graph Coloring is more aggressive and is likely to significantly increase the number of linearizations and linear iterations needed to converge, but it can still be faster on a GPU because it exposes more parallelism
@ -31,6 +33,7 @@ namespace bda
NONE
};
}
} // namespace Accelerator
} // namespace Opm
#endif

View File

@ -45,7 +45,9 @@ namespace {
}
namespace bda
namespace Opm
{
namespace Accelerator
{
@ -381,4 +383,5 @@ INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} //namespace bda
} // namespace Accelerator
} // namespace Opm

View File

@ -24,7 +24,9 @@
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
namespace bda
namespace Opm
{
namespace Accelerator
{
#define MAX_COLORS 256
@ -121,6 +123,7 @@ void findGraphColoring(const int *CSRColIndices, const int *CSRRowPointers, cons
/// \param[in] Nb number of blockrows in the matrix
void csrPatternToCsc(int *CSRColIndices, int *CSRRowPointers, int *CSCRowIndices, int *CSCColPointers, int Nb);
}
} // namespace Accelerator
} // namespace Opm
#endif

View File

@ -23,11 +23,14 @@
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/bda/openclKernels.hpp>
#include <opm/simulators/linalg/bda/WellContributions.hpp>
namespace Opm
{
using Opm::Accelerator::OpenclKernels;
WellContributions::WellContributions(std::string accelerator_mode, bool useWellConn){
if(accelerator_mode.compare("cusparse") == 0){
cuda_gpu = true;
@ -82,8 +85,8 @@ void WellContributions::setOpenCLEnv(cl::Context *context_, cl::CommandQueue *qu
this->queue = queue_;
}
void WellContributions::setKernel(bda::stdwell_apply_kernel_type *kernel_,
bda::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_){
void WellContributions::setKernel(Opm::Accelerator::stdwell_apply_kernel_type *kernel_,
Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_){
this->kernel = kernel_;
this->kernel_no_reorder = kernel_no_reorder_;
}
@ -95,22 +98,13 @@ void WellContributions::setReordering(int *h_toOrder_, bool reorder_)
}
void WellContributions::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
const unsigned int work_group_size = 32;
const unsigned int total_work_items = num_std_wells * work_group_size;
const unsigned int lmem1 = sizeof(double) * work_group_size;
const unsigned int lmem2 = sizeof(double) * dim_wells;
cl::Event event;
if (reorder) {
event = (*kernel)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl,
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
OpenclKernels::apply_stdwells_reorder(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
} else {
event = (*kernel_no_reorder)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, d_x, d_y, dim, dim_wells, *d_val_pointers_ocl,
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
OpenclKernels::apply_stdwells_no_reorder(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
d_x, d_y, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
}
event.wait();
}
void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){

View File

@ -94,8 +94,8 @@ private:
#if HAVE_OPENCL
cl::Context *context;
cl::CommandQueue *queue;
bda::stdwell_apply_kernel_type *kernel;
bda::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder;
Opm::Accelerator::stdwell_apply_kernel_type *kernel;
Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder;
std::vector<cl::Event> events;
std::unique_ptr<cl::Buffer> d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
@ -147,8 +147,8 @@ public:
#endif
#if HAVE_OPENCL
void setKernel(bda::stdwell_apply_kernel_type *kernel_,
bda::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_);
void setKernel(Opm::Accelerator::stdwell_apply_kernel_type *kernel_,
Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_);
void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
/// Since the rows of the matrix are reordered, the column indices of the matrix data are incorrect
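A sketch of the mapping this refers to, assuming toOrder maps an original blockrow index to its reordered position; the vector names and the direction of the mapping are illustrative:
// illustrative only: permute a blocked vector of Nb blockrows of size dim
for (int i = 0; i < Nb; ++i) {
    for (int k = 0; k < dim; ++k) {
        x_reordered[toOrder[i] * dim + k] = x_original[i * dim + k];
    }
}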

View File

@ -35,7 +35,9 @@
#include <amgcl/backend/vexcl_static_matrix.hpp>
#endif
namespace bda
namespace Opm
{
namespace Accelerator
{
using Opm::OpmLog;
@ -381,4 +383,5 @@ INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace bda
} // namespace Accelerator
} // namespace Opm

View File

@ -28,7 +28,9 @@
/// This file is only compiled when both amgcl and CUDA are found by CMake
namespace bda
namespace Opm
{
namespace Accelerator
{
using Opm::OpmLog;
@ -87,5 +89,6 @@ INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace bda
} // namespace Accelerator
} // namespace Opm

View File

@ -38,7 +38,9 @@
#include <amgcl/preconditioner/runtime.hpp>
#include <amgcl/value_type/static_matrix.hpp>
namespace bda
namespace Opm
{
namespace Accelerator
{
/// This class does not implement a solver, but converts the BCSR format to normal CSR and uses amgcl for solving
@ -149,7 +151,8 @@ public:
}; // end class amgclSolverBackend
} // namespace bda
} // namespace Accelerator
} // namespace Opm
#endif

View File

@ -37,7 +37,9 @@
// otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
#define COPY_ROW_BY_ROW 0
namespace bda
namespace Opm
{
namespace Accelerator
{
using Opm::OpmLog;
@ -511,6 +513,5 @@ INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace bda
} // namespace Accelerator
} // namespace Opm

View File

@ -28,7 +28,9 @@
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
#include <opm/simulators/linalg/bda/WellContributions.hpp>
namespace bda
namespace Opm
{
namespace Accelerator
{
/// This class implements a cusparse-based ilu0-bicgstab solver on GPU
@ -142,7 +144,8 @@ public:
}; // end class cusparseSolverBackend
} // namespace bda
} // namespace Accelerator
} // namespace Opm
#endif

View File

@ -18,12 +18,345 @@
*/
#include <config.h>
#include <opm/simulators/linalg/bda/openclKernels.hpp>
#include <cmath>
#include <sstream>
namespace bda
#include <opm/common/OpmLog/OpmLog.hpp>
#include <dune/common/timer.hh>
#include <opm/simulators/linalg/bda/openclKernels.hpp>
#include <opm/simulators/linalg/bda/ChowPatelIlu.hpp> // defines CHOW_PATEL
namespace Opm
{
namespace Accelerator
{
std::string get_axpy_string() {
using Opm::OpmLog;
using Dune::Timer;
// define static variables and kernels
int OpenclKernels::verbosity;
cl::CommandQueue *OpenclKernels::queue;
std::vector<double> OpenclKernels::tmp;
bool OpenclKernels::initialized = false;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::dot_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::norm_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > OpenclKernels::axpy_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > OpenclKernels::scale_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > OpenclKernels::custom_k;
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_blocked_k;
std::unique_ptr<ilu_apply1_kernel_type> OpenclKernels::ILU_apply1_k;
std::unique_ptr<ilu_apply2_kernel_type> OpenclKernels::ILU_apply2_k;
std::unique_ptr<stdwell_apply_kernel_type> OpenclKernels::stdwell_apply_k;
std::unique_ptr<stdwell_apply_no_reorder_kernel_type> OpenclKernels::stdwell_apply_no_reorder_k;
std::unique_ptr<ilu_decomp_kernel_type> OpenclKernels::ilu_decomp_k;
// divide A by B, and round up: return (int)ceil(A/B)
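// e.g. ceilDivision(1000, 256) == 4 and ceilDivision(1024, 256) == 4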
unsigned int ceilDivision(const unsigned int A, const unsigned int B)
{
return A / B + (A % B > 0);
}
void add_kernel_string(cl::Program::Sources &sources, const std::string &source) {
sources.emplace_back(source);
}
void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::vector<cl::Device>& devices, int verbosity_) {
if (initialized) {
OpmLog::debug("Warning: OpenclKernels is already initialized");
return;
}
queue = queue_;
verbosity = verbosity_;
cl::Program::Sources sources;
const std::string& axpy_s = get_axpy_string();
add_kernel_string(sources, axpy_s);
const std::string& scale_s = get_scale_string();
add_kernel_string(sources, scale_s);
const std::string& dot_1_s = get_dot_1_string();
add_kernel_string(sources, dot_1_s);
const std::string& norm_s = get_norm_string();
add_kernel_string(sources, norm_s);
const std::string& custom_s = get_custom_string();
add_kernel_string(sources, custom_s);
const std::string& spmv_blocked_s = get_spmv_blocked_string();
add_kernel_string(sources, spmv_blocked_s);
#if CHOW_PATEL
bool ilu_operate_on_full_matrix = false;
#else
bool ilu_operate_on_full_matrix = true;
#endif
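// with the Chow-Patel decomposition, L and U are separate triangular matrices;
// otherwise they live inside one full (square) matrix and diagIndex marks the
// split, see get_ILU_apply1_string() and get_ILU_apply2_string()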
const std::string& ILU_apply1_s = get_ILU_apply1_string(ilu_operate_on_full_matrix);
add_kernel_string(sources, ILU_apply1_s);
const std::string& ILU_apply2_s = get_ILU_apply2_string(ilu_operate_on_full_matrix);
add_kernel_string(sources, ILU_apply2_s);
const std::string& stdwell_apply_s = get_stdwell_apply_string(true);
add_kernel_string(sources, stdwell_apply_s);
const std::string& stdwell_apply_no_reorder_s = get_stdwell_apply_string(false);
add_kernel_string(sources, stdwell_apply_no_reorder_s);
const std::string& ilu_decomp_s = get_ilu_decomp_string();
add_kernel_string(sources, ilu_decomp_s);
cl::Program program = cl::Program(*context, sources);
program.build(devices);
// queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
// cl::KernelFunctor<> myKernel(); myKernel(args, arg1, arg2); is also blocking
// actually creating the kernels
dot_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1")));
norm_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
axpy_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
scale_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, const unsigned int>(cl::Kernel(program, "scale")));
custom_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
spmv_blocked_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_blocked")));
ILU_apply1_k.reset(new ilu_apply1_kernel_type(cl::Kernel(program, "ILU_apply1")));
ILU_apply2_k.reset(new ilu_apply2_kernel_type(cl::Kernel(program, "ILU_apply2")));
stdwell_apply_k.reset(new stdwell_apply_kernel_type(cl::Kernel(program, "stdwell_apply")));
stdwell_apply_no_reorder_k.reset(new stdwell_apply_no_reorder_kernel_type(cl::Kernel(program, "stdwell_apply_no_reorder")));
ilu_decomp_k.reset(new ilu_decomp_kernel_type(cl::Kernel(program, "ilu_decomp")));
initialized = true;
} // end init()
double OpenclKernels::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
{
const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_dot;
tmp.resize(num_work_groups);
cl::Event event = (*dot_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in1, in2, out, N, cl::Local(lmem_per_work_group));
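// dot_1 reduces within each work-group; out holds one partial sum per
// work-group, which is read back and summed on the CPU below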
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp.data());
double gpu_sum = 0.0;
for (unsigned int i = 0; i < num_work_groups; ++i) {
gpu_sum += tmp[i];
}
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels dot() time: " << t_dot.stop() << " s";
OpmLog::info(oss.str());
}
return gpu_sum;
}
double OpenclKernels::norm(cl::Buffer& in, cl::Buffer& out, int N)
{
const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_norm;
tmp.resize(num_work_groups);
cl::Event event = (*norm_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, N, cl::Local(lmem_per_work_group));
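// the norm kernel returns partial sums of squares per work-group; the CPU
// adds them and takes the final square root below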
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp.data());
double gpu_norm = 0.0;
for (unsigned int i = 0; i < num_work_groups; ++i) {
gpu_norm += tmp[i];
}
gpu_norm = sqrt(gpu_norm);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels norm() time: " << t_norm.stop() << " s";
OpmLog::info(oss.str());
}
return gpu_norm;
}
void OpenclKernels::axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
Timer t_axpy;
cl::Event event = (*axpy_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, a, out, N);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels axpy() time: " << t_axpy.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::scale(cl::Buffer& in, const double a, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
Timer t_scale;
cl::Event event = (*scale_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, a, N);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels scale() time: " << t_scale.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
Timer t_custom;
cl::Event event = (*custom_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), p, v, r, omega, beta, N);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels custom() time: " << t_custom.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::spmv_blocked(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_spmv;
cl::Event event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels spmv_blocked() time: " << t_spmv.stop() << " s";
OpmLog::info(oss.str());
}
}
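// For reference: the CPU semantics of spmv_blocked above, sketched for a BSR
// matrix with Nb blockrows and bs-by-bs, row-major blocks. Illustrative only,
// not part of this PR.
void spmv_bsr_reference(const double *vals, const int *cols, const int *rows,
                        const double *x, double *b, int Nb, int bs)
{
    for (int row = 0; row < Nb; ++row) {
        for (int r = 0; r < bs; ++r)
            b[row * bs + r] = 0.0;
        for (int blk = rows[row]; blk < rows[row + 1]; ++blk) {
            const double *block = vals + static_cast<long>(blk) * bs * bs;
            const int col = cols[blk];
            for (int r = 0; r < bs; ++r)
                for (int c = 0; c < bs; ++c)
                    b[row * bs + r] += block[r * bs + c] * x[col * bs + c];
        }
    }
}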
void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_ilu_apply1;
cl::Event event = (*ILU_apply1_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, diagIndex, y, x, rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
if (verbosity >= 5) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels ILU_apply1() time: " << t_ilu_apply1.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_ilu_apply2;
cl::Event event = (*ILU_apply2_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, diagIndex, invDiagVals, x, rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
if (verbosity >= 5) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels ILU_apply2() time: " << t_ilu_apply2.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, int Nb, unsigned int block_size)
{
const unsigned int work_group_size2 = 128;
const unsigned int num_work_groups2 = 1024;
const unsigned int total_work_items2 = num_work_groups2 * work_group_size2;
const unsigned int num_hwarps_per_group = work_group_size2 / 16;
const unsigned int lmem_per_work_group2 = num_hwarps_per_group * block_size * block_size * sizeof(double); // each block needs a pivot
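// e.g. for block_size == 3: (128 / 16) hwarps * 3 * 3 doubles * 8 bytes = 576 bytes per work-group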
Timer t_ilu_decomp;
cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)), firstRow, lastRow, vals, cols, rows, invDiagVals, diagIndex, Nb, cl::Local(lmem_per_work_group2));
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels ILU_decomp() time: " << t_ilu_decomp.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::apply_stdwells_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
cl::Buffer &d_toOrder, int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells)
{
const unsigned int work_group_size = 32;
const unsigned int total_work_items = num_std_wells * work_group_size;
const unsigned int lmem1 = sizeof(double) * work_group_size;
const unsigned int lmem2 = sizeof(double) * dim_wells;
Timer t_apply_stdwells;
cl::Event event = (*stdwell_apply_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, d_toOrder, dim, dim_wells, d_val_pointers_ocl,
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels apply_stdwells_reorder() time: " << t_apply_stdwells.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells)
{
const unsigned int work_group_size = 32;
const unsigned int total_work_items = num_std_wells * work_group_size;
const unsigned int lmem1 = sizeof(double) * work_group_size;
const unsigned int lmem2 = sizeof(double) * dim_wells;
Timer t_apply_stdwells;
cl::Event event = (*stdwell_apply_no_reorder_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells, d_val_pointers_ocl,
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels apply_stdwells_no_reorder() time: " << t_apply_stdwells.stop() << " s";
OpmLog::info(oss.str());
}
}
std::string OpenclKernels::get_axpy_string() {
return R"(
__kernel void axpy(
__global double *in,
@ -44,7 +377,7 @@ namespace bda
// scale vector with scalar
std::string get_scale_string() {
std::string OpenclKernels::get_scale_string() {
return R"(
__kernel void scale(
__global double *vec,
@ -64,7 +397,7 @@ namespace bda
// returns partial sums, instead of the final dot product
std::string get_dot_1_string() {
std::string OpenclKernels::get_dot_1_string() {
return R"(
__kernel void dot_1(
__global double *in1,
@ -107,7 +440,7 @@ namespace bda
// returns partial sums, instead of the final norm
// the square root must be computed on CPU
std::string get_norm_string() {
std::string OpenclKernels::get_norm_string() {
return R"(
__kernel void norm(
__global double *in,
@ -148,7 +481,7 @@ namespace bda
// p = (p - omega * v) * beta + r
std::string get_custom_string() {
std::string OpenclKernels::get_custom_string() {
return R"(
__kernel void custom(
__global double *p,
@ -173,7 +506,7 @@ namespace bda
)";
}
std::string get_spmv_blocked_string() {
std::string OpenclKernels::get_spmv_blocked_string() {
return R"(
__kernel void spmv_blocked(
__global const double *vals,
@ -243,7 +576,7 @@ namespace bda
std::string get_ILU_apply1_string(bool full_matrix) {
std::string OpenclKernels::get_ILU_apply1_string(bool full_matrix) {
std::string s = R"(
__kernel void ILU_apply1(
__global const double *LUvals,
@ -319,7 +652,7 @@ namespace bda
}
std::string get_ILU_apply2_string(bool full_matrix) {
std::string OpenclKernels::get_ILU_apply2_string(bool full_matrix) {
std::string s = R"(
__kernel void ILU_apply2(
__global const double *LUvals,
@ -402,7 +735,7 @@ namespace bda
return s;
}
std::string get_stdwell_apply_string(bool reorder) {
std::string OpenclKernels::get_stdwell_apply_string(bool reorder) {
std::string kernel_name = reorder ? "stdwell_apply" : "stdwell_apply_no_reorder";
std::string s = "__kernel void " + kernel_name + R"((
__global const double *Cnnzs,
@ -497,7 +830,7 @@ namespace bda
}
std::string get_ilu_decomp_string() {
std::string OpenclKernels::get_ilu_decomp_string() {
return R"(
// a = a - (b * c)
@ -637,5 +970,6 @@ namespace bda
)";
}
} // end namespace bda
} // namespace Accelerator
} // namespace Opm
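Most kernel bodies are truncated by the hunks above. As an illustration of what these strings contain, a minimal axpy kernel consistent with the documented semantics (out += a * in) could look like this sketch; it is not necessarily the exact committed source:
// sketch only: a plausible body for the string returned by get_axpy_string()
std::string axpy_kernel_sketch() {
    return R"(
    __kernel void axpy(
        __global double *in,
        const double a,
        __global double *out,
        const unsigned int N)
    {
        // grid-stride loop so any launch size covers all N entries
        unsigned int idx = get_global_id(0);
        while (idx < N) {
            out[idx] += a * in[idx];
            idx += get_global_size(0);
        }
    }
    )";
}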

View File

@ -21,15 +21,18 @@
#define OPENCL_HPP
#include <string>
#include <memory>
#include <opm/simulators/linalg/bda/opencl.hpp>
namespace bda
namespace Opm
{
namespace Accelerator
{
using spmv_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
using ilu_apply1_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
using ilu_apply1_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const cl::Buffer&,
cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>;
using ilu_apply2_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>;
@ -44,57 +47,107 @@ using stdwell_apply_no_reorder_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::
using ilu_decomp_kernel_type = cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg>;
class OpenclKernels
{
private:
static int verbosity;
static cl::CommandQueue *queue;
static std::vector<double> tmp; // used as tmp CPU buffer for dot() and norm()
static bool initialized;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > scale_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
static std::unique_ptr<spmv_kernel_type> spmv_blocked_k;
static std::unique_ptr<ilu_apply1_kernel_type> ILU_apply1_k;
static std::unique_ptr<ilu_apply2_kernel_type> ILU_apply2_k;
static std::unique_ptr<stdwell_apply_kernel_type> stdwell_apply_k;
static std::unique_ptr<stdwell_apply_no_reorder_kernel_type> stdwell_apply_no_reorder_k;
static std::unique_ptr<ilu_decomp_kernel_type> ilu_decomp_k;
/// Generate string with axpy kernel
/// a = a + alpha * b
std::string get_axpy_string();
static std::string get_axpy_string();
/// Generate string with scale kernel
/// a = a * alpha
std::string get_scale_string();
static std::string get_scale_string();
/// returns partial sums, instead of the final dot product
/// partial sums are added on CPU
std::string get_dot_1_string();
static std::string get_dot_1_string();
/// returns partial sums, instead of the final norm
/// the square root must be computed on CPU
std::string get_norm_string();
static std::string get_norm_string();
/// Generate string with custom kernel
/// This kernel combines some of the ilu0-bicgstab vector operations into one
/// p = (p - omega * v) * beta + r
std::string get_custom_string();
static std::string get_custom_string();
/// b = mat * x
/// algorithm based on:
/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-Memory Parallel Architectures,
/// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
std::string get_spmv_blocked_string();
static std::string get_spmv_blocked_string();
/// ILU apply part 1: forward substitution
/// solves L*x=y where L is a lower triangular sparse blocked matrix
/// this L can be its own BSR matrix (if full_matrix is false),
/// or it can be stored inside a normal, square matrix, in which case diagIndex indicates where the rows of L end
/// \param[in] full_matrix whether the kernel should operate on a full (square) matrix or not
std::string get_ILU_apply1_string(bool full_matrix);
static std::string get_ILU_apply1_string(bool full_matrix);
/// ILU apply part 2: backward substitution
/// solves U*x=y where U is an upper triangular sparse blocked matrix
/// this U can be its own BSR matrix (if full_matrix is false),
/// or it can be stored inside a normal, square matrix, in which case diagIndex indicates where the rows of U start
/// \param[in] full_matrix whether the kernel should operate on a full (square) matrix or not
std::string get_ILU_apply2_string(bool full_matrix);
static std::string get_ILU_apply2_string(bool full_matrix);
/// Generate string with the stdwell_apply kernels
/// If reorder is true, the B/C column indices do not correspond to the x/y vectors;
/// the x/y vectors are reordered, so toOrder is used to address them
/// \param[in] reorder whether the matrix is reordered or not
std::string get_stdwell_apply_string(bool reorder);
static std::string get_stdwell_apply_string(bool reorder);
/// Generate string with the exact ilu decomposition kernel
/// The kernel takes a full BSR matrix and performs in-place ILU decomposition
std::string get_ilu_decomp_string();
static std::string get_ilu_decomp_string();
} // end namespace bda
OpenclKernels() {} // disable instantiation
public:
static void init(cl::Context *context, cl::CommandQueue *queue, std::vector<cl::Device>& devices, int verbosity);
static double dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
static double norm(cl::Buffer& in, cl::Buffer& out, int N);
static void axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N);
static void scale(cl::Buffer& in, const double a, int N);
static void custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N);
static void spmv_blocked(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size);
static void ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex,
const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size);
static void ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex,
cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size);
static void ILU_decomp(int firstRow, int lastRow, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
cl::Buffer& diagIndex, cl::Buffer& invDiagVals, int Nb, unsigned int block_size);
static void apply_stdwells_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
cl::Buffer &d_toOrder, int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells);
static void apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells);
};
} // namespace Accelerator
} // namespace Opm
#endif
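A sketch of how callers use the new static interface; context, queue, devices and the cl::Buffer objects are assumed to exist (as in openclSolverBackend), and error handling is omitted:
using Opm::Accelerator::OpenclKernels;
OpenclKernels::init(context.get(), queue.get(), devices, verbosity); // compile kernels once
double nrm = OpenclKernels::norm(d_r, d_tmp, N);                     // ||r||_2
double rho = OpenclKernels::dot(d_rw, d_r, d_tmp, N);                // (rw, r)
OpenclKernels::axpy(d_v, -1.0, d_r, N);                              // r = r - v
OpenclKernels::spmv_blocked(d_Avals, d_Acols, d_Arows, d_x, d_v, Nb, block_size); // v = A * x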

View File

@ -35,7 +35,9 @@
// otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
#define COPY_ROW_BY_ROW 0
namespace bda
namespace Opm
{
namespace Accelerator
{
using Opm::OpmLog;
@ -174,6 +176,8 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
context = std::make_shared<cl::Context>(devices[0]);
queue.reset(new cl::CommandQueue(*context, devices[0], 0, &err));
OpenclKernels::init(context.get(), queue.get(), devices, verbosity);
} catch (const cl::Error& error) {
std::ostringstream oss;
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
@ -193,125 +197,6 @@ openclSolverBackend<block_size>::~openclSolverBackend() {
}
// divide A by B, and round up: return (int)ceil(A/B)
template <unsigned int block_size>
unsigned int openclSolverBackend<block_size>::ceilDivision(const unsigned int A, const unsigned int B)
{
return A / B + (A % B > 0);
}
template <unsigned int block_size>
double openclSolverBackend<block_size>::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out)
{
const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_dot;
cl::Event event = (*dot_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in1, in2, out, N, cl::Local(lmem_per_work_group));
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp);
double gpu_sum = 0.0;
for (unsigned int i = 0; i < num_work_groups; ++i) {
gpu_sum += tmp[i];
}
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "openclSolver dot_w time: " << t_dot.stop() << " s";
OpmLog::info(oss.str());
}
return gpu_sum;
}
template <unsigned int block_size>
double openclSolverBackend<block_size>::norm_w(cl::Buffer in, cl::Buffer out)
{
const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_norm;
cl::Event event = (*norm_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, N, cl::Local(lmem_per_work_group));
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp);
double gpu_norm = 0.0;
for (unsigned int i = 0; i < num_work_groups; ++i) {
gpu_norm += tmp[i];
}
gpu_norm = sqrt(gpu_norm);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "openclSolver norm_w time: " << t_norm.stop() << " s";
OpmLog::info(oss.str());
}
return gpu_norm;
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::axpy_w(cl::Buffer in, const double a, cl::Buffer out)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
Timer t_axpy;
cl::Event event = (*axpy_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, a, out, N);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "openclSolver axpy_w time: " << t_axpy.stop() << " s";
OpmLog::info(oss.str());
}
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, const double omega, const double beta)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
Timer t_custom;
cl::Event event = (*custom_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), p, v, r, omega, beta, N);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "openclSolver custom_w time: " << t_custom.stop() << " s";
OpmLog::info(oss.str());
}
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_spmv;
cl::Event event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "openclSolver spmv_blocked_w time: " << t_spmv.stop() << " s";
OpmLog::info(oss.str());
}
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
@ -319,10 +204,6 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
double rho, rhop, beta, alpha, omega, tmp1, tmp2;
double norm, norm_0;
if(wellContribs.getNumWells() > 0){
wellContribs.setKernel(stdwell_apply_k.get(), stdwell_apply_no_reorder_k.get());
}
Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
// set r to the initial residual
@ -348,7 +229,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueue[Fill|Copy]Buffer error");
}
norm = norm_w(d_r, d_tmp);
norm = OpenclKernels::norm(d_r, d_tmp, N);
norm_0 = norm;
if (verbosity > 1) {
@ -360,11 +241,11 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
t_rest.start();
for (it = 0.5; it < maxit; it += 0.5) {
rhop = rho;
rho = dot_w(d_rw, d_r, d_tmp);
rho = OpenclKernels::dot(d_rw, d_r, d_tmp, N);
if (it > 1) {
beta = (rho / rhop) * (alpha / omega);
custom_w(d_p, d_v, d_r, omega, beta);
OpenclKernels::custom(d_p, d_v, d_r, omega, beta, N);
}
t_rest.stop();
@ -375,7 +256,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
// v = A * pw
t_spmv.start();
spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_pw, d_v);
OpenclKernels::spmv_blocked(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
t_spmv.stop();
// apply wellContributions
@ -386,11 +267,11 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
t_well.stop();
t_rest.start();
tmp1 = dot_w(d_rw, d_v, d_tmp);
tmp1 = OpenclKernels::dot(d_rw, d_v, d_tmp, N);
alpha = rho / tmp1;
axpy_w(d_v, -alpha, d_r); // r = r - alpha * v
axpy_w(d_pw, alpha, d_x); // x = x + alpha * pw
norm = norm_w(d_r, d_tmp);
OpenclKernels::axpy(d_v, -alpha, d_r, N); // r = r - alpha * v
OpenclKernels::axpy(d_pw, alpha, d_x, N); // x = x + alpha * pw
norm = OpenclKernels::norm(d_r, d_tmp, N);
t_rest.stop();
if (norm < tolerance * norm_0) {
@ -406,7 +287,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
// t = A * s
t_spmv.start();
spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_s, d_t);
OpenclKernels::spmv_blocked(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
t_spmv.stop();
// apply wellContributions
@ -417,12 +298,12 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
t_well.stop();
t_rest.start();
tmp1 = dot_w(d_t, d_r, d_tmp);
tmp2 = dot_w(d_t, d_t, d_tmp);
tmp1 = OpenclKernels::dot(d_t, d_r, d_tmp, N);
tmp2 = OpenclKernels::dot(d_t, d_t, d_tmp, N);
omega = tmp1 / tmp2;
axpy_w(d_s, omega, d_x); // x = x + omega * s
axpy_w(d_t, -omega, d_r); // r = r - omega * t
norm = norm_w(d_r, d_tmp);
OpenclKernels::axpy(d_s, omega, d_x, N); // x = x + omega * s
OpenclKernels::axpy(d_t, -omega, d_r, N); // r = r - omega * t
norm = OpenclKernels::norm(d_r, d_tmp, N);
t_rest.stop();
if (norm < tolerance * norm_0) {
@ -479,7 +360,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
prec->setOpenCLContext(context.get());
prec->setOpenCLQueue(queue.get());
tmp = new double[N];
#if COPY_ROW_BY_ROW
vals_contiguous = new double[N];
#endif
@ -507,10 +387,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
}
get_opencl_kernels();
prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get(), scale_k.get(), ilu_decomp_k.get());
} catch (const cl::Error& error) {
std::ostringstream oss;
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
@ -525,68 +401,12 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
initialized = true;
} // end initialize()
void add_kernel_string(cl::Program::Sources &sources, std::string &source) {
sources.emplace_back(source);
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::get_opencl_kernels() {
cl::Program::Sources sources;
std::string axpy_s = get_axpy_string();
add_kernel_string(sources, axpy_s);
std::string scale_s = get_scale_string();
add_kernel_string(sources, scale_s);
std::string dot_1_s = get_dot_1_string();
add_kernel_string(sources, dot_1_s);
std::string norm_s = get_norm_string();
add_kernel_string(sources, norm_s);
std::string custom_s = get_custom_string();
add_kernel_string(sources, custom_s);
std::string spmv_blocked_s = get_spmv_blocked_string();
add_kernel_string(sources, spmv_blocked_s);
#if CHOW_PATEL
bool ilu_operate_on_full_matrix = false;
#else
bool ilu_operate_on_full_matrix = true;
#endif
std::string ILU_apply1_s = get_ILU_apply1_string(ilu_operate_on_full_matrix);
add_kernel_string(sources, ILU_apply1_s);
std::string ILU_apply2_s = get_ILU_apply2_string(ilu_operate_on_full_matrix);
add_kernel_string(sources, ILU_apply2_s);
std::string stdwell_apply_s = get_stdwell_apply_string(true);
add_kernel_string(sources, stdwell_apply_s);
std::string stdwell_apply_no_reorder_s = get_stdwell_apply_string(false);
add_kernel_string(sources, stdwell_apply_no_reorder_s);
std::string ilu_decomp_s = get_ilu_decomp_string();
add_kernel_string(sources, ilu_decomp_s);
cl::Program program = cl::Program(*context, sources);
program.build(devices);
// queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
// cl::KernelFunctor<> myKernel(); myKernel(args, arg1, arg2); is also blocking
// actually creating the kernels
dot_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1")));
norm_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
axpy_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
scale_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, const unsigned int>(cl::Kernel(program, "scale")));
custom_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
spmv_blocked_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_blocked")));
ILU_apply1_k.reset(new ilu_apply1_kernel_type(cl::Kernel(program, "ILU_apply1")));
ILU_apply2_k.reset(new ilu_apply2_kernel_type(cl::Kernel(program, "ILU_apply2")));
stdwell_apply_k.reset(new stdwell_apply_kernel_type(cl::Kernel(program, "stdwell_apply")));
stdwell_apply_no_reorder_k.reset(new stdwell_apply_no_reorder_kernel_type(cl::Kernel(program, "stdwell_apply_no_reorder")));
ilu_decomp_k.reset(new ilu_decomp_kernel_type(cl::Kernel(program, "ilu_decomp")));
} // end get_opencl_kernels()
template <unsigned int block_size>
void openclSolverBackend<block_size>::finalize() {
if (opencl_ilu_reorder != ILUReorder::NONE) {
delete[] rb;
}
delete[] tmp;
#if COPY_ROW_BY_ROW
delete[] vals_contiguous;
#endif
@ -672,11 +492,6 @@ bool openclSolverBackend<block_size>::analyse_matrix() {
Timer t;
bool success = prec->init(mat.get());
int work_group_size = 32;
int num_work_groups = ceilDivision(N, work_group_size);
int total_work_items = num_work_groups * work_group_size;
int lmem_per_work_group = work_group_size * sizeof(double);
prec->setKernelParameters(work_group_size, total_work_items, lmem_per_work_group);
if (opencl_ilu_reorder == ILUReorder::NONE) {
rmat = mat.get();
@ -820,4 +635,5 @@ INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace bda
} // namespace Accelerator
} // namespace Opm

View File

@ -30,7 +30,9 @@
#include <tuple>
namespace bda
namespace Opm
{
namespace Accelerator
{
/// This class implements an opencl-based ilu0-bicgstab solver on GPU
@ -61,21 +63,8 @@ private:
cl::Buffer d_pw, d_s, d_t, d_v; // vectors, used during linear solve
cl::Buffer d_tmp; // used as tmp GPU buffer for dot() and norm()
cl::Buffer d_toOrder; // only used when reordering is used
double *tmp = nullptr; // used as tmp CPU buffer for dot() and norm()
// shared pointers are also passed to other objects
std::vector<cl::Device> devices;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > scale_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
std::unique_ptr<spmv_kernel_type> spmv_blocked_k;
std::shared_ptr<ilu_apply1_kernel_type> ILU_apply1_k;
std::shared_ptr<ilu_apply2_kernel_type> ILU_apply2_k;
std::shared_ptr<stdwell_apply_kernel_type> stdwell_apply_k;
std::shared_ptr<stdwell_apply_no_reorder_kernel_type> stdwell_apply_no_reorder_k;
std::shared_ptr<ilu_decomp_kernel_type> ilu_decomp_k;
Preconditioner *prec = nullptr; // only supported preconditioner is BILU0
int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
@ -150,9 +139,6 @@ private:
/// \param[in] cols array of columnIndices, contains nnz values
void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
/// Generate and compile opencl kernels
void get_opencl_kernels();
/// Clean memory
void finalize();
@ -220,7 +206,8 @@ public:
}; // end class openclSolverBackend
} // namespace bda
} // namespace Accelerator
} // namespace Opm
#endif