diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake index 5259d65b4..500c31102 100644 --- a/CMakeLists_files.cmake +++ b/CMakeLists_files.cmake @@ -60,6 +60,7 @@ if(OPENCL_FOUND) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Reorder.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/ChowPatelIlu.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl.cpp) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclKernels.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp) diff --git a/opm/simulators/linalg/ISTLSolverEbos.hpp b/opm/simulators/linalg/ISTLSolverEbos.hpp index 4d4c19b9b..eb83b7d3f 100644 --- a/opm/simulators/linalg/ISTLSolverEbos.hpp +++ b/opm/simulators/linalg/ISTLSolverEbos.hpp @@ -250,12 +250,9 @@ namespace Opm if (use_gpu) { const std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode); WellContributions wellContribs(gpu_mode); -#if HAVE_OPENCL - if(gpu_mode.compare("opencl") == 0){ - const auto openclBackend = static_cast*>(&bdaBridge->getBackend()); - wellContribs.setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get()); - } -#endif + + bdaBridge->initWellContributions(wellContribs); + if (!useWellConn_) { simulator_.problem().wellModel().getWellContributions(wellContribs); } diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp index 991dc14b1..6dc037468 100644 --- a/opm/simulators/linalg/bda/BILU0.cpp +++ b/opm/simulators/linalg/bda/BILU0.cpp @@ -18,6 +18,7 @@ */ #include + #include #include #include @@ -32,14 +33,6 @@ namespace bda { -// if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU -// if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in: -// FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. 
Patel, SIAM 2015, https://doi.org/10.1137/140968896 -// if CHOW_PATEL_GPU is 0, the decomposition is done on CPU -// if CHOW_PATEL_GPU is 1, the decomposition is done by bda::FGPILU::decomposition() on GPU -#define CHOW_PATEL 0 -#define CHOW_PATEL_GPU 1 - using Opm::OpmLog; using Dune::Timer; @@ -126,41 +119,54 @@ namespace bda diagIndex = new int[mat->Nb]; invDiagVals = new double[mat->Nb * bs * bs]; +#if CHOW_PATEL Lmat = std::make_unique >(mat->Nb, (mat->nnzbs - mat->Nb) / 2); Umat = std::make_unique >(mat->Nb, (mat->nnzbs - mat->Nb) / 2); +#endif + + LUmat->nnzValues = new double[mat->nnzbs * bs * bs]; - s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs); - s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Umat->nnzbs); - s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs); - s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Umat->nnzbs); - s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1)); - s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Umat->Nb + 1)); s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb); s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1)); + s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb); +#if CHOW_PATEL + s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs); + s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs); + s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1)); + s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs); + s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs); + s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1)); +#else + s.LUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * LUmat->nnzbs); + s.LUcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->nnzbs); + s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1)); +#endif - queue->enqueueWriteBuffer(s.Lvals, CL_TRUE, 0, Lmat->nnzbs * sizeof(double) * bs * bs, Lmat->nnzValues); - queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, Umat->nnzbs * sizeof(double) * bs * bs, Umat->nnzValues); - queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices); - queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, Umat->nnzbs * sizeof(int), Umat->colIndices); - queue->enqueueWriteBuffer(s.Lrows, CL_TRUE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers); - queue->enqueueWriteBuffer(s.Urows, CL_TRUE, 0, (Umat->Nb + 1) * sizeof(int), Umat->rowPointers); - queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals); + events.resize(2); + err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals, nullptr, &events[0]); - int *rowsPerColorPrefix = new int[numColors + 1]; - rowsPerColorPrefix[0] = 0; + rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0 for (int i = 0; i < numColors; ++i) { rowsPerColorPrefix[i+1] = rowsPerColorPrefix[i] + rowsPerColor[i]; } - queue->enqueueWriteBuffer(s.rowsPerColor, CL_TRUE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix); - delete[] rowsPerColorPrefix; + err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), 
rowsPerColorPrefix.data(), nullptr, &events[1]); + + cl::WaitForEvents(events); + events.clear(); + if (err != CL_SUCCESS) { + // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL + OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error"); + } return true; } // end init() + // implements Fine-Grained Parallel ILU algorithm (FGPILU), Chow and Patel 2015 template void BILU0::chow_patel_decomposition() { +#if CHOW_PATEL const unsigned int bs = block_size; int num_sweeps = 6; @@ -196,6 +202,8 @@ namespace bda } #endif + Timer t_total, t_preprocessing; + // Ut is actually BSC format std::unique_ptr > Ut = std::make_unique >(Nb, (nnzbs + Nb) / 2); @@ -274,7 +282,14 @@ namespace bda // Ltmp is only needed for CPU decomposition, GPU creates GPU buffer for Ltmp double *Utmp = new double[Ut->nnzbs * block_size * block_size]; + if (verbosity >= 3) { + std::ostringstream out; + out << "BILU0 ChowPatel preprocessing: " << t_preprocessing.stop() << " s"; + OpmLog::info(out.str()); + } + // actual ILU decomposition + Timer t_decomposition; #if CHOW_PATEL_GPU chowPatelIlu.decomposition(queue, context, Ut->rowPointers, Ut->colIndices, Ut->nnzValues, Ut->nnzbs, @@ -396,10 +411,18 @@ namespace bda delete[] Ltmp; #endif + if (verbosity >= 3){ + std::ostringstream out; + out << "BILU0 ChowPatel decomposition: " << t_decomposition.stop() << " s"; + OpmLog::info(out.str()); + } + + Timer t_postprocessing; + // convert Ut to BSR // diagonal stored separately std::vector ptr(Nb+1, 0); - std::vector col(Ut->rowPointers[Nb]); + std::vector cols(Ut->rowPointers[Nb]); // count blocks per row for U (BSR) // store diagonal in invDiagVals @@ -423,7 +446,7 @@ namespace bda int j = Ut->colIndices[k]; if (j != i) { int head = ptr[j]++; - col[head] = i; + cols[head] = i; memcpy(Utmp + head * bs * bs, Ut->nnzValues + k * bs * bs, sizeof(double) * bs * bs); } } @@ -433,21 +456,55 @@ namespace bda std::rotate(ptr.begin(), ptr.end() - 1, ptr.end()); ptr.front() = 0; - // reversing the rows of U, because that is the order they are used in during ILU apply - int URowIndex = 0; - int offsetU = 0; // number of nnz blocks that are already copied to Umat - Umat->rowPointers[0] = 0; - for (int i = LUmat->Nb - 1; i >= 0; i--) { - int rowSize = ptr[i + 1] - ptr[i]; // number of blocks in this row - memcpy(Umat->nnzValues + offsetU * bs * bs, Utmp + ptr[i] * bs * bs, sizeof(double) * bs * bs * rowSize); - memcpy(Umat->colIndices + offsetU, col.data() + ptr[i], sizeof(int) * rowSize); - offsetU += rowSize; - Umat->rowPointers[URowIndex + 1] = offsetU; - URowIndex++; + + if (verbosity >= 3){ + std::ostringstream out; + out << "BILU0 ChowPatel postprocessing: " << t_postprocessing.stop() << " s"; + OpmLog::info(out.str()); + } + + Timer t_copyToGpu; + + events.resize(3); + queue->enqueueWriteBuffer(s.Lvals, CL_FALSE, 0, Lmat->nnzbs * bs * bs * sizeof(double), Lmat->nnzValues, nullptr, &events[0]); + queue->enqueueWriteBuffer(s.Uvals, CL_FALSE, 0, Umat->nnzbs * bs * bs * sizeof(double), Utmp, nullptr, &events[1]); + queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, LUmat->Nb * bs * bs * sizeof(double), invDiagVals, nullptr, &events[2]); + + std::call_once(pattern_uploaded, [&](){ + // find the positions of each diagonal block + // must be done after reordering + for (int row = 0; row < Nb; ++row) { + int rowStart = LUmat->rowPointers[row]; + int rowEnd = LUmat->rowPointers[row+1]; + + auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row); + assert(candidate != 
LUmat->colIndices + rowEnd); + diagIndex[row] = candidate - LUmat->colIndices; + } + events.resize(8); + queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex, nullptr, &events[3]); + queue->enqueueWriteBuffer(s.Lcols, CL_FALSE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices, nullptr, &events[4]); + queue->enqueueWriteBuffer(s.Lrows, CL_FALSE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers, nullptr, &events[5]); + queue->enqueueWriteBuffer(s.Ucols, CL_FALSE, 0, Umat->nnzbs * sizeof(int), cols.data(), nullptr, &events[6]); + queue->enqueueWriteBuffer(s.Urows, CL_FALSE, 0, (Umat->Nb + 1) * sizeof(int), ptr.data(), nullptr, &events[7]); + }); + + cl::WaitForEvents(events); + events.clear(); + if (err != CL_SUCCESS) { + // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL + OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error"); + } + + if (verbosity >= 3){ + std::ostringstream out; + out << "BILU0 ChowPatel copy to GPU: " << t_copyToGpu.stop() << " s\n"; + out << "BILU0 ChowPatel total: " << t_total.stop() << " s"; + OpmLog::info(out.str()); } delete[] Utmp; - +#endif // CHOW_PATEL } template @@ -479,118 +536,68 @@ namespace bda OpmLog::info(out.str()); } - Timer t_decomposition; #if CHOW_PATEL chow_patel_decomposition(); #else - int i, j, ij, ik, jk; - int iRowStart, iRowEnd, jRowEnd; - double pivot[bs * bs]; - - int LSize = 0; - Opm::Detail::Inverter inverter; // reuse inverter to invert blocks - - // go through all rows - for (i = 0; i < LUmat->Nb; i++) { - iRowStart = LUmat->rowPointers[i]; - iRowEnd = LUmat->rowPointers[i + 1]; - - // go through all elements of the row - for (ij = iRowStart; ij < iRowEnd; ij++) { - j = LUmat->colIndices[ij]; - // if the element is the diagonal, store the index and go to next row - if (j == i) { - diagIndex[i] = ij; - break; - } - // if an element beyond the diagonal is reach, no diagonal was found - // throw an error now. TODO: perform reordering earlier to prevent this - if (j > i) { - std::ostringstream out; - out << "BILU0 Error could not find diagonal value in row: " << i; - OpmLog::error(out.str()); - return false; - } - - LSize++; - // calculate the pivot of this row - blockMult(LUmat->nnzValues + ij * bs * bs, invDiagVals + j * bs * bs, &pivot[0]); - - memcpy(LUmat->nnzValues + ij * bs * bs, &pivot[0], sizeof(double) * block_size * block_size); - - jRowEnd = LUmat->rowPointers[j + 1]; - jk = diagIndex[j] + 1; - ik = ij + 1; - // substract that row scaled by the pivot from this row. - while (ik < iRowEnd && jk < jRowEnd) { - if (LUmat->colIndices[ik] == LUmat->colIndices[jk]) { - blockMultSub(LUmat->nnzValues + ik * bs * bs, pivot, LUmat->nnzValues + jk * bs * bs); - ik++; - jk++; - } else { - if (LUmat->colIndices[ik] < LUmat->colIndices[jk]) - { ik++; } - else - { jk++; } - } - } - } - // store the inverse in the diagonal! 
- inverter(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs); - - memcpy(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, sizeof(double) * bs * bs); - } - - Lmat->rowPointers[0] = 0; - Umat->rowPointers[0] = 0; - - // Split the LU matrix into two by comparing column indices to diagonal indices - for (i = 0; i < LUmat->Nb; i++) { - int offsetL = Lmat->rowPointers[i]; - int rowSize = diagIndex[i] - LUmat->rowPointers[i]; - int offsetLU = LUmat->rowPointers[i]; - memcpy(Lmat->nnzValues + offsetL * bs * bs, LUmat->nnzValues + offsetLU * bs * bs, sizeof(double) * bs * bs * rowSize); - memcpy(Lmat->colIndices + offsetL, LUmat->colIndices + offsetLU, sizeof(int) * rowSize); - offsetL += rowSize; - Lmat->rowPointers[i + 1] = offsetL; - } - // Reverse the order or the (blocked) rows for the U matrix, - // because the rows are accessed in reverse order when applying the ILU0 - int URowIndex = 0; - for (i = LUmat->Nb - 1; i >= 0; i--) { - int offsetU = Umat->rowPointers[URowIndex]; - int rowSize = LUmat->rowPointers[i + 1] - diagIndex[i] - 1; - int offsetLU = diagIndex[i] + 1; - memcpy(Umat->nnzValues + offsetU * bs * bs, LUmat->nnzValues + offsetLU * bs * bs, sizeof(double) * bs * bs * rowSize); - memcpy(Umat->colIndices + offsetU, LUmat->colIndices + offsetLU, sizeof(int) * rowSize); - offsetU += rowSize; - Umat->rowPointers[URowIndex + 1] = offsetU; - URowIndex++; - } -#endif - if (verbosity >= 3) { - std::ostringstream out; - out << "BILU0 decomposition: " << t_decomposition.stop() << " s"; - OpmLog::info(out.str()); - } - Timer t_copyToGpu; - if (pattern_uploaded == false) { - queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices); - queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, Umat->nnzbs * sizeof(int), Umat->colIndices); - queue->enqueueWriteBuffer(s.Lrows, CL_TRUE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers); - queue->enqueueWriteBuffer(s.Urows, CL_TRUE, 0, (Umat->Nb + 1) * sizeof(int), Umat->rowPointers); - pattern_uploaded = true; + + events.resize(1); + queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]); + + std::call_once(pattern_uploaded, [&](){ + // find the positions of each diagonal block + // must be done after reordering + for (int row = 0; row < Nb; ++row) { + int rowStart = LUmat->rowPointers[row]; + int rowEnd = LUmat->rowPointers[row+1]; + + auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row); + assert(candidate != LUmat->colIndices + rowEnd); + diagIndex[row] = candidate - LUmat->colIndices; + } + events.resize(4); + queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex, nullptr, &events[1]); + queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]); + queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]); + }); + + cl::WaitForEvents(events); + events.clear(); + if (err != CL_SUCCESS) { + // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL + OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error"); } - queue->enqueueWriteBuffer(s.Lvals, CL_TRUE, 0, Lmat->nnzbs * sizeof(double) * bs * bs, Lmat->nnzValues); - queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, Umat->nnzbs * sizeof(double) * bs * bs, Umat->nnzValues); - queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, Nb * sizeof(double) * bs * bs, invDiagVals); + if 
(verbosity >= 3) { std::ostringstream out; out << "BILU0 copy to GPU: " << t_copyToGpu.stop() << " s"; OpmLog::info(out.str()); } + Timer t_decomposition; + std::ostringstream out; + cl::Event event; + for (int color = 0; color < numColors; ++color) { + const unsigned int firstRow = rowsPerColorPrefix[color]; + const unsigned int lastRow = rowsPerColorPrefix[color+1]; + const unsigned int work_group_size2 = 128; + const unsigned int num_work_groups2 = 1024; + const unsigned int total_work_items2 = num_work_groups2 * work_group_size2; + const unsigned int num_hwarps_per_group = work_group_size2 / 16; + const unsigned int lmem_per_work_group2 = num_hwarps_per_group * bs * bs * sizeof(double); // each block needs a pivot + if (verbosity >= 4) { + out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n"; + } + event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)), firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.invDiagVals, s.diagIndex, LUmat->Nb, cl::Local(lmem_per_work_group2)); + event.wait(); + } + + if (verbosity >= 3) { + out << "BILU0 decomposition: " << t_decomposition.stop() << " s"; + OpmLog::info(out.str()); + } +#endif // CHOW_PATEL + return true; } // end create_preconditioner() @@ -604,16 +611,24 @@ namespace bda Timer t_apply; for(int color = 0; color < numColors; ++color){ - event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, (unsigned int)Nb, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); +#if CHOW_PATEL + event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, s.diagIndex, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); +#else + event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); +#endif // event.wait(); } for(int color = numColors-1; color >= 0; --color){ - event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, (unsigned int)Nb, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); +#if CHOW_PATEL + event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); +#else + event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); +#endif // event.wait(); } - if (verbosity >= 3) { + if (verbosity >= 4) { event.wait(); std::ostringstream out; out << "BILU0 apply: " << t_apply.stop() << " s"; @@ -638,11 +653,13 @@ namespace bda } template void BILU0::setKernels( - cl::make_kernel *ILU_apply1_, - cl::make_kernel *ILU_apply2_ + cl::make_kernel *ILU_apply1_, + cl::make_kernel *ILU_apply2_, + cl::make_kernel *ilu_decomp_k_ ){ this->ILU_apply1 = ILU_apply1_; this->ILU_apply2 = ILU_apply2_; + this->ilu_decomp_k = ilu_decomp_k_; } @@ -657,8 +674,9 @@ template void BILU0::setOpenCLContext(cl::Context*); template void 
BILU0::setOpenCLQueue(cl::CommandQueue*); \ template void BILU0::setKernelParameters(unsigned int, unsigned int, unsigned int); \ template void BILU0::setKernels( \ - cl::make_kernel *, \ - cl::make_kernel * \ + cl::make_kernel *, \ + cl::make_kernel *, \ + cl::make_kernel * \ ); INSTANTIATE_BDA_FUNCTIONS(1); diff --git a/opm/simulators/linalg/bda/BILU0.hpp b/opm/simulators/linalg/bda/BILU0.hpp index 67d945851..aeca4817e 100644 --- a/opm/simulators/linalg/bda/BILU0.hpp +++ b/opm/simulators/linalg/bda/BILU0.hpp @@ -20,12 +20,27 @@ #ifndef BILU0_HPP #define BILU0_HPP +#include + #include #include #include +#include #include +// if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU +// if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in: +// FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896 +// if CHOW_PATEL_GPU is 0, the decomposition is done on CPU +// if CHOW_PATEL_GPU is 1, the decomposition is done by bda::FGPILU::decomposition() on GPU +// the apply phase of the ChowPatelIlu uses two triangular matrices: L and U +// the exact decomposition uses a full matrix LU which is the superposition of L and U +// ChowPatelIlu could also operate on a full matrix LU when L and U are merged, but it is generally better to keep them split +#define CHOW_PATEL 0 +#define CHOW_PATEL_GPU 1 + + namespace bda { @@ -40,33 +55,48 @@ namespace bda int Nb; // number of blockrows of the matrix int nnz; // number of nonzeroes of the matrix (scalar) int nnzbs; // number of blocks of the matrix - std::unique_ptr > Lmat = nullptr, Umat = nullptr, LUmat = nullptr; + std::unique_ptr > LUmat = nullptr; std::shared_ptr > rmat = nullptr; // only used with PAR_SIM +#if CHOW_PATEL + std::unique_ptr > Lmat = nullptr, Umat = nullptr; +#endif double *invDiagVals = nullptr; int *diagIndex = nullptr; - std::vector rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel + std::vector rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel + std::vector rowsPerColorPrefix; // the prefix sum of rowsPerColor int *toOrder = nullptr, *fromOrder = nullptr; int numColors; int verbosity; + std::once_flag pattern_uploaded; ILUReorder opencl_ilu_reorder; typedef struct { - cl::Buffer Lvals, Uvals, invDiagVals; - cl::Buffer Lcols, Lrows; - cl::Buffer Ucols, Urows; + cl::Buffer invDiagVals; + cl::Buffer diagIndex; cl::Buffer rowsPerColor; +#if CHOW_PATEL + cl::Buffer Lvals, Lcols, Lrows; + cl::Buffer Uvals, Ucols, Urows; +#else + cl::Buffer LUvals, LUcols, LUrows; +#endif } GPU_storage; - cl::make_kernel *ILU_apply1; - cl::make_kernel *ILU_apply2; + ilu_apply1_kernel_type *ILU_apply1; + ilu_apply2_kernel_type *ILU_apply2; + cl::make_kernel *ilu_decomp_k; + GPU_storage s; cl::Context *context; cl::CommandQueue *queue; + std::vector events; + cl_int err; int work_group_size = 0; int total_work_items = 0; int lmem_per_work_group = 0; - bool pattern_uploaded = false; ChowPatelIlu chowPatelIlu; @@ -91,8 +121,9 @@ namespace bda void setOpenCLQueue(cl::CommandQueue *queue); void setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group); void setKernels( - cl::make_kernel *ILU_apply1, - cl::make_kernel *ILU_apply2 + cl::make_kernel *ILU_apply1, + cl::make_kernel *ILU_apply2, + cl::make_kernel *ilu_decomp_k ); int* getToOrder() diff --git a/opm/simulators/linalg/bda/BdaBridge.cpp 
b/opm/simulators/linalg/bda/BdaBridge.cpp index 39d403cb7..9f1bbca16 100644 --- a/opm/simulators/linalg/bda/BdaBridge.cpp +++ b/opm/simulators/linalg/bda/BdaBridge.cpp @@ -28,6 +28,19 @@ #include #include +#if HAVE_CUDA +#include +#endif + +#if HAVE_OPENCL +#include +#endif + +#if HAVE_FPGA +#include +#endif + + #define PRINT_TIMERS_BRIDGE 0 typedef Dune::InverseOperatorResult InverseOperatorResult; @@ -41,7 +54,8 @@ namespace Opm using bda::ILUReorder; template -BdaBridge::BdaBridge(std::string gpu_mode, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID OPM_UNUSED, unsigned int deviceID, std::string opencl_ilu_reorder OPM_UNUSED) +BdaBridge::BdaBridge(std::string gpu_mode_, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID OPM_UNUSED, unsigned int deviceID, std::string opencl_ilu_reorder OPM_UNUSED) +: gpu_mode(gpu_mode_) { if (gpu_mode.compare("cusparse") == 0) { #if HAVE_CUDA @@ -224,6 +238,19 @@ void BdaBridge::get_result(BridgeVector } } +template +void BdaBridge::initWellContributions(WellContributions& wellContribs) { + if(gpu_mode.compare("opencl") == 0){ +#if HAVE_OPENCL + const auto openclBackend = static_cast*>(backend.get()); + wellContribs.setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get()); +#else + OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake"); +#endif + } +} + + #define INSTANTIATE_BDA_FUNCTIONS(n) \ template BdaBridge, std::allocator > >, \ Dune::BlockVector, std::allocator > >, \ @@ -241,6 +268,11 @@ template void BdaBridge, std::al Dune::BlockVector, std::allocator > >, \ n>::get_result \ (Dune::BlockVector, std::allocator > >&); \ + \ +template void BdaBridge, std::allocator > >, \ +Dune::BlockVector, std::allocator > >, \ +n>::initWellContributions(WellContributions&) + INSTANTIATE_BDA_FUNCTIONS(1); INSTANTIATE_BDA_FUNCTIONS(2); diff --git a/opm/simulators/linalg/bda/BdaBridge.hpp b/opm/simulators/linalg/bda/BdaBridge.hpp index 962acc9b4..a8d3bcc1a 100644 --- a/opm/simulators/linalg/bda/BdaBridge.hpp +++ b/opm/simulators/linalg/bda/BdaBridge.hpp @@ -25,16 +25,10 @@ #include "dune/istl/bcrsmatrix.hh" #include +#include #include #include -#if HAVE_CUDA -#include -#endif - -#if HAVE_OPENCL -#include -#endif namespace Opm { @@ -48,6 +42,7 @@ class BdaBridge { private: bool use_gpu = false; + std::string gpu_mode; std::unique_ptr > backend; public: @@ -79,9 +74,11 @@ public: return use_gpu; } - const bda::BdaSolver& getBackend() const { - return *backend; - } + /// Initialize the WellContributions object with opencl context and queue + /// those must be set before calling BlackOilWellModel::getWellContributions() in ISTL + /// \param[in] wellContribs container to hold all WellContributions + void initWellContributions(WellContributions& wellContribs); + }; // end class BdaBridge diff --git a/opm/simulators/linalg/bda/ChowPatelIlu.cpp b/opm/simulators/linalg/bda/ChowPatelIlu.cpp index 894b3f05c..b1ac6448b 100644 --- a/opm/simulators/linalg/bda/ChowPatelIlu.cpp +++ b/opm/simulators/linalg/bda/ChowPatelIlu.cpp @@ -529,7 +529,7 @@ void ChowPatelIlu::decomposition( err |= queue->enqueueWriteBuffer(d_LU_cols, CL_FALSE, 0, sizeof(int) * LU_nnzbs, LU_cols, nullptr, &events[5]); cl::WaitForEvents(events); events.clear(); - if (verbosity >= 3){ + if (verbosity >= 4){ std::ostringstream out; out << "ChowPatelIlu copy sparsity pattern time: " << t_copy_pattern.stop() << " s"; OpmLog::info(out.str()); @@ -550,7 +550,7 @@ void ChowPatelIlu::decomposition( 
err |= queue->enqueueWriteBuffer(d_LU_vals, CL_FALSE, 0, sizeof(double) * LU_nnzbs * block_size * block_size, LU_vals, nullptr, &events[2]); cl::WaitForEvents(events); events.clear(); - if (verbosity >= 3){ + if (verbosity >= 4){ std::ostringstream out; out << "ChowPatelIlu copy1 time: " << t_copy1.stop() << " s"; OpmLog::info(out.str()); @@ -585,7 +585,7 @@ void ChowPatelIlu::decomposition( d_Ut_idxs, d_L_cols, d_LU_cols, *Larg2, *Uarg2, Nb, cl::Local(lmem_per_work_group), cl::Local(lmem_per_work_group)); event.wait(); - if (verbosity >= 3){ + if (verbosity >= 4){ std::ostringstream out; out << "ChowPatelIlu sweep kernel time: " << t_kernel.stop() << " s"; OpmLog::info(out.str()); @@ -604,7 +604,7 @@ void ChowPatelIlu::decomposition( } cl::WaitForEvents(events); events.clear(); - if (verbosity >= 3){ + if (verbosity >= 4){ std::ostringstream out; out << "ChowPatelIlu copy2 time: " << t_copy2.stop() << " s"; OpmLog::info(out.str()); diff --git a/opm/simulators/linalg/bda/Reorder.cpp b/opm/simulators/linalg/bda/Reorder.cpp index b7eebee57..cae3b35f9 100644 --- a/opm/simulators/linalg/bda/Reorder.cpp +++ b/opm/simulators/linalg/bda/Reorder.cpp @@ -201,11 +201,10 @@ void reorderBlockedMatrixByPattern(BlockedMatrix *mat, int *toOrder, void colorsToReordering(int Nb, std::vector& colors, int numColors, int *toOrder, int *fromOrder, std::vector& rowsPerColor) { int reordered = 0; - int i, c; // Find reordering patterns - for (c = 0; c < numColors; c++) { - for (i = 0; i < Nb; i++) { + for (int c = 0; c < numColors; c++) { + for (int i = 0; i < Nb; i++) { if (colors[i] == c) { rowsPerColor[c]++; toOrder[i] = reordered; @@ -259,10 +258,11 @@ void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowInd int activeRowIndex = 0, colorEnd, nextActiveRowIndex = 0; int thisRow; std::vector doneRows(Nb, false); - rowsPerColor.reserve(Nb); - std::vector rowsToStart; + // since emplace_back() is used to fill, the vector must be empty + assert(rowsPerColor.size() == 0); + // find starting rows: rows that are independent from all rows that come before them. 
for (thisRow = 0; thisRow < Nb; thisRow++) { if (canBeStarted(thisRow, CSCColPointers, CSCRowIndices, doneRows)) { @@ -312,8 +312,7 @@ void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowInd template void findGraphColoring(const int *CSRColIndices, const int *CSRRowPointers, const int *CSCRowIndices, const int *CSCColPointers, int Nb, int maxRowsPerColor, int maxColsPerColor, int *numColors, int *toOrder, int *fromOrder, std::vector& rowsPerColor) { - std::vector rowColor; - rowColor.resize(Nb); + std::vector rowColor(Nb); *numColors = colorBlockedNodes(Nb, CSRRowPointers, CSRColIndices, CSCColPointers, CSCRowIndices, rowColor, maxRowsPerColor, maxColsPerColor); diff --git a/opm/simulators/linalg/bda/Reorder.hpp b/opm/simulators/linalg/bda/Reorder.hpp index 7ae2c90db..08c8716ff 100644 --- a/opm/simulators/linalg/bda/Reorder.hpp +++ b/opm/simulators/linalg/bda/Reorder.hpp @@ -90,7 +90,7 @@ bool canBeStarted(const int rowIndex, const int *rowPointers, const int *colIn /// \param[out] numColors a pointer to the number of colors needed for the level scheduling /// \param[inout] toOrder the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved /// \param[inout] fromOrder the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved -/// \param[inout] rowsPerColor for each used color, the number of rows assigned to that color +/// \param[inout] rowsPerColor for each used color, the number of rows assigned to that color, this function uses emplace_back() to fill void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowIndices, int *CSCColPointers, int Nb, int *numColors, int *toOrder, int* fromOrder, std::vector& rowsPerColor); /// Find a graph coloring reordering for an input matrix @@ -105,7 +105,7 @@ void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowInd /// \param[out] numColors the number of colors used in the found graph coloring /// \param[inout] toOrder the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved /// \param[inout] fromOrder the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved -/// \param[inout] rowsPerColor for each used color, the number of rows assigned to that color +/// \param[inout] rowsPerColor for each used color, the number of rows assigned to that color, this function will resize() template void findGraphColoring(const int *CSRColIndices, const int *CSRRowPointers, const int *CSCRowIndices, const int *CSCColPointers, int Nb, int maxRowsPerColor, int maxColsPerColor, int *numColors, int *toOrder, int *fromOrder, std::vector& rowsPerColor); diff --git a/opm/simulators/linalg/bda/WellContributions.cpp b/opm/simulators/linalg/bda/WellContributions.cpp index f17c53c3b..7d9c4dc0b 100644 --- a/opm/simulators/linalg/bda/WellContributions.cpp +++ b/opm/simulators/linalg/bda/WellContributions.cpp @@ -22,7 +22,7 @@ #include #include #include -#include + #include namespace Opm @@ -74,7 +74,7 @@ void WellContributions::setOpenCLEnv(cl::Context *context_, cl::CommandQueue *qu this->queue = queue_; } -void WellContributions::setKernel(kernel_type *kernel_, kernel_type_no_reorder *kernel_no_reorder_){ +void WellContributions::setKernel(stdwell_apply_kernel_type *kernel_, stdwell_apply_no_reorder_kernel_type 
*kernel_no_reorder_){ this->kernel = kernel_; this->kernel_no_reorder = kernel_no_reorder_; } diff --git a/opm/simulators/linalg/bda/WellContributions.hpp b/opm/simulators/linalg/bda/WellContributions.hpp index f3f8e3fd2..00addf546 100644 --- a/opm/simulators/linalg/bda/WellContributions.hpp +++ b/opm/simulators/linalg/bda/WellContributions.hpp @@ -26,6 +26,7 @@ #if HAVE_OPENCL #include +#include #endif #include @@ -39,6 +40,9 @@ namespace Opm { +using bda::stdwell_apply_kernel_type; +using bda::stdwell_apply_no_reorder_kernel_type; + /// This class serves to eliminate the need to include the WellContributions into the matrix (with --matrix-add-well-contributions=true) for the cusparseSolver /// If the --matrix-add-well-contributions commandline parameter is true, this class should not be used /// So far, StandardWell and MultisegmentWell are supported @@ -92,17 +96,10 @@ private: std::vector multisegments; #if HAVE_OPENCL - typedef cl::make_kernel kernel_type; - typedef cl::make_kernel kernel_type_no_reorder; - cl::Context *context; cl::CommandQueue *queue; - kernel_type *kernel; - kernel_type_no_reorder *kernel_no_reorder; + stdwell_apply_kernel_type *kernel; + stdwell_apply_no_reorder_kernel_type *kernel_no_reorder; std::vector events; std::unique_ptr d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl; @@ -154,7 +151,7 @@ public: #endif #if HAVE_OPENCL - void setKernel(kernel_type *kernel_, kernel_type_no_reorder *kernel_no_reorder_); + void setKernel(stdwell_apply_kernel_type *kernel_, stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_); void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_); /// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect diff --git a/opm/simulators/linalg/bda/openclKernels.cpp b/opm/simulators/linalg/bda/openclKernels.cpp new file mode 100644 index 000000000..c82339035 --- /dev/null +++ b/opm/simulators/linalg/bda/openclKernels.cpp @@ -0,0 +1,621 @@ +/* + Copyright 2020 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include + +namespace bda +{ + + std::string get_axpy_string() { + return R"( + __kernel void axpy( + __global double *in, + const double a, + __global double *out, + const int N) + { + unsigned int NUM_THREADS = get_global_size(0); + int idx = get_global_id(0); + + while(idx < N){ + out[idx] += a * in[idx]; + idx += NUM_THREADS; + } + } + )"; + } + + + // returns partial sums, instead of the final dot product + std::string get_dot_1_string() { + return R"( + __kernel void dot_1( + __global double *in1, + __global double *in2, + __global double *out, + const unsigned int N, + __local double *tmp) + { + unsigned int tid = get_local_id(0); + unsigned int bsize = get_local_size(0); + unsigned int bid = get_global_id(0) / bsize; + unsigned int i = get_global_id(0); + unsigned int NUM_THREADS = get_global_size(0); + + double sum = 0.0; + while(i < N){ + sum += in1[i] * in2[i]; + i += NUM_THREADS; + } + tmp[tid] = sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + // do reduction in shared mem + for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1) + { + if (tid < s) + { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // write result for this block to global mem + if (tid == 0) out[get_group_id(0)] = tmp[0]; + } + )"; + } + + + // returns partial sums, instead of the final norm + // the square root must be computed on CPU + std::string get_norm_string() { + return R"( + __kernel void norm( + __global double *in, + __global double *out, + const unsigned int N, + __local double *tmp) + { + unsigned int tid = get_local_id(0); + unsigned int bsize = get_local_size(0); + unsigned int bid = get_global_id(0) / bsize; + unsigned int i = get_global_id(0); + unsigned int NUM_THREADS = get_global_size(0); + + double local_sum = 0.0; + while(i < N){ + local_sum += in[i] * in[i]; + i += NUM_THREADS; + } + tmp[tid] = local_sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + // do reduction in shared mem + for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1) + { + if (tid < s) + { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // write result for this block to global mem + if (tid == 0) out[get_group_id(0)] = tmp[0]; + } + )"; + } + + + // p = (p - omega * v) * beta + r + std::string get_custom_string() { + return R"( + __kernel void custom( + __global double *p, + __global double *v, + __global double *r, + const double omega, + const double beta, + const int N) + { + const unsigned int NUM_THREADS = get_global_size(0); + unsigned int idx = get_global_id(0); + + while(idx < N){ + double res = p[idx]; + res -= omega * v[idx]; + res *= beta; + res += r[idx]; + p[idx] = res; + idx += NUM_THREADS; + } + } + )"; + } + + std::string get_spmv_blocked_string() { + return R"( + __kernel void spmv_blocked( + __global const double *vals, + __global const int *cols, + __global const int *rows, + const int Nb, + __global const double *x, + __global double *b, + const unsigned int block_size, + __local double *tmp) + { + const unsigned int warpsize = 32; + const unsigned int bsize = get_local_size(0); + const unsigned int idx_b = get_global_id(0) / bsize; + const unsigned int idx_t = get_local_id(0); + unsigned int idx = idx_b * bsize + idx_t; + const unsigned int bs = block_size; + const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; + const unsigned int num_blocks_per_warp = warpsize/bs/bs; + const unsigned int NUM_THREADS = get_global_size(0); + const unsigned int num_warps_in_grid = NUM_THREADS / warpsize; + unsigned int target_block_row = idx / warpsize; + 
const unsigned int lane = idx_t % warpsize; + const unsigned int c = (lane / bs) % bs; + const unsigned int r = lane % bs; + + // for 3x3 blocks: + // num_active_threads: 27 + // num_blocks_per_warp: 3 + + while(target_block_row < Nb){ + unsigned int first_block = rows[target_block_row]; + unsigned int last_block = rows[target_block_row+1]; + unsigned int block = first_block + lane / (bs*bs); + double local_out = 0.0; + + if(lane < num_active_threads){ + for(; block < last_block; block += num_blocks_per_warp){ + double x_elem = x[cols[block]*bs + c]; + double A_elem = vals[block*bs*bs + c + r*bs]; + local_out += x_elem * A_elem; + } + } + + // do reduction in shared mem + tmp[lane] = local_out; + barrier(CLK_LOCAL_MEM_FENCE); + + for(unsigned int offset = 3; offset <= 24; offset <<= 1) + { + if (lane + offset < warpsize) + { + tmp[lane] += tmp[lane + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if(lane < bs){ + unsigned int row = target_block_row*bs + lane; + b[row] = tmp[lane]; + } + target_block_row += num_warps_in_grid; + } + } + )"; + } + + + + std::string get_ILU_apply1_string(bool full_matrix) { + std::string s = R"( + __kernel void ILU_apply1( + __global const double *LUvals, + __global const unsigned int *LUcols, + __global const unsigned int *LUrows, + __global const int *diagIndex, + __global const double *y, + __global double *x, + __global const unsigned int *nodesPerColorPrefix, + const unsigned int color, + const unsigned int block_size, + __local double *tmp) + { + const unsigned int warpsize = 32; + const unsigned int bs = block_size; + const unsigned int idx_t = get_local_id(0); + const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; + const unsigned int num_blocks_per_warp = warpsize/bs/bs; + const unsigned int NUM_THREADS = get_global_size(0); + const unsigned int num_warps_in_grid = NUM_THREADS / warpsize; + unsigned int idx = get_global_id(0); + unsigned int target_block_row = idx / warpsize; + target_block_row += nodesPerColorPrefix[color]; + const unsigned int lane = idx_t % warpsize; + const unsigned int c = (lane / bs) % bs; + const unsigned int r = lane % bs; + + while(target_block_row < nodesPerColorPrefix[color+1]){ + const unsigned int first_block = LUrows[target_block_row]; + )"; + if (full_matrix) { + s += "const unsigned int last_block = diagIndex[target_block_row]; "; + } else { + s += "const unsigned int last_block = LUrows[target_block_row+1]; "; + } + s += R"( + unsigned int block = first_block + lane / (bs*bs); + double local_out = 0.0; + if(lane < num_active_threads){ + if(lane < bs){ + local_out = y[target_block_row*bs+lane]; + } + for(; block < last_block; block += num_blocks_per_warp){ + const double x_elem = x[LUcols[block]*bs + c]; + const double A_elem = LUvals[block*bs*bs + c + r*bs]; + local_out -= x_elem * A_elem; + } + } + + // do reduction in shared mem + tmp[lane] = local_out; + barrier(CLK_LOCAL_MEM_FENCE); + + for(unsigned int offset = 3; offset <= 24; offset <<= 1) + { + if (lane + offset < warpsize) + { + tmp[lane] += tmp[lane + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if(lane < bs){ + const unsigned int row = target_block_row*bs + lane; + x[row] = tmp[lane]; + } + + target_block_row += num_warps_in_grid; + } + } + )"; + return s; + } + + + std::string get_ILU_apply2_string(bool full_matrix) { + std::string s = R"( + __kernel void ILU_apply2( + __global const double *LUvals, + __global const int *LUcols, + __global const int *LUrows, + __global const int *diagIndex, + __global const double *invDiagVals, + 
__global double *x, + __global const unsigned int *nodesPerColorPrefix, + const unsigned int color, + const unsigned int block_size, + __local double *tmp) + { + const unsigned int warpsize = 32; + const unsigned int bs = block_size; + const unsigned int idx_t = get_local_id(0); + const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; + const unsigned int num_blocks_per_warp = warpsize/bs/bs; + const unsigned int NUM_THREADS = get_global_size(0); + const unsigned int num_warps_in_grid = NUM_THREADS / warpsize; + unsigned int idx = get_global_id(0); + unsigned int target_block_row = idx / warpsize; + target_block_row += nodesPerColorPrefix[color]; + const unsigned int lane = idx_t % warpsize; + const unsigned int c = (lane / bs) % bs; + const unsigned int r = lane % bs; + const double relaxation = 0.9; + + while(target_block_row < nodesPerColorPrefix[color+1]){ + )"; + if (full_matrix) { + s += "const unsigned int first_block = diagIndex[target_block_row] + 1; "; + } else { + s += "const unsigned int first_block = LUrows[target_block_row]; "; + } + s += R"( + const unsigned int last_block = LUrows[target_block_row+1]; + unsigned int block = first_block + lane / (bs*bs); + double local_out = 0.0; + if(lane < num_active_threads){ + if(lane < bs){ + const unsigned int row = target_block_row*bs+lane; + local_out = x[row]; + } + for(; block < last_block; block += num_blocks_per_warp){ + const double x_elem = x[LUcols[block]*bs + c]; + const double A_elem = LUvals[block*bs*bs + c + r*bs]; + local_out -= x_elem * A_elem; + } + } + + // do reduction in shared mem + tmp[lane] = local_out; + barrier(CLK_LOCAL_MEM_FENCE); + + for(unsigned int offset = 3; offset <= 24; offset <<= 1) + { + if (lane + offset < warpsize) + { + tmp[lane] += tmp[lane + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + local_out = tmp[lane]; + + if(lane < bs){ + tmp[lane + bs*idx_t/warpsize] = local_out; + double sum = 0.0; + for(int i = 0; i < bs; ++i){ + sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize]; + } + + const unsigned int row = target_block_row*bs + lane; + x[row] = relaxation * sum; + } + + target_block_row += num_warps_in_grid; + } + } + )"; + return s; + } + + std::string get_stdwell_apply_string(bool reorder) { + std::string kernel_name = reorder ? 
"stdwell_apply" : "stdwell_apply_no_reorder"; + std::string s = "__kernel void " + kernel_name + R"(( + __global const double *Cnnzs, + __global const double *Dnnzs, + __global const double *Bnnzs, + __global const int *Ccols, + __global const int *Bcols, + __global const double *x, + __global double *y, + )"; + if (reorder) { + s += R"(__global const int *toOrder, + )"; + } + s += R"(const unsigned int dim, + const unsigned int dim_wells, + __global const unsigned int *val_pointers, + __local double *localSum, + __local double *z1, + __local double *z2){ + int wgId = get_group_id(0); + int wiId = get_local_id(0); + int valSize = val_pointers[wgId + 1] - val_pointers[wgId]; + int valsPerBlock = dim*dim_wells; + int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock; + int numBlocksPerWarp = get_local_size(0)/valsPerBlock; + int c = wiId % dim; + int r = (wiId/dim) % dim_wells; + double temp; + + barrier(CLK_LOCAL_MEM_FENCE); + + localSum[wiId] = 0; + if(wiId < numActiveWorkItems){ + int b = wiId/valsPerBlock + val_pointers[wgId]; + while(b < valSize + val_pointers[wgId]){ + )"; + if (reorder) { + s += "int colIdx = toOrder[Bcols[b]]; "; + } else { + s += "int colIdx = Bcols[b]; "; + } + s += R"( + localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c]; + b += numBlocksPerWarp; + } + + if(wiId < valsPerBlock){ + localSum[wiId] += localSum[wiId + valsPerBlock]; + } + + b = wiId/valsPerBlock + val_pointers[wgId]; + + if(c == 0 && wiId < valsPerBlock){ + for(unsigned int stride = 2; stride > 0; stride >>= 1){ + localSum[wiId] += localSum[wiId + stride]; + } + z1[r] = localSum[wiId]; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if(wiId < dim_wells){ + temp = 0.0; + for(unsigned int i = 0; i < dim_wells; ++i){ + temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i]; + } + z2[wiId] = temp; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if(wiId < dim*valSize){ + temp = 0.0; + int bb = wiId/dim + val_pointers[wgId]; + for (unsigned int j = 0; j < dim_wells; ++j){ + temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j]; + } + )"; + if (reorder) { + s += "int colIdx = toOrder[Ccols[bb]]; "; + } else { + s += "int colIdx = Ccols[bb]; "; + } + s += R"( + y[colIdx*dim + c] -= temp; + } + } + )"; + return s; + } + + + std::string get_ilu_decomp_string() { + return R"( + + // a = a - (b * c) + __kernel void block_mult_sub(__global double *a, __local double *b, __global double *c) + { + const unsigned int block_size = 3; + const unsigned int hwarp_size = 16; + const unsigned int idx_t = get_local_id(0); // thread id in work group + const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads) + if(thread_id_in_hwarp < block_size * block_size){ + const unsigned int row = thread_id_in_hwarp / block_size; + const unsigned int col = thread_id_in_hwarp % block_size; + double temp = 0.0; + for (unsigned int k = 0; k < block_size; k++) { + temp += b[block_size * row + k] * c[block_size * k + col]; + } + a[block_size * row + col] -= temp; + } + } + + // c = a * b + __kernel void block_mult(__global double *a, __global double *b, __local double *c) { + const unsigned int block_size = 3; + const unsigned int hwarp_size = 16; + const unsigned int idx_t = get_local_id(0); // thread id in work group + const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads) + if(thread_id_in_hwarp < block_size * block_size){ + const unsigned int row = thread_id_in_hwarp / block_size; + const unsigned int col = thread_id_in_hwarp % 
block_size; + double temp = 0.0; + for (unsigned int k = 0; k < block_size; k++) { + temp += a[block_size * row + k] * b[block_size * k + col]; + } + c[block_size * row + col] = temp; + } + } + + // invert 3x3 matrix + __kernel void inverter(__global double *matrix, __global double *inverse) { + const unsigned int block_size = 3; + const unsigned int bs = block_size; // rename to shorter name + const unsigned int hwarp_size = 16; + const unsigned int idx_t = get_local_id(0); // thread id in work group + const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads) + if(thread_id_in_hwarp < bs * bs){ + double t4 = matrix[0] * matrix[4]; + double t6 = matrix[0] * matrix[5]; + double t8 = matrix[1] * matrix[3]; + double t10 = matrix[2] * matrix[3]; + double t12 = matrix[1] * matrix[6]; + double t14 = matrix[2] * matrix[6]; + + double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] + + t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]); + double t17 = 1.0 / det; + + const unsigned int r = thread_id_in_hwarp / bs; + const unsigned int c = thread_id_in_hwarp % bs; + const unsigned int r1 = (r+1) % bs; + const unsigned int c1 = (c+1) % bs; + const unsigned int r2 = (r+bs-1) % bs; + const unsigned int c2 = (c+bs-1) % bs; + inverse[c*bs+r] = ((matrix[r1*bs+c1] * matrix[r2*bs+c2]) - (matrix[r1*bs+c2] * matrix[r2*bs+c1])) * t17; + } + } + + __kernel void ilu_decomp(const unsigned int firstRow, + const unsigned int lastRow, + __global double *LUvals, + __global const int *LUcols, + __global const int *LUrows, + __global double *invDiagVals, + __global int *diagIndex, + const unsigned int Nb, + __local double *pivot){ + + const unsigned int bs = 3; + const unsigned int hwarp_size = 16; + const unsigned int work_group_size = get_local_size(0); + const unsigned int work_group_id = get_group_id(0); + const unsigned int num_groups = get_num_groups(0); + const unsigned int hwarps_per_group = work_group_size / hwarp_size; + const unsigned int thread_id_in_group = get_local_id(0); // thread id in work group + const unsigned int thread_id_in_hwarp = thread_id_in_group % hwarp_size; // thread id in hwarp (16 threads) + const unsigned int hwarp_id_in_group = thread_id_in_group / hwarp_size; + const unsigned int lmem_offset = hwarp_id_in_group * bs * bs; // each workgroup gets some lmem, but the workitems have to share it + // every workitem in a hwarp has the same lmem_offset + + // go through all rows + for (int i = firstRow + work_group_id * hwarps_per_group + hwarp_id_in_group; i < lastRow; i += num_groups * hwarps_per_group) + { + int iRowStart = LUrows[i]; + int iRowEnd = LUrows[i + 1]; + + // go through all elements of the row + for (int ij = iRowStart; ij < iRowEnd; ij++) { + int j = LUcols[ij]; + + if (j < i) { + // calculate the pivot of this row + block_mult(LUvals + ij * bs * bs, invDiagVals + j * bs * bs, pivot + lmem_offset); + + // copy pivot + if (thread_id_in_hwarp < bs * bs) { + LUvals[ij * bs * bs + thread_id_in_hwarp] = pivot[lmem_offset + thread_id_in_hwarp]; + } + + int jRowEnd = LUrows[j + 1]; + int jk = diagIndex[j] + 1; + int ik = ij + 1; + // subtract that row scaled by the pivot from this row. 
+ while (ik < iRowEnd && jk < jRowEnd) { + if (LUcols[ik] == LUcols[jk]) { + block_mult_sub(LUvals + ik * bs * bs, pivot + lmem_offset, LUvals + jk * bs * bs); + ik++; + jk++; + } else { + if (LUcols[ik] < LUcols[jk]) + { ik++; } + else + { jk++; } + } + } + } + } + + // store the inverse in the diagonal + inverter(LUvals + diagIndex[i] * bs * bs, invDiagVals + i * bs * bs); + + // copy inverse + if (thread_id_in_hwarp < bs * bs) { + LUvals[diagIndex[i] * bs * bs + thread_id_in_hwarp] = invDiagVals[i * bs * bs + thread_id_in_hwarp]; + } + + } + } + )"; + } + +} // end namespace bda + diff --git a/opm/simulators/linalg/bda/openclKernels.hpp b/opm/simulators/linalg/bda/openclKernels.hpp index e3ffadf60..a408cf9ec 100644 --- a/opm/simulators/linalg/bda/openclKernels.hpp +++ b/opm/simulators/linalg/bda/openclKernels.hpp @@ -20,498 +20,76 @@ #ifndef OPENCL_HPP #define OPENCL_HPP +#include + +#include + namespace bda { - inline const char* axpy_s = R"( - __kernel void axpy( - __global double *in, - const double a, - __global double *out, - const int N) - { - unsigned int NUM_THREADS = get_global_size(0); - int idx = get_global_id(0); +using spmv_kernel_type = cl::make_kernel; +using ilu_apply1_kernel_type = cl::make_kernel; +using ilu_apply2_kernel_type = cl::make_kernel; +using stdwell_apply_kernel_type = cl::make_kernel; +using stdwell_apply_no_reorder_kernel_type = cl::make_kernel; +using ilu_decomp_kernel_type = cl::make_kernel; - while(idx < N){ - out[idx] += a * in[idx]; - idx += NUM_THREADS; - } - } - )"; + /// Generate string with axpy kernel + /// a = a + alpha * b + std::string get_axpy_string(); + /// returns partial sums, instead of the final dot product + /// partial sums are added on CPU + std::string get_dot_1_string(); - // returns partial sums, instead of the final dot product - inline const char* dot_1_s = R"( - __kernel void dot_1( - __global double *in1, - __global double *in2, - __global double *out, - const unsigned int N, - __local double *tmp) - { - unsigned int tid = get_local_id(0); - unsigned int bsize = get_local_size(0); - unsigned int bid = get_global_id(0) / bsize; - unsigned int i = get_global_id(0); - unsigned int NUM_THREADS = get_global_size(0); + /// returns partial sums, instead of the final norm + /// the square root must be computed on CPU + std::string get_norm_string(); - double sum = 0.0; - while(i < N){ - sum += in1[i] * in2[i]; - i += NUM_THREADS; - } - tmp[tid] = sum; + /// Generate string with custom kernel + /// This kernel combines some ilubicgstab vector operations into 1 + /// p = (p - omega * v) * beta + r + std::string get_custom_string(); - barrier(CLK_LOCAL_MEM_FENCE); + /// b = mat * x + /// algorithm based on: + /// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-Memory Parallel Architectures, + /// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42 + std::string get_spmv_blocked_string(); - // do reduction in shared mem - for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1) - { - if (tid < s) - { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } + /// ILU apply part 1: forward substitution + /// solves L*x=y where L is a lower triangular sparse blocked matrix + /// this L can be its own BSR matrix (if full_matrix is false), + /// or it can be inside a normal, square matrix, in that case diagIndex indicates where the rows of L end + /// \param[in] full_matrix whether the kernel should operate on a full (square) matrix or not + std::string get_ILU_apply1_string(bool 
full_matrix); - // write result for this block to global mem - if (tid == 0) out[get_group_id(0)] = tmp[0]; - } - )"; + /// ILU apply part 2: backward substitution + /// solves U*x=y where U is an upper triangular sparse blocked matrix + /// this U can be its own BSR matrix (if full_matrix is false), + /// or it can be inside a normal, square matrix, in that case diagIndex indicates where the rows of U start + /// \param[in] full_matrix whether the kernel should operate on a full (square) matrix or not + std::string get_ILU_apply2_string(bool full_matrix); + /// Generate string with the stdwell_apply kernels + /// If reorder is true, the B/Ccols do not correspond with the x/y vector + /// the x/y vector is reordered, use toOrder to address that + /// \param[in] reorder whether the matrix is reordered or not + std::string get_stdwell_apply_string(bool reorder); - // returns partial sums, instead of the final norm - // the square root must be computed on CPU - inline const char* norm_s = R"( - __kernel void norm( - __global double *in, - __global double *out, - const unsigned int N, - __local double *tmp) - { - unsigned int tid = get_local_id(0); - unsigned int bsize = get_local_size(0); - unsigned int bid = get_global_id(0) / bsize; - unsigned int i = get_global_id(0); - unsigned int NUM_THREADS = get_global_size(0); - - double local_sum = 0.0; - while(i < N){ - local_sum += in[i] * in[i]; - i += NUM_THREADS; - } - tmp[tid] = local_sum; - - barrier(CLK_LOCAL_MEM_FENCE); - - // do reduction in shared mem - for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1) - { - if (tid < s) - { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // write result for this block to global mem - if (tid == 0) out[get_group_id(0)] = tmp[0]; - } - )"; - - - // p = (p - omega * v) * beta + r - inline const char* custom_s = R"( - __kernel void custom( - __global double *p, - __global double *v, - __global double *r, - const double omega, - const double beta, - const int N) - { - const unsigned int NUM_THREADS = get_global_size(0); - unsigned int idx = get_global_id(0); - - while(idx < N){ - double res = p[idx]; - res -= omega * v[idx]; - res *= beta; - res += r[idx]; - p[idx] = res; - idx += NUM_THREADS; - } - } - )"; - - - // b = mat * x - // algorithm based on: - // Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures, - // Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42 - inline const char* spmv_blocked_s = R"( - __kernel void spmv_blocked( - __global const double *vals, - __global const int *cols, - __global const int *rows, - const int Nb, - __global const double *x, - __global double *b, - const unsigned int block_size, - __local double *tmp) - { - const unsigned int warpsize = 32; - const unsigned int bsize = get_local_size(0); - const unsigned int idx_b = get_global_id(0) / bsize; - const unsigned int idx_t = get_local_id(0); - unsigned int idx = idx_b * bsize + idx_t; - const unsigned int bs = block_size; - const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; - const unsigned int num_blocks_per_warp = warpsize/bs/bs; - const unsigned int NUM_THREADS = get_global_size(0); - const unsigned int num_warps_in_grid = NUM_THREADS / warpsize; - unsigned int target_block_row = idx / warpsize; - const unsigned int lane = idx_t % warpsize; - const unsigned int c = (lane / bs) % bs; - const unsigned int r = lane % bs; - - // for 3x3 blocks: - // num_active_threads: 27 - // num_blocks_per_warp: 3 - - 
while(target_block_row < Nb){ - unsigned int first_block = rows[target_block_row]; - unsigned int last_block = rows[target_block_row+1]; - unsigned int block = first_block + lane / (bs*bs); - double local_out = 0.0; - - if(lane < num_active_threads){ - for(; block < last_block; block += num_blocks_per_warp){ - double x_elem = x[cols[block]*bs + c]; - double A_elem = vals[block*bs*bs + c + r*bs]; - local_out += x_elem * A_elem; - } - } - - // do reduction in shared mem - tmp[lane] = local_out; - barrier(CLK_LOCAL_MEM_FENCE); - - for(unsigned int offset = 3; offset <= 24; offset <<= 1) - { - if (lane + offset < warpsize) - { - tmp[lane] += tmp[lane + offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if(lane < bs){ - unsigned int row = target_block_row*bs + lane; - b[row] = tmp[lane]; - } - target_block_row += num_warps_in_grid; - } - } - )"; - - - - // ILU apply part 1: forward substitution - // solves L*x=y where L is a lower triangular sparse blocked matrix - inline const char* ILU_apply1_s = R"( - __kernel void ILU_apply1( - __global const double *Lvals, - __global const unsigned int *Lcols, - __global const unsigned int *Lrows, - const unsigned int LrowSize, - __global const double *y, - __global double *x, - __global const unsigned int *nodesPerColorPrefix, - const unsigned int color, - const unsigned int block_size, - __local double *tmp) - { - const unsigned int warpsize = 32; - const unsigned int bs = block_size; - const unsigned int idx_t = get_local_id(0); - const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; - const unsigned int num_blocks_per_warp = warpsize/bs/bs; - const unsigned int NUM_THREADS = get_global_size(0); - const unsigned int num_warps_in_grid = NUM_THREADS / warpsize; - unsigned int idx = get_global_id(0); - unsigned int target_block_row = idx / warpsize; - idx += nodesPerColorPrefix[color]; - target_block_row += nodesPerColorPrefix[color]; - const unsigned int lane = idx_t % warpsize; - const unsigned int c = (lane / bs) % bs; - const unsigned int r = lane % bs; - - while(target_block_row < nodesPerColorPrefix[color+1]){ - const unsigned int first_block = Lrows[target_block_row]; - const unsigned int last_block = Lrows[target_block_row+1]; - unsigned int block = first_block + lane / (bs*bs); - double local_out = 0.0; - if(lane < num_active_threads){ - if(lane < bs){ - local_out = y[target_block_row*bs+lane]; - } - for(; block < last_block; block += num_blocks_per_warp){ - const double x_elem = x[Lcols[block]*bs + c]; - const double A_elem = Lvals[block*bs*bs + c + r*bs]; - local_out -= x_elem * A_elem; - } - } - - // do reduction in shared mem - tmp[lane] = local_out; - barrier(CLK_LOCAL_MEM_FENCE); - - for(unsigned int offset = 3; offset <= 24; offset <<= 1) - { - if (lane + offset < warpsize) - { - tmp[lane] += tmp[lane + offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if(lane < bs){ - const unsigned int row = target_block_row*bs + lane; - x[row] = tmp[lane]; - } - - target_block_row += num_warps_in_grid; - } - } - )"; - - - // ILU apply part 2: backward substitution - // solves U*x=y where L is a lower triangular sparse blocked matrix - inline const char* ILU_apply2_s = R"( - __kernel void ILU_apply2( - __global const double *Uvals, - __global const int *Ucols, - __global const int *Urows, - const unsigned int UrowSize, - __global const double *invDiagVals, - __global double *x, - __global const unsigned int *nodesPerColorPrefix, - const unsigned int color, - const unsigned int block_size, - __local double *tmp) - { - const unsigned int 
warpsize = 32; - const unsigned int bs = block_size; - const unsigned int idx_t = get_local_id(0); - const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; - const unsigned int num_blocks_per_warp = warpsize/bs/bs; - const unsigned int NUM_THREADS = get_global_size(0); - const unsigned int num_warps_in_grid = NUM_THREADS / warpsize; - unsigned int idx = get_global_id(0); - unsigned int target_block_row = idx / warpsize; - idx += nodesPerColorPrefix[color]; - target_block_row += nodesPerColorPrefix[color]; - const unsigned int lane = idx_t % warpsize; - const unsigned int c = (lane / bs) % bs; - const unsigned int r = lane % bs; - const double relaxation = 0.9; - - while(target_block_row < nodesPerColorPrefix[color+1]){ - const unsigned int first_block = Urows[UrowSize-target_block_row-1]; - const unsigned int last_block = Urows[UrowSize-target_block_row]; - unsigned int block = first_block + lane / (bs*bs); - double local_out = 0.0; - if(lane < num_active_threads){ - if(lane < bs){ - const unsigned int row = target_block_row*bs+lane; - local_out = x[row]; - } - for(; block < last_block; block += num_blocks_per_warp){ - const double x_elem = x[Ucols[block]*bs + c]; - const double A_elem = Uvals[block*bs*bs + c + r*bs]; - local_out -= x_elem * A_elem; - } - } - - // do reduction in shared mem - tmp[lane] = local_out; - barrier(CLK_LOCAL_MEM_FENCE); - - for(unsigned int offset = 3; offset <= 24; offset <<= 1) - { - if (lane + offset < warpsize) - { - tmp[lane] += tmp[lane + offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - local_out = tmp[lane]; - - if(lane < bs){ - tmp[lane + bs*idx_t/warpsize] = local_out; - double sum = 0.0; - for(int i = 0; i < bs; ++i){ - sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize]; - } - - const unsigned int row = target_block_row*bs + lane; - x[row] = relaxation * sum; - } - - target_block_row += num_warps_in_grid; - } - } - )"; - - inline const char* stdwell_apply_s = R"( - __kernel void stdwell_apply(__global const double *Cnnzs, - __global const double *Dnnzs, - __global const double *Bnnzs, - __global const int *Ccols, - __global const int *Bcols, - __global const double *x, - __global double *y, - __global const int *toOrder, - const unsigned int dim, - const unsigned int dim_wells, - __global const unsigned int *val_pointers, - __local double *localSum, - __local double *z1, - __local double *z2){ - int wgId = get_group_id(0); - int wiId = get_local_id(0); - int valSize = val_pointers[wgId + 1] - val_pointers[wgId]; - int valsPerBlock = dim*dim_wells; - int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock; - int numBlocksPerWarp = get_local_size(0)/valsPerBlock; - int c = wiId % dim; - int r = (wiId/dim) % dim_wells; - double temp; - - barrier(CLK_LOCAL_MEM_FENCE); - - localSum[wiId] = 0; - if(wiId < numActiveWorkItems){ - int b = wiId/valsPerBlock + val_pointers[wgId]; - while(b < valSize + val_pointers[wgId]){ - int colIdx = toOrder[Bcols[b]]; - localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c]; - b += numBlocksPerWarp; - } - - if(wiId < valsPerBlock){ - localSum[wiId] += localSum[wiId + valsPerBlock]; - } - - b = wiId/valsPerBlock + val_pointers[wgId]; - - if(c == 0 && wiId < valsPerBlock){ - for(unsigned int stride = 2; stride > 0; stride >>= 1){ - localSum[wiId] += localSum[wiId + stride]; - } - z1[r] = localSum[wiId]; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(wiId < dim_wells){ - temp = 0.0; - for(unsigned int i = 0; i < dim_wells; ++i){ - temp += 
Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i]; - } - z2[wiId] = temp; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(wiId < dim*valSize){ - temp = 0.0; - int bb = wiId/dim + val_pointers[wgId]; - for (unsigned int j = 0; j < dim_wells; ++j){ - temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j]; - } - int colIdx = toOrder[Ccols[bb]]; - y[colIdx*dim + c] -= temp; - } - } - )"; - - inline const char* stdwell_apply_no_reorder_s = R"( - __kernel void stdwell_apply_no_reorder(__global const double *Cnnzs, - __global const double *Dnnzs, - __global const double *Bnnzs, - __global const int *Ccols, - __global const int *Bcols, - __global const double *x, - __global double *y, - const unsigned int dim, - const unsigned int dim_wells, - __global const unsigned int *val_pointers, - __local double *localSum, - __local double *z1, - __local double *z2){ - int wgId = get_group_id(0); - int wiId = get_local_id(0); - int valSize = val_pointers[wgId + 1] - val_pointers[wgId]; - int valsPerBlock = dim*dim_wells; - int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock; - int numBlocksPerWarp = get_local_size(0)/valsPerBlock; - int c = wiId % dim; - int r = (wiId/dim) % dim_wells; - double temp; - - barrier(CLK_LOCAL_MEM_FENCE); - - localSum[wiId] = 0; - if(wiId < numActiveWorkItems){ - int b = wiId/valsPerBlock + val_pointers[wgId]; - while(b < valSize + val_pointers[wgId]){ - int colIdx = Bcols[b]; - localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c]; - b += numBlocksPerWarp; - } - - if(wiId < valsPerBlock){ - localSum[wiId] += localSum[wiId + valsPerBlock]; - } - - b = wiId/valsPerBlock + val_pointers[wgId]; - - if(c == 0 && wiId < valsPerBlock){ - for(unsigned int stride = 2; stride > 0; stride >>= 1){ - localSum[wiId] += localSum[wiId + stride]; - } - z1[r] = localSum[wiId]; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(wiId < dim_wells){ - temp = 0.0; - for(unsigned int i = 0; i < dim_wells; ++i){ - temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i]; - } - z2[wiId] = temp; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(wiId < dim*valSize){ - temp = 0.0; - int bb = wiId/dim + val_pointers[wgId]; - for (unsigned int j = 0; j < dim_wells; ++j){ - temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j]; - } - int colIdx = Ccols[bb]; - y[colIdx*dim + c] -= temp; - } - } - )"; + /// Generate string with the exact ilu decomposition kernel + /// The kernel takes a full BSR matrix and performs inplace ILU decomposition + std::string get_ilu_decomp_string(); } // end namespace bda diff --git a/opm/simulators/linalg/bda/openclSolverBackend.cpp b/opm/simulators/linalg/bda/openclSolverBackend.cpp index 43ba050e2..e71f5c9c1 100644 --- a/opm/simulators/linalg/bda/openclSolverBackend.cpp +++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp @@ -26,7 +26,6 @@ #include #include -#include #include #include @@ -46,7 +45,6 @@ template openclSolverBackend::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder_) : BdaSolver(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_reorder(opencl_ilu_reorder_) { prec = new Preconditioner(opencl_ilu_reorder, verbosity_); - cl_int err = CL_SUCCESS; std::ostringstream out; try { std::vector platforms; @@ -332,20 +330,23 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr //applyblockedscaleadd(-1.0, mat, x, r); // set initial values - cl::Event event; - queue->enqueueFillBuffer(d_p, 0, 0, 
sizeof(double) * N);
-        queue->enqueueFillBuffer(d_v, 0, 0, sizeof(double) * N, nullptr, &event);
-        event.wait();
+        events.resize(5);
+        err = queue->enqueueFillBuffer(d_p, 0, 0, sizeof(double) * N, nullptr, &events[0]);
+        err |= queue->enqueueFillBuffer(d_v, 0, 0, sizeof(double) * N, nullptr, &events[1]);
 
         rho = 1.0;
         alpha = 1.0;
         omega = 1.0;
 
-        queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(double) * N, nullptr, &event);
-        event.wait();
-        queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(double) * N, nullptr, &event);
-        event.wait();
-        queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &event);
-        event.wait();
+        err |= queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(double) * N, nullptr, &events[2]);
+        err |= queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(double) * N, nullptr, &events[3]);
+        err |= queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &events[4]);
+
+        cl::WaitForEvents(events);
+        events.clear();
+        if (err != CL_SUCCESS) {
+            // enqueue[Fill|Copy]Buffer returns a cl_int and does not throw exceptions like the C++ OpenCL API
+            OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueue[Fill|Copy]Buffer error");
+        }
 
         norm = norm_w(d_r, d_tmp);
         norm_0 = norm;
@@ -475,18 +476,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
     out.clear();
 
     try {
-        cl::Program::Sources source(1, std::make_pair(axpy_s, strlen(axpy_s)));    // what does this '1' mean? cl::Program::Sources is of type 'std::vector<std::pair<const char*, ::size_t> >'
-        source.emplace_back(std::make_pair(dot_1_s, strlen(dot_1_s)));
-        source.emplace_back(std::make_pair(norm_s, strlen(norm_s)));
-        source.emplace_back(std::make_pair(custom_s, strlen(custom_s)));
-        source.emplace_back(std::make_pair(spmv_blocked_s, strlen(spmv_blocked_s)));
-        source.emplace_back(std::make_pair(ILU_apply1_s, strlen(ILU_apply1_s)));
-        source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
-        source.emplace_back(std::make_pair(stdwell_apply_s, strlen(stdwell_apply_s)));
-        source.emplace_back(std::make_pair(stdwell_apply_no_reorder_s, strlen(stdwell_apply_no_reorder_s)));
-        program = cl::Program(*context, source);
-        program.build(devices);
-
         prec->setOpenCLContext(context.get());
         prec->setOpenCLQueue(queue.get());
 
@@ -518,27 +507,9 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
             d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
         }
 
-        // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
-        // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
+        get_opencl_kernels();
 
-        // actually creating the kernels
-        dot_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1")));
-        norm_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
-        axpy_k.reset(new cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const int>(cl::Kernel(program, "axpy")));
-        custom_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const int>(cl::Kernel(program, "custom")));
-        spmv_blocked_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "spmv_blocked")));
-        ILU_apply1_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply1")));
-        ILU_apply2_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply2")));
-        stdwell_apply_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply")));
-        stdwell_apply_no_reorder_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply_no_reorder")));
-
-        prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());
+        prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get(), ilu_decomp_k.get());
 
     } catch (const cl::Error& error) {
         std::ostringstream oss;
@@ -554,6 +525,59 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
     initialized = true;
 } // end initialize()
 
+void add_kernel_string(cl::Program::Sources &sources, std::string &source) {
+    sources.emplace_back(std::make_pair(source.c_str(), source.size()));
+}
+
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::get_opencl_kernels() {
+
+    cl::Program::Sources sources;
+    std::string axpy_s = get_axpy_string();
+    add_kernel_string(sources, axpy_s);
+    std::string dot_1_s = get_dot_1_string();
+    add_kernel_string(sources, dot_1_s);
+    std::string norm_s = get_norm_string();
+    add_kernel_string(sources, norm_s);
+    std::string custom_s = get_custom_string();
+    add_kernel_string(sources, custom_s);
+    std::string spmv_blocked_s = get_spmv_blocked_string();
+    add_kernel_string(sources, spmv_blocked_s);
+#if CHOW_PATEL
+    bool ilu_operate_on_full_matrix = false;
+#else
+    bool ilu_operate_on_full_matrix = true;
+#endif
+    std::string ILU_apply1_s = get_ILU_apply1_string(ilu_operate_on_full_matrix);
+    add_kernel_string(sources, ILU_apply1_s);
+    std::string ILU_apply2_s = get_ILU_apply2_string(ilu_operate_on_full_matrix);
+    add_kernel_string(sources, ILU_apply2_s);
+    std::string stdwell_apply_s = get_stdwell_apply_string(true);
+    add_kernel_string(sources, stdwell_apply_s);
+    std::string stdwell_apply_no_reorder_s = get_stdwell_apply_string(false);
+    add_kernel_string(sources, stdwell_apply_no_reorder_s);
+    std::string ilu_decomp_s = get_ilu_decomp_string();
+    add_kernel_string(sources, ilu_decomp_s);
+
+    cl::Program program = cl::Program(*context, sources);
+    program.build(devices);
+
+    // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
+    // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
+
+    // actually creating the kernels
+    dot_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1")));
+    norm_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
+    axpy_k.reset(new cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const int>(cl::Kernel(program, "axpy")));
+    custom_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const int>(cl::Kernel(program, "custom")));
+    spmv_blocked_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_blocked")));
+    ILU_apply1_k.reset(new ilu_apply1_kernel_type(cl::Kernel(program, "ILU_apply1")));
+    ILU_apply2_k.reset(new ilu_apply2_kernel_type(cl::Kernel(program, "ILU_apply2")));
+    stdwell_apply_k.reset(new stdwell_apply_kernel_type(cl::Kernel(program, "stdwell_apply")));
+    stdwell_apply_no_reorder_k.reset(new stdwell_apply_no_reorder_kernel_type(cl::Kernel(program, "stdwell_apply_no_reorder")));
+    ilu_decomp_k.reset(new ilu_decomp_kernel_type(cl::Kernel(program, "ilu_decomp")));
+} // end get_opencl_kernels()
+
 template <unsigned int block_size>
 void openclSolverBackend<block_size>::finalize() {
     if (opencl_ilu_reorder != ILUReorder::NONE) {
@@ -569,7 +593,7 @@ void openclSolverBackend<block_size>::finalize() {
 
 template <unsigned int block_size>
 void openclSolverBackend<block_size>::copy_system_to_gpu() {
     Timer t;
-    cl::Event event;
+    events.resize(5);
 
 #if COPY_ROW_BY_ROW
     int sum = 0;
     for (int i = 0; i < Nb; ++i) {
         int size_row = rmat->rowPointers[i + 1] - rmat->rowPointers[i];
         memcpy(vals_contiguous + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
         sum += size_row * block_size * block_size;
     }
-    queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous);
+    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous, nullptr, &events[0]);
 #else
-    queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues);
+    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]);
 #endif
 
-    queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices);
-    queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers);
-    queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb);
+    err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices, nullptr, &events[1]);
+    err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers, nullptr, &events[2]);
+    err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb, nullptr, &events[3]);
+    err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[4]);
 
     if (opencl_ilu_reorder != ILUReorder::NONE) {
-        queue->enqueueWriteBuffer(d_toOrder, CL_TRUE, 0, sizeof(int) * Nb, toOrder);
+        events.resize(6);
+        err |= queue->enqueueWriteBuffer(d_toOrder, CL_TRUE, 0, sizeof(int) * Nb, toOrder, nullptr, &events[5]);
+    }
+    cl::WaitForEvents(events);
+    events.clear();
+    if (err != CL_SUCCESS) {
+        // enqueueWriteBuffer returns a cl_int and does not throw exceptions like the C++ OpenCL API
+        OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueueWriteBuffer error");
     }
-    queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
-    event.wait();
 
     if (verbosity > 2) {
         std::ostringstream out;
@@ -603,7 +633,7 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
 
 template <unsigned int block_size>
 void openclSolverBackend<block_size>::update_system_on_gpu() {
     Timer t;
-    cl::Event event;
+    events.resize(3);
 
 #if COPY_ROW_BY_ROW
     int sum = 0;
     for (int i = 0; i < Nb; ++i) {
         int size_row = rmat->rowPointers[i + 1] - rmat->rowPointers[i];
         memcpy(vals_contiguous + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
         sum += size_row * block_size * block_size;
     }
-    queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous);
+    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous, nullptr, &events[0]);
 #else
-    queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues);
+    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]);
 #endif
 
-    queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb);
-    queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
-    event.wait();
+    err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb, nullptr, &events[1]);
+    err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[2]);
+    cl::WaitForEvents(events);
+    events.clear();
+    if (err != CL_SUCCESS) {
+        // enqueueWriteBuffer returns a cl_int and does not throw exceptions like the C++ OpenCL API
+        OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueueWriteBuffer error");
+    }
 
     if (verbosity > 2) {
         std::ostringstream out;
diff --git a/opm/simulators/linalg/bda/openclSolverBackend.hpp b/opm/simulators/linalg/bda/openclSolverBackend.hpp
index aae527643..b57a8dfd6 100644
--- a/opm/simulators/linalg/bda/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp
@@ -21,6 +21,7 @@
 #define OPM_OPENCLSOLVER_BACKEND_HEADER_INCLUDED
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -64,22 +65,16 @@ private:
     // shared pointers are also passed to other objects
     std::vector<cl::Device> devices;
 
-    cl::Program program;
     std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
     std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
     std::unique_ptr<cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const int> > axpy_k;
     std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const int> > custom_k;
-    std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > spmv_blocked_k;
-    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply1_k;
-    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply2_k;
-    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > stdwell_apply_k;
-    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > stdwell_apply_no_reorder_k;
+    std::unique_ptr<spmv_kernel_type> spmv_blocked_k;
+    std::shared_ptr<ilu_apply1_kernel_type> ILU_apply1_k;
+    std::shared_ptr<ilu_apply2_kernel_type> ILU_apply2_k;
+    std::shared_ptr<stdwell_apply_kernel_type> stdwell_apply_k;
+    std::shared_ptr<stdwell_apply_no_reorder_kernel_type> stdwell_apply_no_reorder_k;
+    std::shared_ptr<ilu_decomp_kernel_type> ilu_decomp_k;
 
     Preconditioner *prec = nullptr; // only supported preconditioner is BILU0
     int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
@@ -87,6 +82,8 @@ private:
     std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr; // original matrix
     BlockedMatrix<block_size> *rmat = nullptr; // reordered matrix (or original if no reordering), used for spmv
     ILUReorder opencl_ilu_reorder; // reordering strategy
+    std::vector<cl::Event> events;
+    cl_int err;
 
     /// Divide A by B, and round up: return (int)ceil(A/B)
     /// \param[in] A    dividend
@@ -147,6 +144,9 @@ private:
     /// \param[in] cols   array of columnIndices, contains nnz values
     void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
 
+    /// Generate and compile OpenCL kernels
+    void get_opencl_kernels();
+
     /// Clean memory
     void finalize();
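
Reviewer note: the dot_1 and norm kernels above deliberately stop at one partial result per work-group; the final reduction happens on the host, as their doc comments say. A minimal sketch of that host-side step (not part of the patch; `finalize_dot`/`finalize_norm` and the `partials` vector are hypothetical names, assuming the partial results have already been read back from the kernel's output buffer):

```cpp
#include <cmath>
#include <vector>

// Each work-group wrote one partial sum; the host finishes the reduction.
double finalize_dot(const std::vector<double>& partials)
{
    double sum = 0.0;
    for (double p : partials) {
        sum += p;                       // add per-work-group partial sums on the CPU
    }
    return sum;                         // final dot product
}

double finalize_norm(const std::vector<double>& partials)
{
    // norm returns partial sums of squares; the square root is taken on the CPU
    return std::sqrt(finalize_dot(partials));
}
```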
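For reference, the "custom" kernel documented as p = (p - omega * v) * beta + r is simply a fused BiCGStab vector update. A serial C++ equivalent, illustrative only and not part of the patch:

```cpp
// Serial reference for the per-element update the "custom" kernel fuses on the GPU.
void custom_reference(double* p, const double* v, const double* r,
                      double omega, double beta, int N)
{
    for (int i = 0; i < N; ++i) {
        p[i] = (p[i] - omega * v[i]) * beta + r[i];
    }
}
```

Fusing these three BLAS-1-style operations into one kernel saves two full passes over the vectors per BiCGStab iteration, which is why the backend generates it as a single kernel.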
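The transfer routines in this patch all follow the same shape: record one cl::Event per enqueue, OR the returned cl_int codes together, synchronize once with cl::WaitForEvents, then check the accumulated code. A self-contained sketch of that pattern under stated assumptions (the function, buffer, and host-pointer names are hypothetical, and the include path mirrors the project's own OpenCL wrapper header):

```cpp
#include <stdexcept>
#include <vector>
#include <opm/simulators/linalg/bda/opencl.hpp> // assumed wrapper for the cl:: C++ bindings

void write_two_buffers(cl::CommandQueue& queue,
                       cl::Buffer& d_a, const void* h_a, size_t bytes_a,
                       cl::Buffer& d_b, const void* h_b, size_t bytes_b)
{
    std::vector<cl::Event> events(2);
    // non-blocking enqueues: each call records an event and returns a cl_int
    cl_int err = queue.enqueueWriteBuffer(d_a, CL_FALSE, 0, bytes_a, h_a, nullptr, &events[0]);
    err |= queue.enqueueWriteBuffer(d_b, CL_FALSE, 0, bytes_b, h_b, nullptr, &events[1]);
    cl::WaitForEvents(events);   // single synchronization point for all transfers
    if (err != CL_SUCCESS) {
        // OR-ing the codes loses the exact error value, but still detects that some call failed
        throw std::logic_error("enqueueWriteBuffer failed");
    }
}
```

The single wait replaces the previous per-call event.wait(), letting the driver overlap the transfers where it can.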