Exact ILU decomp is now performed on GPU

2025-02-25 18:55:30 -06:00 · 2021-02-23 14:15:47 +01:00
parent 748dc4ccb3
commit 141af23db5
8 changed files with 323 additions and 178 deletions
--- a/opm/simulators/linalg/bda/BILU0.cpp
+++ b/opm/simulators/linalg/bda/BILU0.cpp
@@ -18,6 +18,7 @@
 */

 #include <config.h>
+
 #include <opm/common/OpmLog/OpmLog.hpp>
 #include <opm/common/ErrorMacros.hpp>
 #include <opm/simulators/linalg/MatrixBlock.hpp>
@@ -32,14 +33,6 @@
 namespace bda
 {

-// if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU
-// if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in:
-//    FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896
-// if CHOW_PATEL_GPU is 0, the decomposition is done on CPU
-// if CHOW_PATEL_GPU is 1, the decomposition is done by bda::FGPILU::decomposition() on GPU
-#define CHOW_PATEL     0
-#define CHOW_PATEL_GPU 1
-
    using Opm::OpmLog;
    using Dune::Timer;

@@ -126,41 +119,46 @@ namespace bda
        diagIndex = new int[mat->Nb];
        invDiagVals = new double[mat->Nb * bs * bs];

+#if CHOW_PATEL
        Lmat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
        Umat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
+#endif
+
+        LUmat->nnzValues = new double[mat->nnzbs * bs * bs];

-        s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
-        s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Umat->nnzbs);
-        s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
-        s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Umat->nnzbs);
-        s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
-        s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Umat->Nb + 1));
        s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
        s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1));
+        s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb);
+#if CHOW_PATEL
+        s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
+        s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
+        s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
+        s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
+        s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
+        s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
+#else
+        s.LUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * LUmat->nnzbs);
+        s.LUcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->nnzbs);
+        s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1));
+#endif

-        queue->enqueueWriteBuffer(s.Lvals, CL_TRUE, 0, Lmat->nnzbs * sizeof(double) * bs * bs, Lmat->nnzValues);
-        queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, Umat->nnzbs * sizeof(double) * bs * bs, Umat->nnzValues);
-        queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices);
-        queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, Umat->nnzbs * sizeof(int), Umat->colIndices);
-        queue->enqueueWriteBuffer(s.Lrows, CL_TRUE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers);
-        queue->enqueueWriteBuffer(s.Urows, CL_TRUE, 0, (Umat->Nb + 1) * sizeof(int), Umat->rowPointers);
        queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals);

-        int *rowsPerColorPrefix = new int[numColors + 1];
-        rowsPerColorPrefix[0] = 0;
+        rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0
        for (int i = 0; i < numColors; ++i) {
            rowsPerColorPrefix[i+1] = rowsPerColorPrefix[i] + rowsPerColor[i];
        }
-        queue->enqueueWriteBuffer(s.rowsPerColor, CL_TRUE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix);
-        delete[] rowsPerColorPrefix;
+        queue->enqueueWriteBuffer(s.rowsPerColor, CL_TRUE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data());

        return true;
    } // end init()

+
    // implements Fine-Grained Parallel ILU algorithm (FGPILU), Chow and Patel 2015
    template <unsigned int block_size>
    void BILU0<block_size>::chow_patel_decomposition()
    {
+#if CHOW_PATEL
        const unsigned int bs = block_size;
        int num_sweeps = 6;

@@ -399,7 +397,7 @@ namespace bda
        // convert Ut to BSR
        // diagonal stored separately
        std::vector<int> ptr(Nb+1, 0);
-        std::vector<int> col(Ut->rowPointers[Nb]);
+        std::vector<int> cols(Ut->rowPointers[Nb]);

        // count blocks per row for U (BSR)
        // store diagonal in invDiagVals
@@ -423,7 +421,7 @@ namespace bda
                int j = Ut->colIndices[k];
                if (j != i) {
                    int head = ptr[j]++;
-                    col[head]  = i;
+                    cols[head]  = i;
                    memcpy(Utmp + head * bs * bs, Ut->nnzValues + k * bs * bs, sizeof(double) * bs * bs);
                }
            }
@@ -433,21 +431,39 @@ namespace bda
        std::rotate(ptr.begin(), ptr.end() - 1, ptr.end());
        ptr.front() = 0;

-        // reversing the rows of U, because that is the order they are used in during ILU apply
-        int URowIndex = 0;
-        int offsetU = 0;   // number of nnz blocks that are already copied to Umat
-        Umat->rowPointers[0] = 0;
-        for (int i = LUmat->Nb - 1; i >= 0; i--) {
-            int rowSize = ptr[i + 1] - ptr[i];   // number of blocks in this row
-            memcpy(Umat->nnzValues + offsetU * bs * bs, Utmp + ptr[i] * bs * bs, sizeof(double) * bs * bs * rowSize);
-            memcpy(Umat->colIndices + offsetU, col.data() + ptr[i], sizeof(int) * rowSize);
-            offsetU += rowSize;
-            Umat->rowPointers[URowIndex + 1] = offsetU;
-            URowIndex++;
-        }
+        std::call_once(pattern_uploaded, [&](){
+            // find the positions of each diagonal block
+            // must be done after reordering
+            for (int row = 0; row < Nb; ++row) {
+                int rowStart = LUmat->rowPointers[row];
+                int rowEnd = LUmat->rowPointers[row+1];
+                bool diagFound = false;
+                for (int ij = rowStart; ij < rowEnd; ++ij) {
+                    int col = LUmat->colIndices[ij];
+                    if (row == col) {
+                        diagIndex[row] = ij;
+                        diagFound = true;
+                        break;
+                    }
+                }
+                if (!diagFound) {
+                    OPM_THROW(std::logic_error, "Error did not find diagonal block in reordered matrix");
+                }
+            }
+            queue->enqueueWriteBuffer(s.diagIndex, CL_TRUE, 0, Nb * sizeof(int), diagIndex);
+            queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices);
+            queue->enqueueWriteBuffer(s.Lrows, CL_TRUE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers);
+            queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, Umat->nnzbs * sizeof(int), cols.data());
+            queue->enqueueWriteBuffer(s.Urows, CL_TRUE, 0, (Umat->Nb + 1) * sizeof(int), ptr.data());
+        });
+
+
+        queue->enqueueWriteBuffer(s.Lvals, CL_TRUE, 0, Lmat->nnzbs * bs * bs * sizeof(double), Lmat->nnzValues);
+        queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, Umat->nnzbs * bs * bs * sizeof(double), Utmp);
+        queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, LUmat->Nb * bs * bs * sizeof(double), invDiagVals);

        delete[] Utmp;
-
+#endif // CHOW_PATEL
    }

    template <unsigned int block_size>
@@ -479,118 +495,66 @@ namespace bda
            OpmLog::info(out.str());
        }

-        Timer t_decomposition;
 #if CHOW_PATEL
        chow_patel_decomposition();
 #else
-        int i, j, ij, ik, jk;
-        int iRowStart, iRowEnd, jRowEnd;
-        double pivot[bs * bs];
+        Timer t_copyToGpu;

-        int LSize = 0;
-        Opm::Detail::Inverter<bs> inverter;   // reuse inverter to invert blocks
+        queue->enqueueWriteBuffer(s.LUvals, CL_TRUE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues);

-        // go through all rows
-        for (i = 0; i < LUmat->Nb; i++) {
-            iRowStart = LUmat->rowPointers[i];
-            iRowEnd = LUmat->rowPointers[i + 1];
-
-            // go through all elements of the row
-            for (ij = iRowStart; ij < iRowEnd; ij++) {
-                j = LUmat->colIndices[ij];
-                // if the element is the diagonal, store the index and go to next row
-                if (j == i) {
-                    diagIndex[i] = ij;
-                    break;
-                }
-                // if an element beyond the diagonal is reach, no diagonal was found
-                // throw an error now. TODO: perform reordering earlier to prevent this
-                if (j > i) {
-                    std::ostringstream out;
-                    out << "BILU0 Error could not find diagonal value in row: " << i;
-                    OpmLog::error(out.str());
-                    return false;
-                }
-
-                LSize++;
-                // calculate the pivot of this row
-                blockMult<bs>(LUmat->nnzValues + ij * bs * bs, invDiagVals + j * bs * bs, &pivot[0]);
-
-                memcpy(LUmat->nnzValues + ij * bs * bs, &pivot[0], sizeof(double) * block_size * block_size);
-
-                jRowEnd = LUmat->rowPointers[j + 1];
-                jk = diagIndex[j] + 1;
-                ik = ij + 1;
-                // substract that row scaled by the pivot from this row.
-                while (ik < iRowEnd && jk < jRowEnd) {
-                    if (LUmat->colIndices[ik] == LUmat->colIndices[jk]) {
-                        blockMultSub<bs>(LUmat->nnzValues + ik * bs * bs, pivot, LUmat->nnzValues + jk * bs * bs);
-                        ik++;
-                        jk++;
-                    } else {
-                        if (LUmat->colIndices[ik] < LUmat->colIndices[jk])
-                        { ik++; }
-                        else
-                        { jk++; }
+        std::call_once(pattern_uploaded, [&](){
+            // find the positions of each diagonal block
+            // must be done after reordering
+            for (int row = 0; row < Nb; ++row) {
+                int rowStart = LUmat->rowPointers[row];
+                int rowEnd = LUmat->rowPointers[row+1];
+                bool diagFound = false;
+                for (int ij = rowStart; ij < rowEnd; ++ij) {
+                    int col = LUmat->colIndices[ij];
+                    if (row == col) {
+                        diagIndex[row] = ij;
+                        diagFound = true;
+                        break;
                    }
                }
+                if (!diagFound) {
+                    OPM_THROW(std::logic_error, "Error did not find diagonal block in reordered matrix");
+                }
            }
-            // store the inverse in the diagonal!
-            inverter(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs);
+            queue->enqueueWriteBuffer(s.diagIndex, CL_TRUE, 0, Nb * sizeof(int), diagIndex);
+            queue->enqueueWriteBuffer(s.LUcols, CL_TRUE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices);
+            queue->enqueueWriteBuffer(s.LUrows, CL_TRUE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers);
+        });

-            memcpy(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, sizeof(double) * bs * bs);
-        }
-
-        Lmat->rowPointers[0] = 0;
-        Umat->rowPointers[0] = 0;
-
-        // Split the LU matrix into two by comparing column indices to diagonal indices
-        for (i = 0; i < LUmat->Nb; i++) {
-            int offsetL = Lmat->rowPointers[i];
-            int rowSize = diagIndex[i] - LUmat->rowPointers[i];
-            int offsetLU = LUmat->rowPointers[i];
-            memcpy(Lmat->nnzValues + offsetL * bs * bs, LUmat->nnzValues + offsetLU * bs * bs, sizeof(double) * bs * bs * rowSize);
-            memcpy(Lmat->colIndices + offsetL, LUmat->colIndices + offsetLU, sizeof(int) * rowSize);
-            offsetL += rowSize;
-            Lmat->rowPointers[i + 1] = offsetL;
-        }
-        // Reverse the order or the (blocked) rows for the U matrix,
-        // because the rows are accessed in reverse order when applying the ILU0
-        int URowIndex = 0;
-        for (i = LUmat->Nb - 1; i >= 0; i--) {
-            int offsetU = Umat->rowPointers[URowIndex];
-            int rowSize = LUmat->rowPointers[i + 1] - diagIndex[i] - 1;
-            int offsetLU = diagIndex[i] + 1;
-            memcpy(Umat->nnzValues + offsetU * bs * bs, LUmat->nnzValues + offsetLU * bs * bs, sizeof(double) * bs * bs * rowSize);
-            memcpy(Umat->colIndices + offsetU, LUmat->colIndices + offsetLU, sizeof(int) * rowSize);
-            offsetU += rowSize;
-            Umat->rowPointers[URowIndex + 1] = offsetU;
-            URowIndex++;
-        }
-#endif
-        if (verbosity >= 3) {
-            std::ostringstream out;
-            out << "BILU0 decomposition: " << t_decomposition.stop() << " s";
-            OpmLog::info(out.str());
-        }
-
-        Timer t_copyToGpu;
-        if (pattern_uploaded == false) {
-            queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices);
-            queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, Umat->nnzbs * sizeof(int), Umat->colIndices);
-            queue->enqueueWriteBuffer(s.Lrows, CL_TRUE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers);
-            queue->enqueueWriteBuffer(s.Urows, CL_TRUE, 0, (Umat->Nb + 1) * sizeof(int), Umat->rowPointers);
-            pattern_uploaded = true;
-        }
-        queue->enqueueWriteBuffer(s.Lvals, CL_TRUE, 0, Lmat->nnzbs * sizeof(double) * bs * bs, Lmat->nnzValues);
-        queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, Umat->nnzbs * sizeof(double) * bs * bs, Umat->nnzValues);
-        queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, Nb * sizeof(double) * bs * bs, invDiagVals);
        if (verbosity >= 3) {
            std::ostringstream out;
            out << "BILU0 copy to GPU: " << t_copyToGpu.stop() << " s";
            OpmLog::info(out.str());
        }

+        Timer t_decomposition;
+        std::ostringstream out;
+        for (int color = 0; color < numColors; ++color) {
+            const unsigned int firstRow = rowsPerColorPrefix[color];
+            const unsigned int lastRow = rowsPerColorPrefix[color+1];
+            const unsigned int work_group_size2 = 128;
+            const unsigned int num_work_groups2 = 1024;
+            const unsigned int total_work_items2 = num_work_groups2 * work_group_size2;
+            const unsigned int num_hwarps_per_group = work_group_size2 / 16;
+            const unsigned int lmem_per_work_group2 = num_hwarps_per_group * bs * bs * sizeof(double);           // each block needs a pivot
+            if (verbosity >= 4) {
+                out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
+            }
+            cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)), firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.invDiagVals, s.diagIndex, LUmat->Nb, cl::Local(lmem_per_work_group2));
+            event.wait();
+        }
+
+        if (verbosity >= 3) {
+            out << "BILU0 decomposition: " << t_decomposition.stop() << " s";
+            OpmLog::info(out.str());
+        }
+#endif // CHOW_PATEL
+
        return true;
    } // end create_preconditioner()

@@ -604,16 +568,24 @@ namespace bda
        Timer t_apply;

        for(int color = 0; color < numColors; ++color){
-            event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, (unsigned int)Nb, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
+#if CHOW_PATEL
+            event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, s.diagIndex, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
+#else
+            event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
+#endif
            // event.wait();
        }

        for(int color = numColors-1; color >= 0; --color){
-            event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, (unsigned int)Nb, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
+#if CHOW_PATEL
+            event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
+#else
+            event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
+#endif
            // event.wait();
        }

-        if (verbosity >= 3) {
+        if (verbosity >= 4) {
            event.wait();
            std::ostringstream out;
            out << "BILU0 apply: " << t_apply.stop() << " s";
@@ -638,11 +610,13 @@ namespace bda
    }
    template <unsigned int block_size>
    void BILU0<block_size>::setKernels(
-        cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1_,
-        cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2_
+        cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1_,
+        cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2_,
+        cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp_k_
    ){
        this->ILU_apply1 = ILU_apply1_;
        this->ILU_apply2 = ILU_apply2_;
+        this->ilu_decomp_k = ilu_decomp_k_;
    }


@@ -657,8 +631,9 @@ template void BILU0<n>::setOpenCLContext(cl::Context*);
 template void BILU0<n>::setOpenCLQueue(cl::CommandQueue*);                               \
 template void BILU0<n>::setKernelParameters(unsigned int, unsigned int, unsigned int);   \
 template void BILU0<n>::setKernels(                                                      \
-    cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
-    cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *  \
+    cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
+    cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
+    cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *                 \
    );

 INSTANTIATE_BDA_FUNCTIONS(1);
--- a/opm/simulators/linalg/bda/BILU0.hpp
+++ b/opm/simulators/linalg/bda/BILU0.hpp
@@ -20,12 +20,26 @@
 #ifndef BILU0_HPP
 #define BILU0_HPP

+#include <mutex>
+
 #include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
 #include <opm/simulators/linalg/bda/ILUReorder.hpp>

 #include <opm/simulators/linalg/bda/opencl.hpp>
 #include <opm/simulators/linalg/bda/ChowPatelIlu.hpp>

+// if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU
+// if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in:
+//    FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896
+// if CHOW_PATEL_GPU is 0, the decomposition is done on CPU
+// if CHOW_PATEL_GPU is 1, the decomposition is done by bda::FGPILU::decomposition() on GPU
+// the apply phase of the ChowPatelIlu uses two triangular matrices: L and U
+// the exact decomposition uses a full matrix LU which is the superposition of L and U
+// ChowPatelIlu could also operate on a full matrix LU when L and U are merged, but it is generally better to keep them split
+#define CHOW_PATEL     0
+#define CHOW_PATEL_GPU 1
+
+
 namespace bda
 {

@@ -40,33 +54,46 @@ namespace bda
        int Nb;      // number of blockrows of the matrix
        int nnz;     // number of nonzeroes of the matrix (scalar)
        int nnzbs;   // number of blocks of the matrix
-        std::unique_ptr<BlockedMatrix<block_size> > Lmat = nullptr, Umat = nullptr, LUmat = nullptr;
+        std::unique_ptr<BlockedMatrix<block_size> > LUmat = nullptr;
        std::shared_ptr<BlockedMatrix<block_size> > rmat = nullptr; // only used with PAR_SIM
+#if CHOW_PATEL
+        std::unique_ptr<BlockedMatrix<block_size> > Lmat = nullptr, Umat = nullptr;
+#endif
        double *invDiagVals = nullptr;
        int *diagIndex = nullptr;
-        std::vector<int> rowsPerColor;  // color i contains rowsPerColor[i] rows, which are processed in parallel
+        std::vector<int> rowsPerColor;        // color i contains rowsPerColor[i] rows, which are processed in parallel
+        std::vector<int> rowsPerColorPrefix;  // the prefix sum of rowsPerColor
        int *toOrder = nullptr, *fromOrder = nullptr;
        int numColors;
        int verbosity;
+        std::once_flag pattern_uploaded;

        ILUReorder opencl_ilu_reorder;

        typedef struct {
-            cl::Buffer Lvals, Uvals, invDiagVals;
-            cl::Buffer Lcols, Lrows;
-            cl::Buffer Ucols, Urows;
+            cl::Buffer invDiagVals;
+            cl::Buffer diagIndex;
            cl::Buffer rowsPerColor;
+#if CHOW_PATEL
+            cl::Buffer Lvals, Lcols, Lrows;
+            cl::Buffer Uvals, Ucols, Urows;
+#else
+            cl::Buffer LUvals, LUcols, LUrows;
+#endif
        } GPU_storage;

-        cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1;
-        cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2;
+        cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1;
+        cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2;
+        cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&,
+                                        cl::Buffer&, cl::Buffer&,
+                                        const int, cl::LocalSpaceArg> *ilu_decomp_k;
+
        GPU_storage s;
        cl::Context *context;
        cl::CommandQueue *queue;
        int work_group_size = 0;
        int total_work_items = 0;
        int lmem_per_work_group = 0;
-        bool pattern_uploaded = false;

        ChowPatelIlu chowPatelIlu;

@@ -91,8 +118,9 @@ namespace bda
        void setOpenCLQueue(cl::CommandQueue *queue);
        void setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group);
        void setKernels(
-            cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1,
-            cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2
+            cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1,
+            cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2,
+            cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp_k
            );

        int* getToOrder()
--- a/opm/simulators/linalg/bda/Reorder.cpp
+++ b/opm/simulators/linalg/bda/Reorder.cpp
@@ -201,11 +201,10 @@ void reorderBlockedMatrixByPattern(BlockedMatrix<block_size> *mat, int *toOrder,

 void colorsToReordering(int Nb, std::vector<int>& colors, int numColors, int *toOrder, int *fromOrder, std::vector<int>& rowsPerColor) {
    int reordered = 0;
-    int i, c;

    // Find reordering patterns
-    for (c = 0; c < numColors; c++) {
-        for (i = 0; i < Nb; i++) {
+    for (int c = 0; c < numColors; c++) {
+        for (int i = 0; i < Nb; i++) {
            if (colors[i] == c) {
                rowsPerColor[c]++;
                toOrder[i] = reordered;
@@ -259,10 +258,11 @@ void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowInd
    int activeRowIndex = 0, colorEnd, nextActiveRowIndex = 0;
    int thisRow;
    std::vector<bool> doneRows(Nb, false);
-    rowsPerColor.reserve(Nb);
-
    std::vector <int> rowsToStart;

+    // since emplace_back() is used to fill, the vector must be empty
+    assert(rowsPerColor.size() == 0);
+
    // find starting rows: rows that are independent from all rows that come before them.
    for (thisRow = 0; thisRow < Nb; thisRow++) {
        if (canBeStarted(thisRow, CSCColPointers, CSCRowIndices, doneRows)) {
@@ -312,8 +312,7 @@ void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowInd

 template <unsigned int block_size>
 void findGraphColoring(const int *CSRColIndices, const int *CSRRowPointers, const int *CSCRowIndices, const int *CSCColPointers, int Nb, int maxRowsPerColor, int maxColsPerColor, int *numColors, int *toOrder, int *fromOrder, std::vector<int>& rowsPerColor) {
-    std::vector<int> rowColor;
-    rowColor.resize(Nb);
+    std::vector<int> rowColor(Nb);

    *numColors = colorBlockedNodes<block_size>(Nb, CSRRowPointers, CSRColIndices, CSCColPointers, CSCRowIndices, rowColor, maxRowsPerColor, maxColsPerColor);

--- a/opm/simulators/linalg/bda/Reorder.hpp
+++ b/opm/simulators/linalg/bda/Reorder.hpp
@@ -90,7 +90,7 @@ bool canBeStarted(const int rowIndex, const  int *rowPointers, const  int *colIn
 /// \param[out] numColors       a pointer to the number of colors needed for the level scheduling
 /// \param[inout] toOrder       the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved
 /// \param[inout] fromOrder     the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved
-/// \param[inout] rowsPerColor  for each used color, the number of rows assigned to that color
+/// \param[inout] rowsPerColor  for each used color, the number of rows assigned to that color, this function uses emplace_back() to fill
 void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowIndices, int *CSCColPointers, int Nb, int *numColors, int *toOrder, int* fromOrder, std::vector<int>& rowsPerColor);

 /// Find a graph coloring reordering for an input matrix
@@ -105,7 +105,7 @@ void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowInd
 /// \param[out] numColors       the number of colors used in the found graph coloring
 /// \param[inout] toOrder       the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved
 /// \param[inout] fromOrder     the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved
-/// \param[inout] rowsPerColor  for each used color, the number of rows assigned to that color
+/// \param[inout] rowsPerColor  for each used color, the number of rows assigned to that color, this function will resize()
 template <unsigned int block_size>
 void findGraphColoring(const int *CSRColIndices, const int *CSRRowPointers, const int *CSCRowIndices, const int *CSCColPointers, int Nb, int maxRowsPerColor, int maxColsPerColor, int *numColors, int *toOrder, int *fromOrder, std::vector<int>& rowsPerColor);

--- a/opm/simulators/linalg/bda/WellContributions.cpp
+++ b/opm/simulators/linalg/bda/WellContributions.cpp
@@ -22,7 +22,7 @@
 #include <cstring>
 #include <opm/common/OpmLog/OpmLog.hpp>
 #include <opm/common/ErrorMacros.hpp>
-#include <opm/simulators/linalg/bda/openclKernels.hpp>
+
 #include <opm/simulators/linalg/bda/WellContributions.hpp>

 namespace Opm
--- a/opm/simulators/linalg/bda/openclKernels.hpp
+++ b/opm/simulators/linalg/bda/openclKernels.hpp
@@ -225,7 +225,7 @@ namespace bda
        __global const double *Lvals,
        __global const unsigned int *Lcols,
        __global const unsigned int *Lrows,
-        const unsigned int LrowSize,
+        __global const int *diagIndex,
        __global const double *y,
        __global double *x,
        __global const unsigned int *nodesPerColorPrefix,
@@ -250,7 +250,7 @@ namespace bda

        while(target_block_row < nodesPerColorPrefix[color+1]){
            const unsigned int first_block = Lrows[target_block_row];
-            const unsigned int last_block = Lrows[target_block_row+1];
+            const unsigned int last_block = diagIndex[target_block_row];
            unsigned int block = first_block + lane / (bs*bs);
            double local_out = 0.0;
            if(lane < num_active_threads){
@@ -292,10 +292,10 @@ namespace bda
    // solves U*x=y where L is a lower triangular sparse blocked matrix
    inline const char* ILU_apply2_s = R"(
    __kernel void ILU_apply2(
-        __global const double *Uvals,
-        __global const int *Ucols,
-        __global const int *Urows,
-        const unsigned int UrowSize,
+        __global const double *LUvals,
+        __global const int *LUcols,
+        __global const int *LUrows,
+        __global const int *diagIndex,
        __global const double *invDiagVals,
        __global double *x,
        __global const unsigned int *nodesPerColorPrefix,
@@ -320,8 +320,8 @@ namespace bda
        const double relaxation = 0.9;

        while(target_block_row < nodesPerColorPrefix[color+1]){
-            const unsigned int first_block = Urows[UrowSize-target_block_row-1];
-            const unsigned int last_block = Urows[UrowSize-target_block_row];
+            const unsigned int first_block = diagIndex[target_block_row] + 1;
+            const unsigned int last_block = LUrows[target_block_row+1];
            unsigned int block = first_block + lane / (bs*bs);
            double local_out = 0.0;
            if(lane < num_active_threads){
@@ -330,8 +330,8 @@ namespace bda
                    local_out = x[row];
                }
                for(; block < last_block; block += num_blocks_per_warp){
-                    const double x_elem = x[Ucols[block]*bs + c];
-                    const double A_elem = Uvals[block*bs*bs + c + r*bs];
+                    const double x_elem = x[LUcols[block]*bs + c];
+                    const double A_elem = LUvals[block*bs*bs + c + r*bs];
                    local_out -= x_elem * A_elem;
                }
            }
@@ -513,6 +513,144 @@ namespace bda
    }
    )";

+    inline const char* ilu_decomp_s = R"(
+
+    // a = a - (b * c)
+    __kernel void block_mult_sub(__global double *a, __local double *b, __global double *c)
+    {
+        const unsigned int block_size = 3;
+        const unsigned int hwarp_size = 16;
+        const unsigned int idx_t = get_local_id(0);                   // thread id in work group
+        const unsigned int thread_id_in_hwarp = idx_t % hwarp_size;   // thread id in warp (16 threads)
+        if(thread_id_in_hwarp < block_size * block_size){
+            const unsigned int row = thread_id_in_hwarp / block_size;
+            const unsigned int col = thread_id_in_hwarp % block_size;
+            double temp = 0.0;
+            for (unsigned int k = 0; k < block_size; k++) {
+                temp += b[block_size * row + k] * c[block_size * k + col];
+            }
+            a[block_size * row + col] -= temp;
+        }
+    }
+
+    // c = a * b
+    __kernel void block_mult(__global double *a, __global double *b, __local double *c) {
+        const unsigned int block_size = 3;
+        const unsigned int hwarp_size = 16;
+        const unsigned int idx_t = get_local_id(0);                   // thread id in work group
+        const unsigned int thread_id_in_hwarp = idx_t % hwarp_size;   // thread id in warp (16 threads)
+        if(thread_id_in_hwarp < block_size * block_size){
+            const unsigned int row = thread_id_in_hwarp / block_size;
+            const unsigned int col = thread_id_in_hwarp % block_size;
+            double temp = 0.0;
+            for (unsigned int k = 0; k < block_size; k++) {
+                temp += a[block_size * row + k] * b[block_size * k + col];
+            }
+            c[block_size * row + col] = temp;
+        }
+    }
+
+    // invert 3x3 matrix
+    __kernel void inverter(__global double *matrix, __global double *inverse) {
+        const unsigned int block_size = 3;
+        const unsigned int bs = block_size;                           // rename to shorter name
+        const unsigned int hwarp_size = 16;
+        const unsigned int idx_t = get_local_id(0);                   // thread id in work group
+        const unsigned int thread_id_in_hwarp = idx_t % hwarp_size;   // thread id in warp (16 threads)
+        if(thread_id_in_hwarp < bs * bs){
+            double t4  = matrix[0] * matrix[4];
+            double t6  = matrix[0] * matrix[5];
+            double t8  = matrix[1] * matrix[3];
+            double t10 = matrix[2] * matrix[3];
+            double t12 = matrix[1] * matrix[6];
+            double t14 = matrix[2] * matrix[6];
+
+            double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] +
+                          t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]);
+            double t17 = 1.0 / det;
+
+            const unsigned int r = thread_id_in_hwarp / bs;
+            const unsigned int c = thread_id_in_hwarp % bs;
+            const unsigned int r1 = (r+1) % bs;
+            const unsigned int c1 = (c+1) % bs;
+            const unsigned int r2 = (r+bs-1) % bs;
+            const unsigned int c2 = (c+bs-1) % bs;
+            inverse[c*bs+r] = ((matrix[r1*bs+c1] * matrix[r2*bs+c2]) - (matrix[r1*bs+c2] * matrix[r2*bs+c1])) * t17;
+        }
+    }
+
+    __kernel void ilu_decomp(const unsigned int firstRow,
+                             const unsigned int lastRow,
+                             __global double *LUvals,
+                             __global const int *LUcols,
+                             __global const int *LUrows,
+                             __global double *invDiagVals,
+                             __global int *diagIndex,
+                             const unsigned int Nb,
+                             __local double *pivot){
+
+        const unsigned int bs = 3;
+        const unsigned int hwarp_size = 16;
+        const unsigned int work_group_size = get_local_size(0);
+        const unsigned int work_group_id = get_group_id(0);
+        const unsigned int num_groups = get_num_groups(0);
+        const unsigned int hwarps_per_group = work_group_size / hwarp_size;
+        const unsigned int thread_id_in_group = get_local_id(0);      // thread id in work group
+        const unsigned int thread_id_in_hwarp = thread_id_in_group % hwarp_size;     // thread id in hwarp (16 threads)
+        const unsigned int hwarp_id_in_group = thread_id_in_group / hwarp_size;
+        const unsigned int lmem_offset = hwarp_id_in_group * bs * bs;  // each workgroup gets some lmem, but the workitems have to share it
+                                                                       // every workitem in a hwarp has the same lmem_offset
+
+        // go through all rows
+        for (int i = firstRow + work_group_id * hwarps_per_group + hwarp_id_in_group; i < lastRow; i += num_groups * hwarps_per_group)
+        {
+            int iRowStart = LUrows[i];
+            int iRowEnd = LUrows[i + 1];
+
+            // go through all elements of the row
+            for (int ij = iRowStart; ij < iRowEnd; ij++) {
+                int j = LUcols[ij];
+
+                if (j < i) {
+                    // calculate the pivot of this row
+                    block_mult(LUvals + ij * bs * bs, invDiagVals + j * bs * bs, pivot + lmem_offset);
+
+                    // copy pivot
+                    if (thread_id_in_hwarp < bs * bs) {
+                        LUvals[ij * bs * bs + thread_id_in_hwarp] = pivot[lmem_offset + thread_id_in_hwarp];
+                    }
+
+                    int jRowEnd = LUrows[j + 1];
+                    int jk = diagIndex[j] + 1;
+                    int ik = ij + 1;
+                    // substract that row scaled by the pivot from this row.
+                    while (ik < iRowEnd && jk < jRowEnd) {
+                        if (LUcols[ik] == LUcols[jk]) {
+                            block_mult_sub(LUvals + ik * bs * bs, pivot + lmem_offset, LUvals + jk * bs * bs);
+                            ik++;
+                            jk++;
+                        } else {
+                            if (LUcols[ik] < LUcols[jk])
+                            { ik++; }
+                            else
+                            { jk++; }
+                        }
+                    }
+                }
+            }
+
+            // store the inverse in the diagonal
+            inverter(LUvals + diagIndex[i] * bs * bs, invDiagVals + i * bs * bs);
+
+            // copy inverse
+            if (thread_id_in_hwarp < bs * bs) {
+                LUvals[diagIndex[i] * bs * bs + thread_id_in_hwarp] = invDiagVals[i * bs * bs + thread_id_in_hwarp];
+            }
+
+        }
+    }
+    )";
+
 } // end namespace bda

 #endif
--- a/opm/simulators/linalg/bda/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp
@@ -484,6 +484,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
        source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
        source.emplace_back(std::make_pair(stdwell_apply_s, strlen(stdwell_apply_s)));
        source.emplace_back(std::make_pair(stdwell_apply_no_reorder_s, strlen(stdwell_apply_no_reorder_s)));
+        source.emplace_back(std::make_pair(ilu_decomp_s, strlen(ilu_decomp_s)));
        program = cl::Program(*context, source);
        program.build(devices);

@@ -527,8 +528,8 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
        axpy_k.reset(new cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
        custom_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
        spmv_blocked_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "spmv_blocked")));
-        ILU_apply1_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply1")));
-        ILU_apply2_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply2")));
+        ILU_apply1_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply1")));
+        ILU_apply2_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply2")));
        stdwell_apply_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                                  cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                                  const unsigned int, const unsigned int, cl::Buffer&,
@@ -537,8 +538,10 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
                                                  cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                                  const unsigned int, const unsigned int, cl::Buffer&,
                                                  cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply_no_reorder")));
+        ilu_decomp_k.reset(new cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
+                                               const int, cl::LocalSpaceArg>(cl::Kernel(program, "ilu_decomp")));

-        prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());
+        prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get(), ilu_decomp_k.get());

    } catch (const cl::Error& error) {
        std::ostringstream oss;
--- a/opm/simulators/linalg/bda/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp
@@ -70,8 +70,8 @@ private:
    std::unique_ptr<cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
    std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
    std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > spmv_blocked_k;
-    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply1_k;
-    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply2_k;
+    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply1_k;
+    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply2_k;
    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                    cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                    const unsigned int, const unsigned int, cl::Buffer&,
@@ -80,6 +80,8 @@ private:
                                    cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                    const unsigned int, const unsigned int, cl::Buffer&,
                                    cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > stdwell_apply_no_reorder_k;
+    std::shared_ptr<cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
+                                    const int, cl::LocalSpaceArg> > ilu_decomp_k;

    Preconditioner *prec = nullptr;                               // only supported preconditioner is BILU0
    int *toOrder = nullptr, *fromOrder = nullptr;                 // BILU0 reorders rows of the matrix via these mappings