Added CPU and GPU implementations of Fine-Grained Parallel ILU (FGPILU)

tqiu 2020-12-17 14:49:59 +01:00
parent 02fc2b9c53
commit 9f92a69037
10 changed files with 949 additions and 4 deletions

CMakeLists.txt

@ -58,6 +58,7 @@ if(OPENCL_FOUND)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BlockedMatrix.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BILU0.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Reorder.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/fgpilu.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
@ -168,6 +169,7 @@ list (APPEND PUBLIC_HEADER_FILES
opm/simulators/linalg/bda/BlockedMatrix.hpp
opm/simulators/linalg/bda/cuda_header.hpp
opm/simulators/linalg/bda/cusparseSolverBackend.hpp
opm/simulators/linalg/bda/fgpilu.hpp
opm/simulators/linalg/bda/Reorder.hpp
opm/simulators/linalg/bda/ILUReorder.hpp
opm/simulators/linalg/bda/opencl.hpp

opm/simulators/linalg/bda/BILU0.cpp

@ -25,12 +25,21 @@
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
#include <opm/simulators/linalg/bda/BILU0.hpp>
#include <opm/simulators/linalg/bda/fgpilu.hpp>
#include <opm/simulators/linalg/bda/Reorder.hpp>
namespace bda
{
// if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU
// if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in:
// FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896
// if CHOW_PATEL_GPU is 0, the decomposition is done on CPU
// if CHOW_PATEL_GPU is 1, the decomposition is done by bda::FGPILU::decomposition() on GPU
#define CHOW_PATEL 1
#define CHOW_PATEL_GPU 1
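// In outline, each sweep applies the block fixed-point updates from the paper,
// for every block (i,j) in the sparsity pattern (reading the previous sweep's values, Jacobi-style):
//   L(i,j) = ( A(i,j) - sum_{k<j} L(i,k)*U(k,j) ) * U(j,j)^{-1}   for i > j
//   U(i,j) =   A(i,j) - sum_{k<i} L(i,k)*U(k,j)                   for i <= j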
using Opm::OpmLog;
using Dune::Timer;
@ -82,11 +91,26 @@ namespace bda
} else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
out << "BILU0 reordering strategy: " << "graph_coloring\n";
findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder, fromOrder, rowsPerColor);
} else if (opencl_ilu_reorder == ILUReorder::NONE) {
out << "BILU0 reordering strategy: none\n";
numColors = 1;
rowsPerColor.emplace_back(Nb);
// numColors = Nb;
// for(int i = 0; i < Nb; ++i){
// rowsPerColor.emplace_back(1);
// }
for(int i = 0; i < Nb; ++i){
toOrder[i] = i;
fromOrder[i] = i;
}
} else {
OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
}
if(verbosity >= 3){
out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors";
out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n";
}
if(verbosity >= 2){
out << "BILU0 CHOW_PATEL: " << CHOW_PATEL << ", CHOW_PATEL_GPU: " << CHOW_PATEL_GPU;
}
OpmLog::info(out.str());
@ -129,6 +153,233 @@ namespace bda
return true;
} // end init()
// implements Fine-Grained Parallel ILU algorithm (FGPILU), Chow and Patel 2015
template <unsigned int block_size>
void BILU0<block_size>::chow_patel_decomposition()
{
const unsigned int bs = block_size;
int num_sweeps = 6;
// split matrix into L and U
// also convert U into BSC format (Ut)
// Ut stores diagonal for now
int num_blocks_L = 0;
// Ut is stored in BSC format: its rowPointers act as colPointers and its colIndices as rowIndices
std::unique_ptr<BlockedMatrix<bs> > Ut = std::make_unique<BlockedMatrix<bs> >(Umat->Nb, Umat->nnzbs + Umat->Nb);
Lmat->rowPointers[0] = 0;
for (int i = 0; i < Nb+1; i++) {
Ut->rowPointers[i] = 0;
}
// for every row
for (int i = 0; i < Nb; i++) {
int iRowStart = LUmat->rowPointers[i];
int iRowEnd = LUmat->rowPointers[i + 1];
// for every block in this row
for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = LUmat->colIndices[ij];
if (i <= j) {
Ut->rowPointers[j+1]++; // actually colPointers
} else {
Lmat->colIndices[num_blocks_L] = j;
memcpy(Lmat->nnzValues + num_blocks_L * bs * bs, LUmat->nnzValues + ij * bs * bs, sizeof(double) * bs * bs);
num_blocks_L++;
}
}
Lmat->rowPointers[i+1] = num_blocks_L;
}
// prefix sum
int sum = 0;
for (int i = 1; i < Nb+1; i++) {
sum += Ut->rowPointers[i];
Ut->rowPointers[i] = sum;
}
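// e.g. per-column block counts [2, 1, 3] in rowPointers[1..3] become the offsets [0, 2, 3, 6]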
// for every row
for (int i = 0; i < Nb; i++) {
int iRowStart = LUmat->rowPointers[i];
int iRowEnd = LUmat->rowPointers[i + 1];
// for every block in this row
for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = LUmat->colIndices[ij];
if (i <= j){
int idx = Ut->rowPointers[j]++;
Ut->colIndices[idx] = i; // actually rowIndices
memcpy(Ut->nnzValues + idx * bs * bs, LUmat->nnzValues + ij * bs * bs, sizeof(double) * bs * bs);
}
}
}
// rotate right by one: the fill loop above used Ut->rowPointers[j] as a write cursor,
// so each entry now points at the start of column j+1
for (int i = Nb; i > 0; --i) {
Ut->rowPointers[i] = Ut->rowPointers[i-1];
}
Ut->rowPointers[0] = 0;
Opm::Detail::Inverter<bs> inverter;
// Utmp is needed for both the CPU and GPU decomposition: afterwards, U is converted back from BSC and its rows are reversed into Umat via Utmp
// Ltmp is only needed for the CPU decomposition; the GPU path creates its own buffer for Ltmp
double *Utmp = new double[Ut->nnzbs * block_size * block_size];
// actual ILU decomposition
#if CHOW_PATEL_GPU
fgpilu.decomposition(queue, context,
Ut->rowPointers, Ut->colIndices, Ut->nnzValues, Ut->nnzbs,
Lmat->rowPointers, Lmat->colIndices, Lmat->nnzValues, Lmat->nnzbs,
LUmat->rowPointers, LUmat->colIndices, LUmat->nnzValues, LUmat->nnzbs,
Nb, num_sweeps, verbosity);
#else
double *Ltmp = new double[Lmat->nnzbs * block_size * block_size];
for (int sweep = 0; sweep < num_sweeps; ++sweep) {
// for every row
for (int row = 0; row < Nb; row++) {
// update U
int jColStart = Ut->rowPointers[row];
int jColEnd = Ut->rowPointers[row + 1];
// for every block in this row
for (int ij = jColStart; ij < jColEnd; ij++) {
int col = Ut->colIndices[ij];
// refine Uij element (or diagonal)
int i1 = LUmat->rowPointers[col];
int i2 = LUmat->rowPointers[col+1];
int kk = 0;
for(kk = i1; kk < i2; ++kk) {
ptrdiff_t c = LUmat->colIndices[kk];
if (c >= row) {
break;
}
}
double aij[bs*bs];
memcpy(&aij[0], LUmat->nnzValues + kk * bs * bs, sizeof(double) * bs * bs);
int jk = Lmat->rowPointers[col];
int ik = (jk < Lmat->rowPointers[col+1]) ? Lmat->colIndices[jk] : Nb;
for (int k = jColStart; k < ij; ++k) {
int ki = Ut->colIndices[k];
while (ik < ki) {
++jk;
ik = Lmat->colIndices[jk];
}
if (ik == ki) {
blockMultSub<bs>(&aij[0], Lmat->nnzValues + jk * bs * bs, Ut->nnzValues + k * bs * bs);
}
}
memcpy(Utmp + ij * bs * bs, &aij[0], sizeof(double) * bs * bs);
}
// update L
int iRowStart = Lmat->rowPointers[row];
int iRowEnd = Lmat->rowPointers[row + 1];
for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = Lmat->colIndices[ij];
// refine Lij element
int i1 = LUmat->rowPointers[row];
int i2 = LUmat->rowPointers[row+1];
int kk = 0;
for(kk = i1; kk < i2; ++kk) {
ptrdiff_t c = LUmat->colIndices[kk];
if (c >= j) {
break;
}
}
double aij[bs*bs];
memcpy(&aij[0], LUmat->nnzValues + kk * bs * bs, sizeof(double) * bs * bs);
int jk = Ut->rowPointers[j];
int ik = Ut->colIndices[jk];
for (int k = iRowStart; k < ij; ++k) {
int ki = Lmat->colIndices[k];
while(ik < ki) {
++jk;
ik = Ut->colIndices[jk];
}
if(ik == ki) {
blockMultSub<bs>(&aij[0], Lmat->nnzValues + k * bs * bs , Ut->nnzValues + jk * bs * bs);
}
}
// calculate aij / ujj
double ujj[bs*bs];
inverter(Ut->nnzValues + (Ut->rowPointers[j+1] - 1) * bs * bs, &ujj[0]);
// lij = aij / ujj
blockMult<bs>(&aij[0], &ujj[0], Ltmp + ij * bs * bs);
}
}
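// ping-pong the value arrays: the next sweep reads the values this sweep just wrote (Jacobi-style update)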
double *t = Lmat->nnzValues;
Lmat->nnzValues = Ltmp;
Ltmp = t;
t = Ut->nnzValues;
Ut->nnzValues = Utmp;
Utmp = t;
} // end sweep
delete[] Ltmp;
#endif
// convert Ut to BSR
// diagonal stored separately
std::vector<int> ptr(Nb+1, 0);
std::vector<int> col(Ut->rowPointers[Nb]);
// count blocks per row for U (BSR)
// store diagonal in invDiagVals
for(int i = 0; i < Nb; ++i) {
for(int k = Ut->rowPointers[i]; k < Ut->rowPointers[i+1]; ++k) {
int j = Ut->colIndices[k];
if (j == i) {
inverter(Ut->nnzValues + k * bs * bs, invDiagVals + i * bs * bs);
} else {
++ptr[j+1];
}
}
}
// prefix sum
int sumU = 0;
for (int i = 1; i < Nb+1; i++) {
sumU += ptr[i];
ptr[i] = sumU;
}
// actually copy nonzero values for U
for(int i = 0; i < Nb; ++i) {
for(int k = Ut->rowPointers[i]; k < Ut->rowPointers[i+1]; ++k) {
int j = Ut->colIndices[k];
if (j != i) {
int head = ptr[j]++;
col[head] = i;
memcpy(Utmp + head * bs * bs, Ut->nnzValues + k * bs * bs, sizeof(double) * bs * bs);
}
}
}
// the ptr[] were increased in the last loop
std::rotate(ptr.begin(), ptr.end() - 1, ptr.end());
ptr.front() = 0;
// reversing the rows of U, because that is the order they are used in
int URowIndex = 0;
int offsetU = 0; // number of nnz blocks that are already copied to Umat
Umat->rowPointers[0] = 0;
for (int i = LUmat->Nb - 1; i >= 0; i--) {
int rowSize = ptr[i + 1] - ptr[i]; // number of blocks in this row
memcpy(Umat->nnzValues + offsetU * bs * bs, Utmp + ptr[i] * bs * bs, sizeof(double) * bs * bs * rowSize);
memcpy(Umat->colIndices + offsetU, col.data() + ptr[i], sizeof(int) * rowSize);
offsetU += rowSize;
Umat->rowPointers[URowIndex + 1] = offsetU;
URowIndex++;
}
delete[] Utmp;
}
template <unsigned int block_size>
bool BILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
@ -153,6 +404,10 @@ namespace bda
OpmLog::info(out.str());
}
Timer t_decomposition;
#if CHOW_PATEL
chow_patel_decomposition();
#else
int i, j, ij, ik, jk;
int iRowStart, iRowEnd, jRowEnd;
double pivot[bs * bs];
@ -160,8 +415,6 @@ namespace bda
int LSize = 0;
Opm::Detail::Inverter<bs> inverter; // reuse inverter to invert blocks
// go through all rows
for (i = 0; i < LUmat->Nb; i++) {
iRowStart = LUmat->rowPointers[i];
@ -239,6 +492,7 @@ namespace bda
Umat->rowPointers[URowIndex + 1] = offsetU;
URowIndex++;
}
#endif
if (verbosity >= 3) {
std::ostringstream out;
out << "BILU0 decomposition: " << t_decomposition.stop() << " s";
@ -278,6 +532,7 @@ namespace bda
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, (unsigned int)Nb, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
// event.wait();
}
for(int color = numColors-1; color >= 0; --color){
event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, (unsigned int)Nb, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
// event.wait();
@ -320,6 +575,7 @@ namespace bda
template BILU0<n>::BILU0(ILUReorder, int); \
template BILU0<n>::~BILU0(); \
template bool BILU0<n>::init(BlockedMatrix<n>*); \
template void BILU0<n>::chow_patel_decomposition(); \
template bool BILU0<n>::create_preconditioner(BlockedMatrix<n>*); \
template void BILU0<n>::apply(cl::Buffer& x, cl::Buffer& y); \
template void BILU0<n>::setOpenCLContext(cl::Context*); \

opm/simulators/linalg/bda/BILU0.hpp

@ -24,6 +24,7 @@
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
#include <opm/simulators/linalg/bda/opencl.hpp>
#include <opm/simulators/linalg/bda/fgpilu.hpp>
namespace bda
{
@ -67,6 +68,10 @@ namespace bda
int lmem_per_work_group = 0;
bool pattern_uploaded = false;
FGPILU fgpilu;
void chow_patel_decomposition();
public:
BILU0(ILUReorder opencl_ilu_reorder, int verbosity);

opm/simulators/linalg/bda/BdaBridge.cpp

@ -58,6 +58,8 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string gpu_mod
ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING;
} else if (opencl_ilu_reorder == "graph_coloring") {
ilu_reorder = bda::ILUReorder::GRAPH_COLORING;
} else if (opencl_ilu_reorder == "none") {
ilu_reorder = bda::ILUReorder::NONE;
} else {
OPM_THROW(std::logic_error, "Error invalid argument for --opencl-ilu-reorder, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring]'");
}

opm/simulators/linalg/bda/BlockedMatrix.cpp

@ -93,11 +93,23 @@ void blockMult(double *mat1, double *mat2, double *resMat) {
}
}
// subtract c from b and store in a
// a = b - c
template <unsigned int block_size>
void blockSub(double *a, double *b, double *c)
{
for (unsigned int row = 0; row < block_size; row++) {
for (unsigned int col = 0; col < block_size; col++) {
a[block_size * row + col] = b[block_size * row + col] - c[block_size * row + col];
}
}
}
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template void sortBlockedRow<n>(int *, double *, int, int); \
template void blockMultSub<n>(double *, double *, double *); \
template void blockMult<n>(double *, double *, double *); \
template void blockSub<n>(double *, double *, double *); \
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);

opm/simulators/linalg/bda/BlockedMatrix.hpp

@ -109,6 +109,14 @@ void blockMultSub(double *a, double *b, double *c);
template <unsigned int block_size>
void blockMult(double *mat1, double *mat2, double *resMat);
/// Subtract blocks
/// a = b - c
/// \param[out] a result block
/// \param[in] b input block
/// \param[in] c input block
template <unsigned int block_size>
void blockSub(double *a, double *b, double *c);
} // end namespace bda
#endif

opm/simulators/linalg/bda/ILUReorder.hpp

@ -27,7 +27,8 @@ namespace bda
enum class ILUReorder {
LEVEL_SCHEDULING,
GRAPH_COLORING,
NONE
};
}

opm/simulators/linalg/bda/fgpilu.cpp

@ -0,0 +1,570 @@
/*
Copyright 2020 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <dune/common/timer.hh>
#include <opm/simulators/linalg/bda/fgpilu.hpp>
namespace bda
{
using Opm::OpmLog;
// if PARALLEL is 0:
// Each row gets 1 workgroup, 1 workgroup can do multiple rows sequentially.
// Each block in a row gets 1 workitem, all blocks are expected to be processed simultaneously,
// except when the number of blocks in that row exceeds the number of workitems per workgroup.
// In that case some workitems will process multiple blocks sequentially.
// else:
// Each row gets 1 workgroup, 1 workgroup can do multiple rows sequentially
// Each block in a row gets a warp of 32 workitems, of which 9 are always active.
// Multiple blocks can be processed in parallel if a workgroup contains multiple warps.
// If the number of blocks exceeds the number of warps, some warps will process multiple blocks sequentially.
// Notes:
// PARALLEL 0 should be able to run with any number of workitems per workgroup, but 8 and 16 tend to be quicker than 32.
// PARALLEL 1 should be run with at least 32 workitems per workgroup.
// The recommended number of workgroups for both options is Nb, which gives every row their own workgroup.
// PARALLEL 0 is generally faster, even though it does not parallelize the work within a block.
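// As an illustration (the numbers are examples, not requirements): with PARALLEL 0 and
// 16 workitems per workgroup, a row containing 40 blocks is processed by its workgroup
// in ceil(40/16) = 3 rounds of the ij-loop in the kernel below.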
#define PARALLEL 0
#if PARALLEL
inline const char* fgpilu_sweep_s = R"(
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
void blockMultSub(
__local double * restrict a,
__global const double * restrict b,
__global const double * restrict c)
{
const unsigned int block_size = 3;
const unsigned int warp_size = 32;
const unsigned int idx_t = get_local_id(0); // thread id in work group
const unsigned int thread_id_in_warp = idx_t % warp_size; // thread id in warp (32 threads)
if(thread_id_in_warp < block_size * block_size){
const unsigned int row = thread_id_in_warp / block_size;
const unsigned int col = thread_id_in_warp % block_size;
double temp = 0.0;
for (unsigned int k = 0; k < block_size; k++) {
temp += b[block_size * row + k] * c[block_size * k + col];
}
a[block_size * row + col] -= temp;
}
}
void blockMult(
__local const double * restrict mat1,
__local const double * restrict mat2,
__global double * restrict resMat)
{
const unsigned int block_size = 3;
const unsigned int warp_size = 32;
const unsigned int idx_t = get_local_id(0); // thread id in work group
const unsigned int thread_id_in_warp = idx_t % warp_size; // thread id in warp (32 threads)
if(thread_id_in_warp < block_size * block_size){
const unsigned int row = thread_id_in_warp / block_size;
const unsigned int col = thread_id_in_warp % block_size;
double temp = 0.0;
for (unsigned int k = 0; k < block_size; k++) {
temp += mat1[block_size * row + k] * mat2[block_size * k + col];
}
resMat[block_size * row + col] = temp;
}
}
void invert(
__global const double * restrict matrix,
__local double * restrict inverse)
{
const unsigned int block_size = 3;
const unsigned int bs = block_size;
const unsigned int warp_size = 32;
const unsigned int idx_t = get_local_id(0); // thread id in work group
const unsigned int thread_id_in_warp = idx_t % warp_size; // thread id in warp (32 threads)
if(thread_id_in_warp < block_size * block_size){
// code generated by maple, copied from Dune::DenseMatrix
double t4 = matrix[0] * matrix[4];
double t6 = matrix[0] * matrix[5];
double t8 = matrix[1] * matrix[3];
double t10 = matrix[2] * matrix[3];
double t12 = matrix[1] * matrix[6];
double t14 = matrix[2] * matrix[6];
double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] +
t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]);
double t17 = 1.0 / det;
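// each of the bs*bs active workitems computes one entry of the inverse via the cofactor (adjugate / determinant) formula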
const unsigned int r = thread_id_in_warp / block_size;
const unsigned int c = thread_id_in_warp % block_size;
const unsigned int r1 = (r+1) % bs;
const unsigned int c1 = (c+1) % bs;
const unsigned int r2 = (r+bs-1) % bs;
const unsigned int c2 = (c+bs-1) % bs;
inverse[c*bs+r] = ((matrix[r1*bs+c1] * matrix[r2*bs+c2]) - (matrix[r1*bs+c2] * matrix[r2*bs+c1])) * t17;
}
}
__kernel void fgpilu_sweep(
__global const double * restrict Ut_vals,
__global const double * restrict L_vals,
__global const double * restrict LU_vals,
__global const int * restrict Ut_rows,
__global const int * restrict L_rows,
__global const int * restrict LU_rows,
__global const int * restrict Ut_cols,
__global const int * restrict L_cols,
__global const int * restrict LU_cols,
__global double * restrict Ltmp,
__global double * restrict Utmp,
const int Nb,
__local double *aij,
__local double *ujj)
{
const int bs = 3;
// for every row
const unsigned int warp_size = 32;
const unsigned int bsize = get_local_size(0);
const unsigned int idx_b = get_global_id(0) / bsize;
const unsigned int num_groups = get_num_groups(0);
const unsigned int warps_per_group = bsize / warp_size;
const unsigned int idx_t = get_local_id(0); // thread id in work group
const unsigned int thread_id_in_warp = idx_t % warp_size; // thread id in warp (32 threads)
const unsigned int warp_id_in_group = idx_t / warp_size;
const unsigned int lmem_offset = warp_id_in_group * bs * bs; // each workgroup gets some lmem, but the workitems have to share it
for (int row = idx_b; row < Nb; row+=num_groups) {
int jColStart = Ut_rows[row];
int jColEnd = Ut_rows[row + 1];
for (int ij = jColStart + warp_id_in_group; ij < jColEnd; ij+=warps_per_group) {
int col = Ut_cols[ij];
// refine Uij element (or diagonal)
int i1 = LU_rows[col];
int i2 = LU_rows[col+1];
int kk = 0;
for(kk = i1; kk < i2; ++kk) {
int c = LU_cols[kk];
if (c >= row) {
break;
}
}
if(thread_id_in_warp < bs*bs){
aij[lmem_offset+thread_id_in_warp] = LU_vals[kk*bs*bs + thread_id_in_warp];
}
int jk = L_rows[col];
int ik = (jk < L_rows[col+1]) ? L_cols[jk] : Nb;
for (int k = jColStart; k < ij; ++k) {
int ki = Ut_cols[k];
while (ik < ki) {
++jk;
ik = L_cols[jk];
}
if (ik == ki) {
blockMultSub(aij+lmem_offset, L_vals + jk * bs * bs, Ut_vals + k * bs * bs);
}
}
if(thread_id_in_warp < bs*bs){
Utmp[ij*bs*bs + thread_id_in_warp] = aij[lmem_offset + thread_id_in_warp];
}
}
// update L
int iRowStart = L_rows[row];
int iRowEnd = L_rows[row + 1];
for (int ij = iRowStart + warp_id_in_group; ij < iRowEnd; ij+=warps_per_group) {
// for (int ij = iRowStart + idx_t; ij < iRowEnd; ij+=bsize) {
int j = L_cols[ij];
// refine Lij element
int i1 = LU_rows[row];
int i2 = LU_rows[row+1];
int kk = 0;
for(kk = i1; kk < i2; ++kk) {
int c = LU_cols[kk];
if (c >= j) {
break;
}
}
if(thread_id_in_warp < bs*bs){
aij[lmem_offset+thread_id_in_warp] = LU_vals[kk*bs*bs + thread_id_in_warp];
}
int jk = Ut_rows[j];
int ik = Ut_cols[jk];
for (int k = iRowStart; k < ij; ++k) {
int ki = L_cols[k];
while(ik < ki) {
++jk;
ik = Ut_cols[jk];
}
if(ik == ki) {
blockMultSub(aij+lmem_offset, L_vals + k * bs * bs , Ut_vals + jk * bs * bs);
}
}
// calculate aij / ujj
invert(Ut_vals + (Ut_rows[j+1] - 1) * bs * bs, ujj+lmem_offset);
// lij = aij / ujj
blockMult(aij+lmem_offset, ujj+lmem_offset, Ltmp + ij * bs * bs);
}
}
}
)";
#else
inline const char* fgpilu_sweep_s = R"(
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
void blockMultSub(
__local double * restrict a,
__global const double * restrict b,
__global const double * restrict c)
{
const unsigned int block_size = 3;
for (unsigned int row = 0; row < block_size; row++) {
for (unsigned int col = 0; col < block_size; col++) {
double temp = 0.0;
for (unsigned int k = 0; k < block_size; k++) {
temp += b[block_size * row + k] * c[block_size * k + col];
}
a[block_size * row + col] -= temp;
}
}
}
void blockMult(
__local const double * restrict mat1,
__local const double * restrict mat2,
__global double * restrict resMat)
{
const unsigned int block_size = 3;
for (unsigned int row = 0; row < block_size; row++) {
for (unsigned int col = 0; col < block_size; col++) {
double temp = 0.0;
for (unsigned int k = 0; k < block_size; k++) {
temp += mat1[block_size * row + k] * mat2[block_size * k + col];
}
resMat[block_size * row + col] = temp;
}
}
}
void inverter(
__global const double * restrict matrix,
__local double * restrict inverse)
{
// code generated by maple, copied from Dune::DenseMatrix
double t4 = matrix[0] * matrix[4];
double t6 = matrix[0] * matrix[5];
double t8 = matrix[1] * matrix[3];
double t10 = matrix[2] * matrix[3];
double t12 = matrix[1] * matrix[6];
double t14 = matrix[2] * matrix[6];
double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] +
t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]);
double t17 = 1.0 / det;
inverse[0] = (matrix[4] * matrix[8] - matrix[5] * matrix[7]) * t17;
inverse[1] = -(matrix[1] * matrix[8] - matrix[2] * matrix[7]) * t17;
inverse[2] = (matrix[1] * matrix[5] - matrix[2] * matrix[4]) * t17;
inverse[3] = -(matrix[3] * matrix[8] - matrix[5] * matrix[6]) * t17;
inverse[4] = (matrix[0] * matrix[8] - t14) * t17;
inverse[5] = -(t6 - t10) * t17;
inverse[6] = (matrix[3] * matrix[7] - matrix[4] * matrix[6]) * t17;
inverse[7] = -(matrix[0] * matrix[7] - t12) * t17;
inverse[8] = (t4 - t8) * t17;
}
__kernel void fgpilu_sweep(
__global const double * restrict Ut_vals,
__global const double * restrict L_vals,
__global const double * restrict LU_vals,
__global const int * restrict Ut_rows,
__global const int * restrict L_rows,
__global const int * restrict LU_rows,
__global const int * restrict Ut_cols,
__global const int * restrict L_cols,
__global const int * restrict LU_cols,
__global double * restrict Ltmp,
__global double * restrict Utmp,
const int Nb,
__local double *aij,
__local double *ujj)
{
const int bs = 3;
// for every row
const unsigned int bsize = get_local_size(0);
const unsigned int idx_b = get_global_id(0) / bsize;
const unsigned int num_groups = get_num_groups(0);
const unsigned int idx_t = get_local_id(0); // thread id in work group
for (int row = idx_b; row < Nb; row+=num_groups) {
int jColStart = Ut_rows[row];
int jColEnd = Ut_rows[row + 1];
for (int ij = jColStart + idx_t; ij < jColEnd; ij+=bsize) {
int col = Ut_cols[ij];
// refine Uij element (or diagonal)
int i1 = LU_rows[col];
int i2 = LU_rows[col+1];
int kk = 0;
for(kk = i1; kk < i2; ++kk) {
int c = LU_cols[kk];
if (c >= row) {
break;
}
}
for(int z = 0; z < bs*bs; ++z){
aij[idx_t*bs*bs+z] = LU_vals[kk*bs*bs + z];
}
int jk = L_rows[col];
int ik = (jk < L_rows[col+1]) ? L_cols[jk] : Nb;
for (int k = jColStart; k < ij; ++k) {
int ki = Ut_cols[k];
while (ik < ki) {
++jk;
ik = L_cols[jk];
}
if (ik == ki) {
blockMultSub(aij+idx_t*bs*bs, L_vals + jk * bs * bs, Ut_vals + k * bs * bs);
}
}
for(int z = 0; z < bs*bs; ++z){
Utmp[ij*bs*bs + z] = aij[idx_t*bs*bs+z];
}
}
// update L
int iRowStart = L_rows[row];
int iRowEnd = L_rows[row + 1];
for (int ij = iRowStart + idx_t; ij < iRowEnd; ij+=bsize) {
int j = L_cols[ij];
// refine Lij element
int i1 = LU_rows[row];
int i2 = LU_rows[row+1];
int kk = 0;
for(kk = i1; kk < i2; ++kk) {
int c = LU_cols[kk];
if (c >= j) {
break;
}
}
for(int z = 0; z < bs*bs; ++z){
aij[idx_t*bs*bs+z] = LU_vals[kk*bs*bs + z];
}
int jk = Ut_rows[j];
int ik = Ut_cols[jk];
for (int k = iRowStart; k < ij; ++k) {
int ki = L_cols[k];
while(ik < ki) {
++jk;
ik = Ut_cols[jk];
}
if(ik == ki) {
blockMultSub(aij+idx_t*bs*bs, L_vals + k * bs * bs , Ut_vals + jk * bs * bs);
}
}
// calculate aij / ujj
inverter(Ut_vals + (Ut_rows[j+1] - 1) * bs * bs, ujj+idx_t*bs*bs);
// lij = aij / ujj
blockMult(aij+idx_t*bs*bs, ujj+idx_t*bs*bs, Ltmp + ij * bs * bs);
}
}
}
)";
#endif
void FGPILU::decomposition(
cl::CommandQueue *queue, cl::Context *context,
int *Ut_ptrs, int *Ut_idxs, double *Ut_vals, int Ut_nnzbs,
int *L_rows, int *L_cols, double *L_vals, int L_nnzbs,
int *LU_rows, int *LU_cols, double *LU_vals, int LU_nnzbs,
int Nb, int num_sweeps, int verbosity)
{
const int block_size = 3;
try {
if (initialized == false) {
cl::Program::Sources source(1, std::make_pair(fgpilu_sweep_s, strlen(fgpilu_sweep_s))); // cl::Program::Sources is a std::vector<std::pair<const char*, size_t> >; the '1' is the number of (source, length) pairs
cl::Program program = cl::Program(*context, source, &err);
std::vector<cl::Device> devices = context->getInfo<CL_CONTEXT_DEVICES>();
program.build(devices);
fgpilu_sweep_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&,
const int, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "fgpilu_sweep", &err)));
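// cl::make_kernel wraps the compiled kernel in a type-safe functor; calling it with cl::EnqueueArgs enqueues the kernel on the given queue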
// allocate GPU memory
d_Ut_vals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * Ut_nnzbs * block_size * block_size);
d_L_vals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * L_nnzbs * block_size * block_size);
d_LU_vals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * LU_nnzbs * block_size * block_size);
d_Ut_ptrs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb+1));
d_L_rows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb+1));
d_LU_rows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb+1));
d_Ut_idxs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Ut_nnzbs);
d_L_cols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * L_nnzbs);
d_LU_cols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LU_nnzbs);
d_Ltmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * L_nnzbs * block_size * block_size);
d_Utmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * Ut_nnzbs * block_size * block_size);
Dune::Timer t_copy_pattern;
err |= queue->enqueueWriteBuffer(d_Ut_ptrs, CL_TRUE, 0, sizeof(int) * (Nb+1), Ut_ptrs);
err |= queue->enqueueWriteBuffer(d_L_rows, CL_TRUE, 0, sizeof(int) * (Nb+1), L_rows);
err |= queue->enqueueWriteBuffer(d_LU_rows, CL_TRUE, 0, sizeof(int) * (Nb+1), LU_rows);
err |= queue->enqueueWriteBuffer(d_Ut_idxs, CL_TRUE, 0, sizeof(int) * Ut_nnzbs, Ut_idxs);
err |= queue->enqueueWriteBuffer(d_L_cols, CL_TRUE, 0, sizeof(int) * L_nnzbs, L_cols);
err |= queue->enqueueWriteBuffer(d_LU_cols, CL_TRUE, 0, sizeof(int) * LU_nnzbs, LU_cols);
if (verbosity >= 3){
std::ostringstream out;
out << "FGPILU copy sparsity pattern time: " << t_copy_pattern.stop() << " s";
OpmLog::info(out.str());
}
if (verbosity >= 2){
std::ostringstream out;
out << "FGPILU PARALLEL: " << PARALLEL;
OpmLog::info(out.str());
}
initialized = true;
}
// copy to GPU
Dune::Timer t_copy1;
err = queue->enqueueWriteBuffer(d_Ut_vals, CL_TRUE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals);
err |= queue->enqueueWriteBuffer(d_L_vals, CL_TRUE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals);
err |= queue->enqueueWriteBuffer(d_LU_vals, CL_TRUE, 0, sizeof(double) * LU_nnzbs * block_size * block_size, LU_vals);
if (verbosity >= 3){
std::ostringstream out;
out << "FGPILU copy1 time: " << t_copy1.stop() << " s";
OpmLog::info(out.str());
}
if (err != CL_SUCCESS) {
// enqueueWriteBuffer reports errors through a CL error code rather than throwing like the other C++ OpenCL calls
OPM_THROW(std::logic_error, "FGPILU OpenCL enqueueWriteBuffer error");
}
// call kernel
for (int sweep = 0; sweep < num_sweeps; ++sweep) {
// normally, L_vals and Ltmp are swapped after the sweep is done
// these conditionals implement that without actually swapping pointers
// 1st sweep reads X_vals, writes to Xtmp
// 2nd sweep reads Xtmp, writes to X_vals
auto *Larg1 = (sweep % 2 == 0) ? &d_L_vals : &d_Ltmp;
auto *Larg2 = (sweep % 2 == 0) ? &d_Ltmp : &d_L_vals;
auto *Uarg1 = (sweep % 2 == 0) ? &d_Ut_vals : &d_Utmp;
auto *Uarg2 = (sweep % 2 == 0) ? &d_Utmp : &d_Ut_vals;
int num_work_groups = Nb;
#if PARALLEL
int work_group_size = 32;
#else
int work_group_size = 16;
#endif
int total_work_items = num_work_groups * work_group_size;
int lmem_per_work_group = work_group_size * block_size * block_size * sizeof(double);
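// local memory scratch: each workitem (PARALLEL 0) or warp (PARALLEL 1) gets its own bs*bs block;
// the same size is allocated for both the aij and ujj kernel arguments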
Dune::Timer t_kernel;
event = (*fgpilu_sweep_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
*Uarg1, *Larg1, d_LU_vals,
d_Ut_ptrs, d_L_rows, d_LU_rows,
d_Ut_idxs, d_L_cols, d_LU_cols,
*Larg2, *Uarg2, Nb, cl::Local(lmem_per_work_group), cl::Local(lmem_per_work_group));
event.wait();
if (verbosity >= 3){
std::ostringstream out;
out << "FGPILU sweep kernel time: " << t_copy1.stop() << " s";
OpmLog::info(out.str());
}
}
// copy back
Dune::Timer t_copy2;
if (num_sweeps % 2 == 0) {
err = queue->enqueueReadBuffer(d_Ut_vals, CL_TRUE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals);
err |= queue->enqueueReadBuffer(d_L_vals, CL_TRUE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals);
} else {
err = queue->enqueueReadBuffer(d_Utmp, CL_TRUE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals);
err |= queue->enqueueReadBuffer(d_Ltmp, CL_TRUE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals);
}
err |= queue->enqueueBarrierWithWaitList();
if (verbosity >= 3){
std::ostringstream out;
out << "FGPILU copy2 time: " << t_copy2.stop() << " s";
OpmLog::info(out.str());
}
if (err != CL_SUCCESS) {
// enqueueReadBuffer reports errors through a CL error code rather than throwing like the other C++ OpenCL calls
OPM_THROW(std::logic_error, "FGPILU OpenCL enqueueReadBuffer error");
}
} catch (const cl::Error& error) {
std::ostringstream oss;
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
oss << getErrorString(error.err()) << std::endl;
// rethrow exception
OPM_THROW(std::logic_error, oss.str());
} catch (const std::logic_error&) {
// rethrow exceptions raised by OPM_THROW inside the try block
throw;
}
}
} // end namespace bda

opm/simulators/linalg/bda/fgpilu.hpp

@ -0,0 +1,86 @@
/*
Copyright 2020 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef FGPILU_HEADER_INCLUDED
#define FGPILU_HEADER_INCLUDED
#include <opm/simulators/linalg/bda/opencl.hpp>
namespace bda
{
// This class implements a blocked version on GPU of the Fine-Grained Parallel ILU (FGPILU) by Chow and Patel 2015:
// FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896
// only blocksize == 3 is supported
// decomposition() allocates the cl::Buffers on the first call; these are C++ objects that deallocate automatically
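// Typical use (mirroring the call in BILU0<block_size>::chow_patel_decomposition() from this commit):
//   FGPILU fgpilu;
//   fgpilu.decomposition(queue, context,
//                        Ut->rowPointers, Ut->colIndices, Ut->nnzValues, Ut->nnzbs,
//                        Lmat->rowPointers, Lmat->colIndices, Lmat->nnzValues, Lmat->nnzbs,
//                        LUmat->rowPointers, LUmat->colIndices, LUmat->nnzValues, LUmat->nnzbs,
//                        Nb, num_sweeps, verbosity);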
class FGPILU
{
private:
cl::Buffer d_Ut_vals, d_L_vals, d_LU_vals;
cl::Buffer d_Ut_ptrs, d_Ut_idxs;
cl::Buffer d_L_rows, d_L_cols;
cl::Buffer d_LU_rows, d_LU_cols;
cl::Buffer d_Ltmp, d_Utmp;
cl::Event event;
cl_int err;
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&,
const int, cl::LocalSpaceArg, cl::LocalSpaceArg> > fgpilu_sweep_k;
bool initialized = false;
public:
/// Executes the FGPILU sweeps
/// also copies data from CPU to GPU and GPU to CPU
/// \param[in] queue OpenCL commandqueue
/// \param[in] context OpenCL context
/// \param[in] Ut_ptrs BSC columnpointers
/// \param[in] Ut_idxs BSC rowindices
/// \param[inout] Ut_vals actual nonzeros for U
/// \param[in] Ut_nnzbs number of blocks in U
/// \param[in] L_rows BSR rowpointers
/// \param[in] L_cols BSR columnindices
/// \param[inout] L_vals actual nonzeros for L
/// \param[in] L_nnzbs number of blocks in L
/// \param[in] LU_rows BSR rowpointers
/// \param[in] LU_cols BSR columnindices
/// \param[in] LU_vals actual nonzeros for LU (original matrix)
/// \param[in] LU_nnzbs number of blocks in LU
/// \param[in] Nb number of blockrows
/// \param[in] num_sweeps number of sweeps to be done
/// \param[in] verbosity print verbosity
void decomposition(
cl::CommandQueue *queue, cl::Context *context,
int *Ut_ptrs, int *Ut_idxs, double *Ut_vals, int Ut_nnzbs,
int *L_rows, int *L_cols, double *L_vals, int L_nnzbs,
int *LU_rows, int *LU_cols, double *LU_vals, int LU_nnzbs,
int Nb, int num_sweeps, int verbosity);
};
} // end namespace bda
#endif // FGPILU_HEADER_INCLUDED

opm/simulators/linalg/bda/opencl.cpp

@ -88,6 +88,9 @@ std::string getErrorString(cl_int error)
case -67: return "CL_INVALID_LINKER_OPTIONS";
case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT";
// vendor specific errors
case -9999: return "NVIDIA_OPENCL_ILLEGAL_READ_OR_WRITE_TO_BUFFER";
default: return "UNKNOWN_CL_CODE";
}
}