From 9f92a690377cfff031e506d2fe154738dd8ffa13 Mon Sep 17 00:00:00 2001
From: tqiu <tqiu@bda-04.domain_not_set.invalid>
Date: Thu, 17 Dec 2020 14:49:59 +0100
Subject: [PATCH 01/14] Added CPU and GPU implementations of Fine-Grained
 Parallel ILU (FGPILU)

---
 CMakeLists_files.cmake                      |   2 +
 opm/simulators/linalg/bda/BILU0.cpp         | 262 ++++++++-
 opm/simulators/linalg/bda/BILU0.hpp         |   5 +
 opm/simulators/linalg/bda/BdaBridge.cpp     |   2 +
 opm/simulators/linalg/bda/BlockedMatrix.cpp |  12 +
 opm/simulators/linalg/bda/BlockedMatrix.hpp |   8 +
 opm/simulators/linalg/bda/ILUReorder.hpp    |   3 +-
 opm/simulators/linalg/bda/fgpilu.cpp        | 570 ++++++++++++++++++++
 opm/simulators/linalg/bda/fgpilu.hpp        |  86 +++
 opm/simulators/linalg/bda/opencl.cpp        |   3 +
 10 files changed, 949 insertions(+), 4 deletions(-)
 create mode 100644 opm/simulators/linalg/bda/fgpilu.cpp
 create mode 100644 opm/simulators/linalg/bda/fgpilu.hpp

diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake
index 70cc1c3df..70cb66a2a 100644
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@@ -58,6 +58,7 @@ if(OPENCL_FOUND)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BlockedMatrix.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BILU0.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Reorder.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/fgpilu.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
@@ -168,6 +169,7 @@ list (APPEND PUBLIC_HEADER_FILES
   opm/simulators/linalg/bda/BlockedMatrix.hpp
   opm/simulators/linalg/bda/cuda_header.hpp
   opm/simulators/linalg/bda/cusparseSolverBackend.hpp
+  opm/simulators/linalg/bda/fgpilu.hpp
   opm/simulators/linalg/bda/Reorder.hpp
   opm/simulators/linalg/bda/ILUReorder.hpp
   opm/simulators/linalg/bda/opencl.hpp
diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp
index 24905520d..dedb7d222 100644
--- a/opm/simulators/linalg/bda/BILU0.cpp
+++ b/opm/simulators/linalg/bda/BILU0.cpp
@@ -25,12 +25,21 @@
 
 #include <opm/simulators/linalg/bda/BdaSolver.hpp>
 #include <opm/simulators/linalg/bda/BILU0.hpp>
+#include <opm/simulators/linalg/bda/fgpilu.hpp>
 #include <opm/simulators/linalg/bda/Reorder.hpp>
 
 
 namespace bda
 {
 
+// if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU
+// if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in:
+//    FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896
+// if CHOW_PATEL_GPU is 0, the decomposition is done on CPU
+// if CHOW_PATEL_GPU is 1, the decomposition is done by bda::FGPILU::decomposition() on GPU
+#define CHOW_PATEL     1
+#define CHOW_PATEL_GPU 1
+
     using Opm::OpmLog;
     using Dune::Timer;
 
@@ -82,11 +91,26 @@ namespace bda
         } else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
             out << "BILU0 reordering strategy: " << "graph_coloring\n";
             findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder, fromOrder, rowsPerColor);
+        } else if (opencl_ilu_reorder == ILUReorder::NONE) {
+            out << "BILU0 reordering strategy: none\n";
+            numColors = 1;
+            rowsPerColor.emplace_back(Nb);
+            // numColors = Nb;
+            // for(int i = 0; i < Nb; ++i){
+            //     rowsPerColor.emplace_back(1);
+            // }
+            for(int i = 0; i < Nb; ++i){
+                toOrder[i] = i;
+                fromOrder[i] = i;
+            }
         } else {
             OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
         }
         if(verbosity >= 3){
-            out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors";
+            out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n";
+        }
+        if(verbosity >= 2){
+            out << "BILU0 CHOW_PATEL: " << CHOW_PATEL << ", CHOW_PATEL_GPU: " << CHOW_PATEL_GPU;
         }
         OpmLog::info(out.str());
 
@@ -129,6 +153,233 @@ namespace bda
         return true;
     } // end init()
 
+    // implements Fine-Grained Parallel ILU algorithm (FGPILU), Chow and Patel 2015
+    template <unsigned int block_size>
+    void BILU0<block_size>::chow_patel_decomposition()
+    {
+        const unsigned int bs = block_size;
+        int num_sweeps = 6;
+
+        // split matrix into L and U
+        // also convert U into BSC format (Ut)
+        // Ut stores diagonal for now
+        int num_blocks_L = 0;
+
+        // Ut is actually BSC format
+        std::unique_ptr<BlockedMatrix<bs> > Ut = std::make_unique<BlockedMatrix<bs> >(Umat->Nb, Umat->nnzbs + Umat->Nb);
+
+        Lmat->rowPointers[0] = 0;
+        for (int i = 0; i < Nb+1; i++) {
+            Ut->rowPointers[i] = 0;
+        }
+
+        // for every row
+        for (int i = 0; i < Nb; i++) {
+            int iRowStart = LUmat->rowPointers[i];
+            int iRowEnd = LUmat->rowPointers[i + 1];
+            // for every block in this row
+            for (int ij = iRowStart; ij < iRowEnd; ij++) {
+                int j = LUmat->colIndices[ij];
+                if (i <= j) {
+                    Ut->rowPointers[j+1]++;   // actually colPointers
+                } else {
+                    Lmat->colIndices[num_blocks_L] = j;
+                    memcpy(Lmat->nnzValues + num_blocks_L * bs * bs, LUmat->nnzValues + ij * bs * bs, sizeof(double) * bs * bs);
+                    num_blocks_L++;
+                }
+            }
+            Lmat->rowPointers[i+1] = num_blocks_L;
+        }
+
+        // prefix sum
+        int sum = 0;
+        for (int i = 1; i < Nb+1; i++) {
+            sum += Ut->rowPointers[i];
+            Ut->rowPointers[i] = sum;
+        }
+
+        // for every row
+        for (int i = 0; i < Nb; i++) {
+            int iRowStart = LUmat->rowPointers[i];
+            int iRowEnd = LUmat->rowPointers[i + 1];
+            // for every block in this row
+            for (int ij = iRowStart; ij < iRowEnd; ij++) {
+                int j = LUmat->colIndices[ij];
+                if (i <= j){
+                    int idx = Ut->rowPointers[j]++;
+                    Ut->colIndices[idx] = i;     // actually rowIndices
+                    memcpy(Ut->nnzValues + idx * bs * bs, LUmat->nnzValues + ij * bs * bs, sizeof(double) * bs * bs);
+                }
+            }
+        }
+
+        // rotate
+        // the Ut->rowPointers were increased in the last loop
+        for (int i = Nb; i > 0; --i) {
+            Ut->rowPointers[i] = Ut->rowPointers[i-1];
+        }
+        Ut->rowPointers[0] = 0;
+
+        Opm::Detail::Inverter<bs> inverter;
+
+        // Utmp is needed for CPU and GPU decomposition, because U is transposed, and reversed at the end
+        // Ltmp is only needed for CPU decomposition, GPU creates GPU buffer for Ltmp
+        double *Utmp = new double[Ut->nnzbs * block_size * block_size];
+
+        // actual ILU decomposition
+#if CHOW_PATEL_GPU
+        fgpilu.decomposition(queue, context,
+                    Ut->rowPointers, Ut->colIndices, Ut->nnzValues, Ut->nnzbs,
+                    Lmat->rowPointers, Lmat->colIndices, Lmat->nnzValues, Lmat->nnzbs,
+                    LUmat->rowPointers, LUmat->colIndices, LUmat->nnzValues, LUmat->nnzbs,
+                    Nb, num_sweeps, verbosity);
+#else
+        double *Ltmp = new double[Lmat->nnzbs * block_size * block_size];
+        for (int sweep = 0; sweep < num_sweeps; ++sweep) {
+
+            // for every row
+            for (int row = 0; row < Nb; row++) {
+                // update U
+                int jColStart = Ut->rowPointers[row];
+                int jColEnd = Ut->rowPointers[row + 1];
+                // for every block in this row
+                for (int ij = jColStart; ij < jColEnd; ij++) {
+                    int col = Ut->colIndices[ij];
+                    // refine Uij element (or diagonal)
+                    int i1 = LUmat->rowPointers[col];
+                    int i2 = LUmat->rowPointers[col+1];
+                    int kk = 0;
+                    for(kk = i1; kk < i2; ++kk) {
+                        ptrdiff_t c = LUmat->colIndices[kk];
+                        if (c >= row) {
+                            break;
+                        }
+                    }
+                    double aij[bs*bs];
+                    memcpy(&aij[0], LUmat->nnzValues + kk * bs * bs, sizeof(double) * bs * bs);
+                    int jk = Lmat->rowPointers[col];
+                    int ik = (jk < Lmat->rowPointers[col+1]) ? Lmat->colIndices[jk] : Nb;
+
+                    for (int k = jColStart; k < ij; ++k) {
+                        int ki = Ut->colIndices[k];
+                        while (ik < ki) {
+                            ++jk;
+                            ik = Lmat->colIndices[jk];
+                        }
+                        if (ik == ki) {
+                            blockMultSub<bs>(&aij[0], Lmat->nnzValues + jk * bs * bs, Ut->nnzValues + k * bs * bs);
+                        }
+                    }
+
+                    memcpy(Utmp + ij * bs * bs, &aij[0], sizeof(double) * bs * bs);
+                }
+
+                // update L
+                int iRowStart = Lmat->rowPointers[row];
+                int iRowEnd = Lmat->rowPointers[row + 1];
+
+                for (int ij = iRowStart; ij < iRowEnd; ij++) {
+                    int j = Lmat->colIndices[ij];
+                    // refine Lij element
+                    int i1 = LUmat->rowPointers[row];
+                    int i2 = LUmat->rowPointers[row+1];
+                    int kk = 0;
+                    for(kk = i1; kk < i2; ++kk) {
+                        ptrdiff_t c = LUmat->colIndices[kk];
+                        if (c >= j) {
+                            break;
+                        }
+                    }
+                    double aij[bs*bs];
+                    memcpy(&aij[0], LUmat->nnzValues + kk * bs * bs, sizeof(double) * bs * bs);
+                    int jk = Ut->rowPointers[j];
+                    int ik = Ut->colIndices[jk];
+                    for (int k = iRowStart; k < ij; ++k) {
+                        int ki = Lmat->colIndices[k];
+                        while(ik < ki) {
+                            ++jk;
+                            ik = Ut->colIndices[jk];
+                        }
+
+                        if(ik == ki) {
+                            blockMultSub<bs>(&aij[0], Lmat->nnzValues + k * bs * bs , Ut->nnzValues + jk * bs * bs);
+                        }
+                    }
+                    // calculate aij / ujj
+                    double ujj[bs*bs];
+                    inverter(Ut->nnzValues + (Ut->rowPointers[j+1] - 1) * bs * bs, &ujj[0]);
+                    // lij = aij / ujj
+                    blockMult<bs>(&aij[0], &ujj[0], Ltmp + ij * bs * bs);
+
+                }
+            }
+            double *t = Lmat->nnzValues;
+            Lmat->nnzValues = Ltmp;
+            Ltmp = t;
+            t = Ut->nnzValues;
+            Ut->nnzValues = Utmp;
+            Utmp = t;
+        } // end sweep
+        delete[] Ltmp;
+#endif
+
+        // convert Ut to BSR
+        // diagonal stored separately
+        std::vector<int> ptr(Nb+1, 0);
+        std::vector<int> col(Ut->rowPointers[Nb]);
+
+        // count blocks per row for U (BSR)
+        // store diagonal in invDiagVals
+        for(int i = 0; i < Nb; ++i) {
+            for(int k = Ut->rowPointers[i]; k < Ut->rowPointers[i+1]; ++k) {
+                int j = Ut->colIndices[k];
+                if (j == i) {
+                    inverter(Ut->nnzValues + k * bs * bs, invDiagVals + i * bs * bs);
+                } else {
+                    ++ptr[j+1];
+                }
+            }
+        }
+
+        // prefix sum
+        int sumU = 0;
+        for (int i = 1; i < Nb+1; i++) {
+            sumU += ptr[i];
+            ptr[i] = sumU;
+        }
+
+        // actually copy nonzero values for U
+        for(int i = 0; i < Nb; ++i) {
+            for(int k = Ut->rowPointers[i]; k < Ut->rowPointers[i+1]; ++k) {
+                int j = Ut->colIndices[k];
+                if (j != i) {
+                    int head = ptr[j]++;
+                    col[head]  = i;
+                    memcpy(Utmp + head * bs * bs, Ut->nnzValues + k * bs * bs, sizeof(double) * bs * bs);
+                }
+            }
+        }
+
+        // the ptr[] were increased in the last loop
+        std::rotate(ptr.begin(), ptr.end() - 1, ptr.end());
+        ptr.front() = 0;
+
+        // reversing the rows of U, because that is the order they are used in
+        int URowIndex = 0;
+        int offsetU = 0;   // number of nnz blocks that are already copied to Umat
+        Umat->rowPointers[0] = 0;
+        for (int i = LUmat->Nb - 1; i >= 0; i--) {
+            int rowSize = ptr[i + 1] - ptr[i];   // number of blocks in this row
+            memcpy(Umat->nnzValues + offsetU * bs * bs, Utmp + ptr[i] * bs * bs, sizeof(double) * bs * bs * rowSize);
+            memcpy(Umat->colIndices + offsetU, col.data() + ptr[i], sizeof(int) * rowSize);
+            offsetU += rowSize;
+            Umat->rowPointers[URowIndex + 1] = offsetU;
+            URowIndex++;
+        }
+
+        delete[] Utmp;
+
+    }
 
     template <unsigned int block_size>
     bool BILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
@@ -153,6 +404,10 @@ namespace bda
             OpmLog::info(out.str());
         }
 
+        Timer t_decomposition;
+#if CHOW_PATEL
+        chow_patel_decomposition();
+#else
         int i, j, ij, ik, jk;
         int iRowStart, iRowEnd, jRowEnd;
         double pivot[bs * bs];
@@ -160,8 +415,6 @@ namespace bda
         int LSize = 0;
         Opm::Detail::Inverter<bs> inverter;   // reuse inverter to invert blocks
 
-        Timer t_decomposition;
-
         // go through all rows
         for (i = 0; i < LUmat->Nb; i++) {
             iRowStart = LUmat->rowPointers[i];
@@ -239,6 +492,7 @@ namespace bda
             Umat->rowPointers[URowIndex + 1] = offsetU;
             URowIndex++;
         }
+#endif
         if (verbosity >= 3) {
             std::ostringstream out;
             out << "BILU0 decomposition: " << t_decomposition.stop() << " s";
@@ -278,6 +532,7 @@ namespace bda
             event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, (unsigned int)Nb, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
             // event.wait();
         }
+
         for(int color = numColors-1; color >= 0; --color){
             event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, (unsigned int)Nb, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
             // event.wait();
@@ -320,6 +575,7 @@ namespace bda
 template BILU0<n>::BILU0(ILUReorder, int);                                               \
 template BILU0<n>::~BILU0();                                                             \
 template bool BILU0<n>::init(BlockedMatrix<n>*);                                         \
+template void BILU0<n>::chow_patel_decomposition();                                      \
 template bool BILU0<n>::create_preconditioner(BlockedMatrix<n>*);                        \
 template void BILU0<n>::apply(cl::Buffer& x, cl::Buffer& y);                             \
 template void BILU0<n>::setOpenCLContext(cl::Context*);                                  \
diff --git a/opm/simulators/linalg/bda/BILU0.hpp b/opm/simulators/linalg/bda/BILU0.hpp
index bad6201ca..5a31c57a9 100644
--- a/opm/simulators/linalg/bda/BILU0.hpp
+++ b/opm/simulators/linalg/bda/BILU0.hpp
@@ -24,6 +24,7 @@
 #include <opm/simulators/linalg/bda/ILUReorder.hpp>
 
 #include <opm/simulators/linalg/bda/opencl.hpp>
+#include <opm/simulators/linalg/bda/fgpilu.hpp>
 
 namespace bda
 {
@@ -67,6 +68,10 @@ namespace bda
         int lmem_per_work_group = 0;
         bool pattern_uploaded = false;
 
+        FGPILU fgpilu;
+
+        void chow_patel_decomposition();
+
     public:
 
         BILU0(ILUReorder opencl_ilu_reorder, int verbosity);
diff --git a/opm/simulators/linalg/bda/BdaBridge.cpp b/opm/simulators/linalg/bda/BdaBridge.cpp
index 71e797450..39d403cb7 100644
--- a/opm/simulators/linalg/bda/BdaBridge.cpp
+++ b/opm/simulators/linalg/bda/BdaBridge.cpp
@@ -58,6 +58,8 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string gpu_mod
             ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING;
         } else if (opencl_ilu_reorder == "graph_coloring") {
             ilu_reorder = bda::ILUReorder::GRAPH_COLORING;
+        } else if (opencl_ilu_reorder == "none") {
+            ilu_reorder = bda::ILUReorder::NONE;
         } else {
             OPM_THROW(std::logic_error, "Error invalid argument for --opencl-ilu-reorder, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring]'");
         }
diff --git a/opm/simulators/linalg/bda/BlockedMatrix.cpp b/opm/simulators/linalg/bda/BlockedMatrix.cpp
index f4ffc0706..050a17a95 100644
--- a/opm/simulators/linalg/bda/BlockedMatrix.cpp
+++ b/opm/simulators/linalg/bda/BlockedMatrix.cpp
@@ -93,11 +93,23 @@ void blockMult(double *mat1, double *mat2, double *resMat) {
     }
 }
 
+// subtract c from b and store in a
+// a = b - c
+template <unsigned int block_size>
+void blockSub(double *a, double *b, double *c)
+{
+    for (unsigned int row = 0; row < block_size; row++) {
+        for (unsigned int col = 0; col < block_size; col++) {
+            a[block_size * row + col] = b[block_size * row + col] - c[block_size * row + col];
+        }
+    }
+}
 
 #define INSTANTIATE_BDA_FUNCTIONS(n)                                 \
 template void sortBlockedRow<n>(int *, double *, int, int);          \
 template void blockMultSub<n>(double *, double *, double *);         \
 template void blockMult<n>(double *, double *, double *);            \
+template void blockSub<n>(double *, double *, double *);             \
 
 INSTANTIATE_BDA_FUNCTIONS(1);
 INSTANTIATE_BDA_FUNCTIONS(2);
diff --git a/opm/simulators/linalg/bda/BlockedMatrix.hpp b/opm/simulators/linalg/bda/BlockedMatrix.hpp
index 39a573bb5..80488a8dc 100644
--- a/opm/simulators/linalg/bda/BlockedMatrix.hpp
+++ b/opm/simulators/linalg/bda/BlockedMatrix.hpp
@@ -109,6 +109,14 @@ void blockMultSub(double *a, double *b, double *c);
 template <unsigned int block_size>
 void blockMult(double *mat1, double *mat2, double *resMat);
 
+/// Subtract blocks
+/// a = b - c
+/// \param[out] a                result block
+/// \param[in] b                 input block
+/// \param[in] c                 input block
+template <unsigned int block_size>
+void blockSub(double *a, double *b, double *c);
+
 } // end namespace bda
 
 #endif
diff --git a/opm/simulators/linalg/bda/ILUReorder.hpp b/opm/simulators/linalg/bda/ILUReorder.hpp
index 83039cf6d..b7ea06ce2 100644
--- a/opm/simulators/linalg/bda/ILUReorder.hpp
+++ b/opm/simulators/linalg/bda/ILUReorder.hpp
@@ -27,7 +27,8 @@ namespace bda
         
     enum class ILUReorder {
         LEVEL_SCHEDULING,
-        GRAPH_COLORING
+        GRAPH_COLORING,
+        NONE
     };
 
 }
diff --git a/opm/simulators/linalg/bda/fgpilu.cpp b/opm/simulators/linalg/bda/fgpilu.cpp
new file mode 100644
index 000000000..508fcafe0
--- /dev/null
+++ b/opm/simulators/linalg/bda/fgpilu.cpp
@@ -0,0 +1,570 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+#include <dune/common/timer.hh>
+
+#include <opm/simulators/linalg/bda/fgpilu.hpp>
+
+namespace bda
+{
+
+    using Opm::OpmLog;
+
+// if PARALLEL is 0:
+//    Each row gets 1 workgroup, 1 workgroup can do multiple rows sequentially.
+//    Each block in a row gets 1 workitem, all blocks are expected to be processed simultaneously,
+//    except when the number of blocks in that row exceeds the number of workitems per workgroup.
+//    In that case some workitems will process multiple blocks sequentially.
+// else:
+//    Each row gets 1 workgroup, 1 workgroup can do multiple rows sequentially
+//    Each block in a row gets a warp of 32 workitems, of which 9 are always active.
+//    Multiple blocks can be processed in parallel if a workgroup contains multiple warps.
+//    If the number of blocks exceeds the number of warps, some warps will process multiple blocks sequentially.
+
+// Notes:
+// PARALLEL 0 should be able to run with any number of workitems per workgroup, but 8 and 16 tend to be quicker than 32.
+// PARALLEL 1 should be run with at least 32 workitems per workgroup.
+// The recommended number of workgroups for both options is Nb, which gives every row their own workgroup.
+// PARALLEL 0 is generally faster, despite not having parallelization.
+
+#define PARALLEL 0
+
+#if PARALLEL
+
+inline const char* fgpilu_sweep_s  = R"(
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+void blockMultSub(
+    __local double * restrict a,
+    __global const double * restrict b,
+    __global const double * restrict c)
+{
+    const unsigned int block_size = 3;
+    const unsigned int warp_size = 32;
+    const unsigned int idx_t = get_local_id(0);                   // thread id in work group
+    const unsigned int thread_id_in_warp = idx_t % warp_size;     // thread id in warp (32 threads)
+    if(thread_id_in_warp < block_size * block_size){
+        const unsigned int row = thread_id_in_warp / block_size;
+        const unsigned int col = thread_id_in_warp % block_size;
+        double temp = 0.0;
+        for (unsigned int k = 0; k < block_size; k++) {
+            temp += b[block_size * row + k] * c[block_size * k + col];
+        }
+        a[block_size * row + col] -= temp;
+    }
+}
+
+
+void blockMult(
+    __local const double * restrict mat1,
+    __local const double * restrict mat2,
+    __global double * restrict resMat)
+{
+    const unsigned int block_size = 3;
+    const unsigned int warp_size = 32;
+    const unsigned int idx_t = get_local_id(0);                   // thread id in work group
+    const unsigned int thread_id_in_warp = idx_t % warp_size;     // thread id in warp (32 threads)
+    if(thread_id_in_warp < block_size * block_size){
+        const unsigned int row = thread_id_in_warp / block_size;
+        const unsigned int col = thread_id_in_warp % block_size;
+        double temp = 0.0;
+        for (unsigned int k = 0; k < block_size; k++) {
+            temp += mat1[block_size * row + k] * mat2[block_size * k + col];
+        }
+        resMat[block_size * row + col] = temp;
+    }
+}
+
+
+void invert(
+    __global const double * restrict matrix,
+    __local double * restrict inverse)
+{
+    const unsigned int block_size = 3;
+    const unsigned int bs = block_size;
+    const unsigned int warp_size = 32;
+    const unsigned int idx_t = get_local_id(0);                   // thread id in work group
+    const unsigned int thread_id_in_warp = idx_t % warp_size;     // thread id in warp (32 threads)
+    if(thread_id_in_warp < block_size * block_size){
+        // code generated by maple, copied from Dune::DenseMatrix
+        double t4  = matrix[0] * matrix[4];
+        double t6  = matrix[0] * matrix[5];
+        double t8  = matrix[1] * matrix[3];
+        double t10 = matrix[2] * matrix[3];
+        double t12 = matrix[1] * matrix[6];
+        double t14 = matrix[2] * matrix[6];
+
+        double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] +
+                      t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]);
+        double t17 = 1.0 / det;
+
+        const unsigned int r = thread_id_in_warp / block_size;
+        const unsigned int c = thread_id_in_warp % block_size;
+        const unsigned int r1 = (r+1) % bs;
+        const unsigned int c1 = (c+1) % bs;
+        const unsigned int r2 = (r+bs-1) % bs;
+        const unsigned int c2 = (c+bs-1) % bs;
+        inverse[c*bs+r] = ((matrix[r1*bs+c1] * matrix[r2*bs+c2]) - (matrix[r1*bs+c2] * matrix[r2*bs+c1])) * t17;
+    }
+}
+
+__kernel void fgpilu_sweep(
+    __global const double * restrict Ut_vals,
+    __global const double * restrict L_vals,
+    __global const double * restrict LU_vals,
+    __global const int * restrict Ut_rows,
+    __global const int * restrict L_rows,
+    __global const int * restrict LU_rows,
+    __global const int * restrict Ut_cols,
+    __global const int * restrict L_cols,
+    __global const int * restrict LU_cols,
+    __global double * restrict Ltmp,
+    __global double * restrict Utmp,
+    const int Nb,
+    __local double *aij,
+    __local double *ujj)
+{
+    const int bs = 3;
+
+    // for every row
+    const unsigned int warp_size = 32;
+    const unsigned int bsize = get_local_size(0);
+    const unsigned int idx_b = get_global_id(0) / bsize;
+    const unsigned int num_groups = get_num_groups(0);
+    const unsigned int warps_per_group = bsize / warp_size;
+    const unsigned int idx_t = get_local_id(0);                   // thread id in work group
+    const unsigned int thread_id_in_warp = idx_t % warp_size;     // thread id in warp (32 threads)
+    const unsigned int warp_id_in_group = idx_t / warp_size;
+    const unsigned int lmem_offset = warp_id_in_group * bs * bs;            // each workgroup gets some lmem, but the workitems have to share it
+    for (int row = idx_b; row < Nb; row+=num_groups) {
+        int jColStart = Ut_rows[row];
+        int jColEnd = Ut_rows[row + 1];
+        for (int ij = jColStart + warp_id_in_group; ij < jColEnd; ij+=warps_per_group) {
+            int col = Ut_cols[ij];
+            // refine Uij element (or diagonal)
+            int i1 = LU_rows[col];
+            int i2 = LU_rows[col+1];
+            int kk = 0;
+            for(kk = i1; kk < i2; ++kk) {
+                int c = LU_cols[kk];
+                if (c >= row) {
+                    break;
+                }
+            }
+
+            if(thread_id_in_warp < bs*bs){
+                aij[lmem_offset+thread_id_in_warp] = LU_vals[kk*bs*bs + thread_id_in_warp];
+            }
+
+            int jk = L_rows[col];
+            int ik = (jk < L_rows[col+1]) ? L_cols[jk] : Nb;
+
+            for (int k = jColStart; k < ij; ++k) {
+                int ki = Ut_cols[k];
+                while (ik < ki) {
+                    ++jk;
+                    ik = L_cols[jk];
+                }
+                if (ik == ki) {
+                    blockMultSub(aij+lmem_offset, L_vals + jk * bs * bs, Ut_vals + k * bs * bs);
+                }
+            }
+
+            if(thread_id_in_warp < bs*bs){
+                Utmp[ij*bs*bs + thread_id_in_warp] = aij[lmem_offset + thread_id_in_warp];
+            }
+        }
+
+        // update L
+        int iRowStart = L_rows[row];
+        int iRowEnd = L_rows[row + 1];
+
+        for (int ij = iRowStart + warp_id_in_group; ij < iRowEnd; ij+=warps_per_group) {
+        // for (int ij = iRowStart + idx_t; ij < iRowEnd; ij+=bsize) {
+            int j = L_cols[ij];
+            // // refine Lij element
+            int i1 = LU_rows[row];
+            int i2 = LU_rows[row+1];
+            int kk = 0;
+            for(kk = i1; kk < i2; ++kk) {
+                int c = LU_cols[kk];
+                if (c >= j) {
+                    break;
+                }
+            }
+
+            if(thread_id_in_warp < bs*bs){
+                aij[lmem_offset+thread_id_in_warp] = LU_vals[kk*bs*bs + thread_id_in_warp];
+            }
+
+            int jk = Ut_rows[j];
+            int ik = Ut_cols[jk];
+            for (int k = iRowStart; k < ij; ++k) {
+                int ki = L_cols[k];
+                while(ik < ki) {
+                    ++jk;
+                    ik = Ut_cols[jk];
+                }
+
+                if(ik == ki) {
+                    blockMultSub(aij+lmem_offset, L_vals + k * bs * bs , Ut_vals + jk * bs * bs);
+                }
+            }
+
+            // calculate aij / ujj
+            invert(Ut_vals + (Ut_rows[j+1] - 1) * bs * bs, ujj+lmem_offset);
+
+            // lij = aij / ujj
+            blockMult(aij+lmem_offset, ujj+lmem_offset, Ltmp + ij * bs * bs);
+        }
+    }
+}
+)";
+
+#else
+
+inline const char* fgpilu_sweep_s  = R"(
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+void blockMultSub(
+    __local double * restrict a,
+    __global const double * restrict b,
+    __global const double * restrict c)
+{
+    const unsigned int block_size = 3;
+    for (unsigned int row = 0; row < block_size; row++) {
+        for (unsigned int col = 0; col < block_size; col++) {
+            double temp = 0.0;
+            for (unsigned int k = 0; k < block_size; k++) {
+                temp += b[block_size * row + k] * c[block_size * k + col];
+            }
+            a[block_size * row + col] -= temp;
+        }
+    }
+}
+
+
+void blockMult(
+    __local const double * restrict mat1,
+    __local const double * restrict mat2,
+    __global double * restrict resMat)
+{
+    const unsigned int block_size = 3;
+    for (unsigned int row = 0; row < block_size; row++) {
+        for (unsigned int col = 0; col < block_size; col++) {
+            double temp = 0.0;
+            for (unsigned int k = 0; k < block_size; k++) {
+                temp += mat1[block_size * row + k] * mat2[block_size * k + col];
+            }
+            resMat[block_size * row + col] = temp;
+        }
+    }
+}
+
+
+__kernel void inverter(
+    __global const double * restrict matrix,
+    __local double * restrict inverse)
+{
+    // code generated by maple, copied from Dune::DenseMatrix
+    double t4  = matrix[0] * matrix[4];
+    double t6  = matrix[0] * matrix[5];
+    double t8  = matrix[1] * matrix[3];
+    double t10 = matrix[2] * matrix[3];
+    double t12 = matrix[1] * matrix[6];
+    double t14 = matrix[2] * matrix[6];
+
+    double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] +
+                  t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]);
+    double t17 = 1.0 / det;
+
+    inverse[0] =  (matrix[4] * matrix[8] - matrix[5] * matrix[7]) * t17;
+    inverse[1] = -(matrix[1] * matrix[8] - matrix[2] * matrix[7]) * t17;
+    inverse[2] =  (matrix[1] * matrix[5] - matrix[2] * matrix[4]) * t17;
+    inverse[3] = -(matrix[3] * matrix[8] - matrix[5] * matrix[6]) * t17;
+    inverse[4] =  (matrix[0] * matrix[8] - t14) * t17;
+    inverse[5] = -(t6 - t10) * t17;
+    inverse[6] =  (matrix[3] * matrix[7] - matrix[4] * matrix[6]) * t17;
+    inverse[7] = -(matrix[0] * matrix[7] - t12) * t17;
+    inverse[8] =  (t4 - t8) * t17;
+}
+
+__kernel void fgpilu_sweep(
+    __global const double * restrict Ut_vals,
+    __global const double * restrict L_vals,
+    __global const double * restrict LU_vals,
+    __global const int * restrict Ut_rows,
+    __global const int * restrict L_rows,
+    __global const int * restrict LU_rows,
+    __global const int * restrict Ut_cols,
+    __global const int * restrict L_cols,
+    __global const int * restrict LU_cols,
+    __global double * restrict Ltmp,
+    __global double * restrict Utmp,
+    const int Nb,
+    __local double *aij,
+    __local double *ujj)
+{
+    const int bs = 3;
+
+    // for every row
+    const unsigned int warp_size = 32;
+    const unsigned int bsize = get_local_size(0);
+    const unsigned int idx_b = get_global_id(0) / bsize;
+    const unsigned int num_groups = get_num_groups(0);
+    const unsigned int warps_per_group = bsize / warp_size;
+    const unsigned int idx_t = get_local_id(0);                   // thread id in work group
+    const unsigned int thread_id_in_warp = idx_t % warp_size;     // thread id in warp (32 threads)
+    const unsigned int warp_id_in_group = idx_t / warp_size;
+    for (int row = idx_b; row < Nb; row+=num_groups) {
+        int jColStart = Ut_rows[row];
+        int jColEnd = Ut_rows[row + 1];
+        for (int ij = jColStart + idx_t; ij < jColEnd; ij+=bsize) {
+            int col = Ut_cols[ij];
+            // refine Uij element (or diagonal)
+            int i1 = LU_rows[col];
+            int i2 = LU_rows[col+1];
+            int kk = 0;
+            for(kk = i1; kk < i2; ++kk) {
+                int c = LU_cols[kk];
+                if (c >= row) {
+                    break;
+                }
+            }
+
+            for(int z = 0; z < bs*bs; ++z){
+                aij[idx_t*bs*bs+z] = LU_vals[kk*bs*bs + z];
+            }
+
+            int jk = L_rows[col];
+            int ik = (jk < L_rows[col+1]) ? L_cols[jk] : Nb;
+
+            for (int k = jColStart; k < ij; ++k) {
+                int ki = Ut_cols[k];
+                while (ik < ki) {
+                    ++jk;
+                    ik = L_cols[jk];
+                }
+                if (ik == ki) {
+                    blockMultSub(aij+idx_t*bs*bs, L_vals + jk * bs * bs, Ut_vals + k * bs * bs);
+                }
+            }
+
+            for(int z = 0; z < bs*bs; ++z){
+                Utmp[ij*bs*bs + z] = aij[idx_t*bs*bs+z];
+            }
+        }
+
+        // update L
+        int iRowStart = L_rows[row];
+        int iRowEnd = L_rows[row + 1];
+
+        for (int ij = iRowStart + idx_t; ij < iRowEnd; ij+=bsize) {
+            int j = L_cols[ij];
+            // // refine Lij element
+            int i1 = LU_rows[row];
+            int i2 = LU_rows[row+1];
+            int kk = 0;
+            for(kk = i1; kk < i2; ++kk) {
+                int c = LU_cols[kk];
+                if (c >= j) {
+                    break;
+                }
+            }
+
+            for(int z = 0; z < bs*bs; ++z){
+                aij[idx_t*bs*bs+z] = LU_vals[kk*bs*bs + z];
+            }
+
+            int jk = Ut_rows[j];
+            int ik = Ut_cols[jk];
+            for (int k = iRowStart; k < ij; ++k) {
+                int ki = L_cols[k];
+                while(ik < ki) {
+                    ++jk;
+                    ik = Ut_cols[jk];
+                }
+
+                if(ik == ki) {
+                    blockMultSub(aij+idx_t*bs*bs, L_vals + k * bs * bs , Ut_vals + jk * bs * bs);
+                }
+            }
+            // calculate aij / ujj
+            inverter(Ut_vals + (Ut_rows[j+1] - 1) * bs * bs, ujj+idx_t*bs*bs);
+
+            // lij = aij / ujj
+            blockMult(aij+idx_t*bs*bs, ujj+idx_t*bs*bs, Ltmp + ij * bs * bs);
+        }
+    }
+}
+)";
+
+#endif
+
+
+
+
+
+
+
+void FGPILU::decomposition(
+    cl::CommandQueue *queue, cl::Context *context,
+    int *Ut_ptrs, int *Ut_idxs, double *Ut_vals, int Ut_nnzbs,
+    int *L_rows, int *L_cols, double *L_vals, int L_nnzbs,
+    int *LU_rows, int *LU_cols, double *LU_vals, int LU_nnzbs,
+    int Nb, int num_sweeps, int verbosity)
+{
+    const int block_size = 3;
+
+    try {
+        if (initialized == false) {
+            cl::Program::Sources source(1, std::make_pair(fgpilu_sweep_s, strlen(fgpilu_sweep_s)));  // what does this '1' mean? cl::Program::Sources is of type 'std::vector<std::pair<const char*, long unsigned int> >'
+            cl::Program program = cl::Program(*context, source, &err);
+
+            std::vector<cl::Device> devices = context->getInfo<CL_CONTEXT_DEVICES>();
+            program.build(devices);
+
+            fgpilu_sweep_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&,
+                                                     cl::Buffer&, cl::Buffer&, cl::Buffer&,
+                                                     cl::Buffer&, cl::Buffer&, cl::Buffer&,
+                                                     cl::Buffer&, cl::Buffer&,
+                                                     const int, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "fgpilu_sweep", &err)));
+
+            // allocate GPU memory
+            d_Ut_vals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * Ut_nnzbs * block_size * block_size);
+            d_L_vals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * L_nnzbs * block_size * block_size);
+            d_LU_vals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * LU_nnzbs * block_size * block_size);
+            d_Ut_ptrs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb+1));
+            d_L_rows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb+1));
+            d_LU_rows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb+1));
+            d_Ut_idxs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Ut_nnzbs);
+            d_L_cols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * L_nnzbs);
+            d_LU_cols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LU_nnzbs);
+            d_Ltmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * L_nnzbs * block_size * block_size);
+            d_Utmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * Ut_nnzbs * block_size * block_size);
+
+            Dune::Timer t_copy_pattern;
+            err |= queue->enqueueWriteBuffer(d_Ut_ptrs, CL_TRUE, 0, sizeof(int) * (Nb+1), Ut_ptrs);
+            err |= queue->enqueueWriteBuffer(d_L_rows, CL_TRUE, 0, sizeof(int) * (Nb+1), L_rows);
+            err |= queue->enqueueWriteBuffer(d_LU_rows, CL_TRUE, 0, sizeof(int) * (Nb+1), LU_rows);
+            err |= queue->enqueueWriteBuffer(d_Ut_idxs, CL_TRUE, 0, sizeof(int) * Ut_nnzbs, Ut_idxs);
+            err |= queue->enqueueWriteBuffer(d_L_cols, CL_TRUE, 0, sizeof(int) * L_nnzbs, L_cols);
+            err |= queue->enqueueWriteBuffer(d_LU_cols, CL_TRUE, 0, sizeof(int) * LU_nnzbs, LU_cols);
+            if (verbosity >= 3){
+                std::ostringstream out;
+                out << "FGPILU copy sparsity pattern time: " << t_copy_pattern.stop() << " s";
+                OpmLog::info(out.str());
+            }
+            if (verbosity >= 2){
+                std::ostringstream out;
+                out << "FGPILU PARALLEL: " << PARALLEL;
+                OpmLog::info(out.str());
+            }
+
+            initialized = true;
+        }
+
+
+        // copy to GPU
+        Dune::Timer t_copy1;
+        err = queue->enqueueWriteBuffer(d_Ut_vals, CL_TRUE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals);
+        err |= queue->enqueueWriteBuffer(d_L_vals, CL_TRUE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals);
+        err |= queue->enqueueWriteBuffer(d_LU_vals, CL_TRUE, 0, sizeof(double) * LU_nnzbs * block_size * block_size, LU_vals);
+        if (verbosity >= 3){
+            std::ostringstream out;
+            out << "FGPILU copy1 time: " << t_copy1.stop() << " s";
+            OpmLog::info(out.str());
+        }
+        if (err != CL_SUCCESS) {
+            // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
+            OPM_THROW(std::logic_error, "FGPILU OpenCL enqueueWriteBuffer error");
+        }
+
+        // call kernel
+        for (int sweep = 0; sweep < num_sweeps; ++sweep) {
+            // normally, L_vals and Ltmp are swapped after the sweep is done
+            // these conditionals implement that without actually swapping pointers
+            // 1st sweep reads X_vals, writes to Xtmp
+            // 2nd sweep reads Xtmp, writes to X_vals
+            auto *Larg1 = (sweep % 2 == 0) ? &d_L_vals : &d_Ltmp;
+            auto *Larg2 = (sweep % 2 == 0) ? &d_Ltmp : &d_L_vals;
+            auto *Uarg1 = (sweep % 2 == 0) ? &d_Ut_vals : &d_Utmp;
+            auto *Uarg2 = (sweep % 2 == 0) ? &d_Utmp : &d_Ut_vals;
+            int num_work_groups = Nb;
+#if PARALLEL
+            int work_group_size = 32;
+#else
+            int work_group_size = 16;
+#endif
+            int total_work_items = num_work_groups * work_group_size;
+            int lmem_per_work_group = work_group_size * block_size * block_size * sizeof(double);
+            Dune::Timer t_kernel;
+            event = (*fgpilu_sweep_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
+                *Uarg1, *Larg1, d_LU_vals,
+                d_Ut_ptrs, d_L_rows, d_LU_rows,
+                d_Ut_idxs, d_L_cols, d_LU_cols,
+                *Larg2, *Uarg2, Nb, cl::Local(lmem_per_work_group), cl::Local(lmem_per_work_group));
+            event.wait();
+            if (verbosity >= 3){
+                std::ostringstream out;
+                out << "FGPILU sweep kernel time: " << t_copy1.stop() << " s";
+                OpmLog::info(out.str());
+            }
+        }
+
+        // copy back
+        Dune::Timer t_copy2;
+        if (num_sweeps % 2 == 0) {
+            err = queue->enqueueReadBuffer(d_Ut_vals, CL_TRUE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals);
+            err |= queue->enqueueReadBuffer(d_L_vals, CL_TRUE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals);
+        } else {
+            err = queue->enqueueReadBuffer(d_Utmp, CL_TRUE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals);
+            err |= queue->enqueueReadBuffer(d_Ltmp, CL_TRUE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals);
+        }
+        err |= queue->enqueueBarrierWithWaitList();
+        if (verbosity >= 3){
+            std::ostringstream out;
+            out << "FGPILU copy2 time: " << t_copy2.stop() << " s";
+            OpmLog::info(out.str());
+        }
+        if (err != CL_SUCCESS) {
+            // enqueueReadBuffer is C and does not throw exceptions like C++ OpenCL
+            OPM_THROW(std::logic_error, "FGPILU OpenCL enqueueReadBuffer error");
+        }
+
+    } catch (const cl::Error& error) {
+        std::ostringstream oss;
+        oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
+        oss << getErrorString(error.err()) << std::endl;
+        // rethrow exception
+        OPM_THROW(std::logic_error, oss.str());
+    } catch (const std::logic_error& error) {
+        // rethrow exception by OPM_THROW in the try{}
+        throw error;
+    }
+}
+
+
+} // end namespace bda
+
diff --git a/opm/simulators/linalg/bda/fgpilu.hpp b/opm/simulators/linalg/bda/fgpilu.hpp
new file mode 100644
index 000000000..d51c21061
--- /dev/null
+++ b/opm/simulators/linalg/bda/fgpilu.hpp
@@ -0,0 +1,86 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef FGPILU_HEADER_INCLUDED
+#define FGPILU_HEADER_INCLUDED
+
+
+#include <opm/simulators/linalg/bda/opencl.hpp>
+
+
+
+namespace bda
+{
+
+    // This class implements a blocked version on GPU of the Fine-Grained Parallel ILU (FGPILU) by Chow and Patel 2015:
+    //     FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896
+    // only blocksize == 3 is supported
+    // decomposition() allocates the cl::Buffers on the first call, these are C++ objects that deallocate automatically
+    class FGPILU
+    {
+    private:
+        cl::Buffer d_Ut_vals, d_L_vals, d_LU_vals;
+        cl::Buffer d_Ut_ptrs, d_Ut_idxs;
+        cl::Buffer d_L_rows, d_L_cols;
+        cl::Buffer d_LU_rows, d_LU_cols;
+        cl::Buffer d_Ltmp, d_Utmp;
+
+        cl::Event event;
+        cl_int err;
+
+        std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&,
+                                        cl::Buffer&, cl::Buffer&, cl::Buffer&,
+                                        cl::Buffer&, cl::Buffer&, cl::Buffer&,
+                                        cl::Buffer&, cl::Buffer&,
+                                        const int, cl::LocalSpaceArg, cl::LocalSpaceArg> > fgpilu_sweep_k;
+
+        bool initialized = false;
+
+    public:
+        /// Executes the FGPILU sweeps
+        /// also copies data from CPU to GPU and GPU to CPU
+        /// \param[in] queue         OpenCL commandqueue
+        /// \param[in] context       OpenCL context
+        /// \param[in] Ut_ptrs       BSC columnpointers
+        /// \param[in] Ut_idxs       BSC rowindices
+        /// \param[inout] Ut_vals    actual nonzeros for U
+        /// \param[in] Ut_nnzbs      number of blocks in U
+        /// \param[in] L_rows        BSR rowpointers
+        /// \param[in] L_cols        BSR columnindices
+        /// \param[inout] L_vals     actual nonzeroes for L
+        /// \param[in] L_nnzbs       number of blocks in L
+        /// \param[in] LU_rows       BSR rowpointers
+        /// \param[in] LU_cols       BSR columnindices
+        /// \param[in] LU_vals       actual nonzeroes for LU (original matrix)
+        /// \param[in] LU_nnzbs      number of blocks in LU
+        /// \param[in] Nb            number of blockrows
+        /// \param[in] num_sweeps    number of sweeps to be done
+        /// \param[in] verbosity     print verbosity
+        void decomposition(
+            cl::CommandQueue *queue, cl::Context *context,
+            int *Ut_ptrs, int *Ut_idxs, double *Ut_vals, int Ut_nnzbs,
+            int *L_rows, int *L_cols, double *L_vals, int L_nnzbs,
+            int *LU_rows, int *LU_cols, double *LU_vals, int LU_nnzbs,
+            int Nb, int num_sweeps, int verbosity);
+
+    };
+
+} // end namespace bda
+
+#endif // FGPILU_HEADER_INCLUDED
diff --git a/opm/simulators/linalg/bda/opencl.cpp b/opm/simulators/linalg/bda/opencl.cpp
index 9108a189e..5030d6a6f 100644
--- a/opm/simulators/linalg/bda/opencl.cpp
+++ b/opm/simulators/linalg/bda/opencl.cpp
@@ -88,6 +88,9 @@ std::string getErrorString(cl_int error)
         case -67: return "CL_INVALID_LINKER_OPTIONS";
         case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT";
 
+        // vendor specific errors
+        case -9999: return "NVIDIA_OPENCL_ILLEGAL_READ_OR_WRITE_TO_BUFFER";
+
         default: return "UNKNOWN_CL_CODE";
     }
 }

From 71692ff52b61741c60b1c9e83a430a4bbdc7b3d3 Mon Sep 17 00:00:00 2001
From: tqiu <tqiu@bda-04.domain_not_set.invalid>
Date: Thu, 17 Dec 2020 15:50:53 +0100
Subject: [PATCH 02/14] Disabled FGPILU by default

---
 opm/simulators/linalg/bda/BILU0.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp
index dedb7d222..c21c658a1 100644
--- a/opm/simulators/linalg/bda/BILU0.cpp
+++ b/opm/simulators/linalg/bda/BILU0.cpp
@@ -37,7 +37,7 @@ namespace bda
 //    FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896
 // if CHOW_PATEL_GPU is 0, the decomposition is done on CPU
 // if CHOW_PATEL_GPU is 1, the decomposition is done by bda::FGPILU::decomposition() on GPU
-#define CHOW_PATEL     1
+#define CHOW_PATEL     0
 #define CHOW_PATEL_GPU 1
 
     using Opm::OpmLog;

From f26e58c6ca4e8c10519e1960d8f9a3c12acce3c0 Mon Sep 17 00:00:00 2001
From: tqiu <tqiu@bda-04.domain_not_set.invalid>
Date: Mon, 11 Jan 2021 16:51:05 +0100
Subject: [PATCH 03/14] Reordering is now actually skipped when NONE is chosen.
 Also made reordering optional for openclSolver.

---
 opm/simulators/linalg/bda/BILU0.cpp           | 75 +++++++++++--------
 .../bda/MultisegmentWellContribution.hpp      |  1 +
 .../linalg/bda/openclSolverBackend.cpp        | 42 ++++++++---
 .../linalg/bda/openclSolverBackend.hpp        |  7 +-
 4 files changed, 80 insertions(+), 45 deletions(-)

diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp
index c21c658a1..165106b50 100644
--- a/opm/simulators/linalg/bda/BILU0.cpp
+++ b/opm/simulators/linalg/bda/BILU0.cpp
@@ -53,8 +53,11 @@ namespace bda
     {
         delete[] invDiagVals;
         delete[] diagIndex;
-        delete[] toOrder;
-        delete[] fromOrder;
+        if (opencl_ilu_reorder != ILUReorder::NONE) {
+            delete[] toOrder;
+            delete[] fromOrder;
+        }
+        delete[] LUmat->nnzValues;
     }
 
     template <unsigned int block_size>
@@ -67,23 +70,29 @@ namespace bda
         this->nnz = mat->nnzbs * block_size * block_size;
         this->nnzbs = mat->nnzbs;
 
-        toOrder = new int[Nb];
-        fromOrder = new int[Nb];
+        int *CSCRowIndices = nullptr;
+        int *CSCColPointers = nullptr;
 
-        int *CSCRowIndices = new int[nnzbs];
-        int *CSCColPointers = new int[Nb + 1];
+        if (opencl_ilu_reorder == ILUReorder::NONE) {
+            LUmat = std::make_unique<BlockedMatrix<block_size> >(*mat);
+        } else {
+            toOrder = new int[Nb];
+            fromOrder = new int[Nb];
+            CSCRowIndices = new int[nnzbs];
+            CSCColPointers = new int[Nb + 1];
+            rmat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
+            LUmat = std::make_unique<BlockedMatrix<block_size> >(*rmat);
 
-        Timer t_convert;
-        csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb);
-        if(verbosity >= 3){
-            std::ostringstream out;
-            out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
-            OpmLog::info(out.str());
+            Timer t_convert;
+            csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb);
+            if(verbosity >= 3){
+                std::ostringstream out;
+                out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
+                OpmLog::info(out.str());
+            }
         }
 
         Timer t_analysis;
-        rmat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
-        LUmat = std::make_unique<BlockedMatrix<block_size> >(*rmat);
         std::ostringstream out;
         if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
             out << "BILU0 reordering strategy: " << "level_scheduling\n";
@@ -93,15 +102,11 @@ namespace bda
             findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder, fromOrder, rowsPerColor);
         } else if (opencl_ilu_reorder == ILUReorder::NONE) {
             out << "BILU0 reordering strategy: none\n";
-            numColors = 1;
-            rowsPerColor.emplace_back(Nb);
-            // numColors = Nb;
-            // for(int i = 0; i < Nb; ++i){
-            //     rowsPerColor.emplace_back(1);
-            // }
+            // numColors = 1;
+            // rowsPerColor.emplace_back(Nb);
+            numColors = Nb;
             for(int i = 0; i < Nb; ++i){
-                toOrder[i] = i;
-                fromOrder[i] = i;
+                rowsPerColor.emplace_back(1);
             }
         } else {
             OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
@@ -114,8 +119,10 @@ namespace bda
         }
         OpmLog::info(out.str());
 
-        delete[] CSCRowIndices;
-        delete[] CSCColPointers;
+        if (opencl_ilu_reorder != ILUReorder::NONE) {
+            delete[] CSCRowIndices;
+            delete[] CSCColPointers;
+        }
 
         diagIndex = new int[mat->Nb];
         invDiagVals = new double[mat->Nb * bs * bs];
@@ -385,18 +392,24 @@ namespace bda
     bool BILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
     {
         const unsigned int bs = block_size;
-        Timer t_reorder;
-        reorderBlockedMatrixByPattern<block_size>(mat, toOrder, fromOrder, rmat.get());
+        auto *m = mat;
 
-        if (verbosity >= 3){
-            std::ostringstream out;
-            out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
-            OpmLog::info(out.str());
+        if (opencl_ilu_reorder != ILUReorder::NONE) {
+            m = rmat.get();
+            Timer t_reorder;
+            reorderBlockedMatrixByPattern<block_size>(mat, toOrder, fromOrder, rmat.get());
+
+            if (verbosity >= 3){
+                std::ostringstream out;
+                out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
+                OpmLog::info(out.str());
+            }
         }
 
         // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
+        // this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy
         Timer t_copy;
-        memcpy(LUmat->nnzValues, rmat->nnzValues, sizeof(double) * bs * bs * rmat->nnzbs);
+        memcpy(LUmat->nnzValues, m->nnzValues, sizeof(double) * bs * bs * m->nnzbs);
 
         if (verbosity >= 3){
             std::ostringstream out;
diff --git a/opm/simulators/linalg/bda/MultisegmentWellContribution.hpp b/opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
index 6b9e8fd67..3862b3840 100644
--- a/opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
+++ b/opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
@@ -126,6 +126,7 @@ public:
     /// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
     /// Those indices need to be mapped via toOrder
     /// \param[in] toOrder    array with mappings
+    /// \param[in] reorder    whether reordering is actually used or not
     void setReordering(int *toOrder, bool reorder);
 };
 
diff --git a/opm/simulators/linalg/bda/openclSolverBackend.cpp b/opm/simulators/linalg/bda/openclSolverBackend.cpp
index d90c9ac6f..3c7aad43a 100644
--- a/opm/simulators/linalg/bda/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp
@@ -44,7 +44,7 @@ using Opm::OpmLog;
 using Dune::Timer;
 
 template <unsigned int block_size>
-openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_) {
+openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_reorder(opencl_ilu_reorder_) {
     prec = new Preconditioner(opencl_ilu_reorder, verbosity_);
 
     cl_int err = CL_SUCCESS;
@@ -485,7 +485,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
         prec->setOpenCLContext(context.get());
         prec->setOpenCLQueue(queue.get());
 
-        rb = new double[N];
         tmp = new double[N];
 #if COPY_ROW_BY_ROW
         vals_contiguous = new double[N];
@@ -508,7 +507,12 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
         d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
         d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
 
-        d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
+        bool reorder = (opencl_ilu_reorder != ILUReorder::NONE);
+        if (reorder) {
+            rb = new double[N];
+            d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
+        }
+        wcontainer->init(wellContribs, N, Nb, reorder);
 
         // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
         // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
@@ -544,7 +548,9 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
 
 template <unsigned int block_size>
 void openclSolverBackend<block_size>::finalize() {
-    delete[] rb;
+    if (opencl_ilu_reorder != ILUReorder::NONE) {
+        delete[] rb;
+    }
     delete[] tmp;
 #if COPY_ROW_BY_ROW
     delete[] vals_contiguous;
@@ -572,7 +578,9 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
     queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices);
     queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers);
     queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb);
-    queue->enqueueWriteBuffer(d_toOrder, CL_TRUE, 0, sizeof(int) * Nb, toOrder);
+    if (opencl_ilu_reorder != ILUReorder::NONE) {
+        queue->enqueueWriteBuffer(d_toOrder, CL_TRUE, 0, sizeof(int) * Nb, toOrder);
+    }
     queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
     event.wait();
 
@@ -624,9 +632,13 @@ bool openclSolverBackend<block_size>::analyse_matrix() {
     int lmem_per_work_group = work_group_size * sizeof(double);
     prec->setKernelParameters(work_group_size, total_work_items, lmem_per_work_group);
 
-    toOrder = prec->getToOrder();
-    fromOrder = prec->getFromOrder();
-    rmat = prec->getRMat();
+    if (opencl_ilu_reorder == ILUReorder::NONE) {
+        rmat = mat.get();
+    } else {
+        toOrder = prec->getToOrder();
+        fromOrder = prec->getFromOrder();
+        rmat = prec->getRMat();
+    }
 
     if (verbosity > 2) {
         std::ostringstream out;
@@ -645,7 +657,11 @@ void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
     Timer t;
 
     mat->nnzValues = vals;
-    reorderBlockedVectorByPattern<block_size>(mat->Nb, b, fromOrder, rb);
+    if (opencl_ilu_reorder != ILUReorder::NONE) {
+        reorderBlockedVectorByPattern<block_size>(mat->Nb, b, fromOrder, rb);
+    } else {
+        rb = b;
+    }
 
     if (verbosity > 2) {
         std::ostringstream out;
@@ -703,8 +719,12 @@ template <unsigned int block_size>
 void openclSolverBackend<block_size>::get_result(double *x) {
     Timer t;
 
-    queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, rb);
-    reorderBlockedVectorByPattern<block_size>(mat->Nb, rb, toOrder, x);
+    if (opencl_ilu_reorder != ILUReorder::NONE) {
+        queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, rb);
+        reorderBlockedVectorByPattern<block_size>(mat->Nb, rb, toOrder, x);
+    } else {
+        queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, x);
+    }
 
     if (verbosity > 2) {
         std::ostringstream out;
diff --git a/opm/simulators/linalg/bda/openclSolverBackend.hpp b/opm/simulators/linalg/bda/openclSolverBackend.hpp
index 1c36fefcb..1a8dfa73b 100644
--- a/opm/simulators/linalg/bda/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp
@@ -51,7 +51,7 @@ class openclSolverBackend : public BdaSolver<block_size>
     using Base::initialized;
 
 private:
-    double *rb = nullptr;                 // reordered b vector, the matrix is reordered, so b must also be
+    double *rb = nullptr;                 // reordered b vector, if the matrix is reordered, rb is newly allocated, otherwise it just points to b
     double *vals_contiguous = nullptr;    // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
 
     // OpenCL variables must be reusable, they are initialized in initialize()
@@ -59,7 +59,7 @@ private:
     cl::Buffer d_x, d_b, d_rb, d_r, d_rw, d_p;   // vectors, used during linear solve
     cl::Buffer d_pw, d_s, d_t, d_v;              // vectors, used during linear solve
     cl::Buffer d_tmp;                            // used as tmp GPU buffer for dot() and norm()
-    cl::Buffer d_toOrder;
+    cl::Buffer d_toOrder;                        // only used when reordering is used
     double *tmp = nullptr;                       // used as tmp CPU buffer for dot() and norm()
 
     // shared pointers are also passed to other objects
@@ -81,7 +81,8 @@ private:
     int *toOrder = nullptr, *fromOrder = nullptr;                 // BILU0 reorders rows of the matrix via these mappings
     bool analysis_done = false;
     std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr;    // original matrix 
-    BlockedMatrix<block_size> *rmat = nullptr;                    // reordered matrix, used for spmv
+    BlockedMatrix<block_size> *rmat = nullptr;                    // reordered matrix (or original if no reordering), used for spmv
+    ILUReorder opencl_ilu_reorder;                                // reordering strategy
 
     /// Divide A by B, and round up: return (int)ceil(A/B)
     /// \param[in] A    dividend

From 0f9ae3695d90f44476582eeab09fb2a7d8f1779c Mon Sep 17 00:00:00 2001
From: tqiu <tqiu@bda-04.domain_not_set.invalid>
Date: Wed, 13 Jan 2021 11:54:40 +0100
Subject: [PATCH 04/14] Made reordering optional for WellContributions Removed
 unnecessary alloc

---
 opm/simulators/linalg/bda/BILU0.cpp           |  3 -
 .../linalg/bda/WellContributions.cpp          | 45 ++++++------
 .../linalg/bda/WellContributions.hpp          | 16 +++-
 opm/simulators/linalg/bda/openclKernels.hpp   | 73 +++++++++++++++++++
 .../linalg/bda/openclSolverBackend.cpp        | 17 +++--
 .../linalg/bda/openclSolverBackend.hpp        |  7 +-
 6 files changed, 124 insertions(+), 37 deletions(-)

diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp
index 165106b50..e5f95ac87 100644
--- a/opm/simulators/linalg/bda/BILU0.cpp
+++ b/opm/simulators/linalg/bda/BILU0.cpp
@@ -57,7 +57,6 @@ namespace bda
             delete[] toOrder;
             delete[] fromOrder;
         }
-        delete[] LUmat->nnzValues;
     }
 
     template <unsigned int block_size>
@@ -130,8 +129,6 @@ namespace bda
         Lmat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
         Umat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
 
-        LUmat->nnzValues = new double[mat->nnzbs * bs * bs];
-
         s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
         s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Umat->nnzbs);
         s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
diff --git a/opm/simulators/linalg/bda/WellContributions.cpp b/opm/simulators/linalg/bda/WellContributions.cpp
index 3eca058af..f17c53c3b 100644
--- a/opm/simulators/linalg/bda/WellContributions.cpp
+++ b/opm/simulators/linalg/bda/WellContributions.cpp
@@ -56,11 +56,6 @@ WellContributions::~WellContributions()
 
     if(num_std_wells > 0){
         delete[] val_pointers;
-#if HAVE_OPENCL
-        if(opencl_gpu){
-            delete[] h_toOrder;
-        }
-#endif
     }
 
 #if HAVE_OPENCL
@@ -79,8 +74,15 @@ void WellContributions::setOpenCLEnv(cl::Context *context_, cl::CommandQueue *qu
     this->queue = queue_;
 }
 
-void WellContributions::setKernel(kernel_type *kernel_){
+void WellContributions::setKernel(kernel_type *kernel_, kernel_type_no_reorder *kernel_no_reorder_){
     this->kernel = kernel_;
+    this->kernel_no_reorder = kernel_no_reorder_;
+}
+
+void WellContributions::setReordering(int *h_toOrder_, bool reorder_)
+{
+    this->h_toOrder = h_toOrder_;
+    this->reorder = reorder_;
 }
 
 void WellContributions::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
@@ -90,29 +92,24 @@ void WellContributions::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffe
     const unsigned int lmem2 = sizeof(double) * dim_wells;
 
     cl::Event event;
-    event = (*kernel)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
-                      *d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl,
-                      cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
+    if (reorder) {
+        event = (*kernel)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
+                          *d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl,
+                          cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
+    } else {
+        event = (*kernel_no_reorder)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
+                          *d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, d_x, d_y, dim, dim_wells, *d_val_pointers_ocl,
+                          cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
+    }
+    event.wait();
 }
 
-void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
+void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){
     if(h_x == nullptr){
         h_x = new double[N];
         h_y = new double[N];
     }
 
-    if(h_toOrder == nullptr){
-        h_toOrder = new int[Nb];
-    }
-
-    if(!read_toOrder){
-        events.resize(1);
-        queue->enqueueReadBuffer(d_toOrder, CL_FALSE, 0, sizeof(int) * Nb, h_toOrder, nullptr, &events[0]);
-        events[0].wait();
-        events.clear();
-        read_toOrder = true;
-    }
-
     events.resize(2);
     queue->enqueueReadBuffer(d_x, CL_FALSE, 0, sizeof(double) * N, h_x, nullptr, &events[0]);
     queue->enqueueReadBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y, nullptr, &events[1]);
@@ -121,7 +118,7 @@ void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer
 
     // actually apply MultisegmentWells
     for(Opm::MultisegmentWellContribution *well: multisegments){
-        well->setReordering(h_toOrder, true);
+        well->setReordering(h_toOrder, reorder);
         well->apply(h_x, h_y);
     }
 
@@ -138,7 +135,7 @@ void WellContributions::apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrd
     }
 
     if(num_ms_wells > 0){
-        apply_mswells(d_x, d_y, d_toOrder);
+        apply_mswells(d_x, d_y);
     }
 }
 #endif
diff --git a/opm/simulators/linalg/bda/WellContributions.hpp b/opm/simulators/linalg/bda/WellContributions.hpp
index 7267fe1f0..f3f8e3fd2 100644
--- a/opm/simulators/linalg/bda/WellContributions.hpp
+++ b/opm/simulators/linalg/bda/WellContributions.hpp
@@ -95,17 +95,21 @@ private:
     typedef cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
                             cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&,
                             cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type;
+    typedef cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
+                            const unsigned int, const unsigned int, cl::Buffer&,
+                            cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type_no_reorder;
 
     cl::Context *context;
     cl::CommandQueue *queue;
     kernel_type *kernel;
+    kernel_type_no_reorder *kernel_no_reorder;
     std::vector<cl::Event> events;
 
     std::unique_ptr<cl::Buffer> d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
     std::unique_ptr<cl::Buffer> d_Ccols_ocl, d_Bcols_ocl;
     std::unique_ptr<cl::Buffer> d_val_pointers_ocl;
 
-    bool read_toOrder = false;
+    bool reorder = false;
     int *h_toOrder = nullptr;
 #endif
 
@@ -150,10 +154,16 @@ public:
 #endif
 
 #if HAVE_OPENCL
-    void setKernel(kernel_type *kernel_);
+    void setKernel(kernel_type *kernel_, kernel_type_no_reorder *kernel_no_reorder_);
     void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
+
+    /// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
+    /// Those indices need to be mapped via toOrder
+    /// \param[in] toOrder    array with mappings
+    /// \param[in] reorder    whether reordering is actually used or not
+    void setReordering(int *toOrder, bool reorder);
     void apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
-    void apply_mswells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
+    void apply_mswells(cl::Buffer d_x, cl::Buffer d_y);
     void apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
 #endif
 
diff --git a/opm/simulators/linalg/bda/openclKernels.hpp b/opm/simulators/linalg/bda/openclKernels.hpp
index 021905e18..e3ffadf60 100644
--- a/opm/simulators/linalg/bda/openclKernels.hpp
+++ b/opm/simulators/linalg/bda/openclKernels.hpp
@@ -440,6 +440,79 @@ namespace bda
     }
     )";
 
+    inline const char* stdwell_apply_no_reorder_s = R"(
+    __kernel void stdwell_apply_no_reorder(__global const double *Cnnzs,
+                                __global const double *Dnnzs,
+                                __global const double *Bnnzs,
+                                __global const int *Ccols,
+                                __global const int *Bcols,
+                                __global const double *x,
+                                __global double *y,
+                                const unsigned int dim,
+                                const unsigned int dim_wells,
+                                __global const unsigned int *val_pointers,
+                                __local double *localSum,
+                                __local double *z1,
+                                __local double *z2){
+        int wgId = get_group_id(0);
+        int wiId = get_local_id(0);
+        int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
+        int valsPerBlock = dim*dim_wells;
+        int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
+        int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
+        int c = wiId % dim;
+        int r = (wiId/dim) % dim_wells;
+        double temp;
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        localSum[wiId] = 0;
+        if(wiId < numActiveWorkItems){
+            int b = wiId/valsPerBlock + val_pointers[wgId];
+            while(b < valSize + val_pointers[wgId]){
+                int colIdx = Bcols[b];
+                localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
+                b += numBlocksPerWarp;
+            }
+
+            if(wiId < valsPerBlock){
+                localSum[wiId] += localSum[wiId + valsPerBlock];
+            }
+
+            b = wiId/valsPerBlock + val_pointers[wgId];
+
+            if(c == 0 && wiId < valsPerBlock){
+                for(unsigned int stride = 2; stride > 0; stride >>= 1){
+                    localSum[wiId] += localSum[wiId + stride];
+                }
+                z1[r] = localSum[wiId];
+            }
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        if(wiId < dim_wells){
+            temp = 0.0;
+            for(unsigned int i = 0; i < dim_wells; ++i){
+                temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
+            }
+            z2[wiId] = temp;
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        if(wiId < dim*valSize){
+            temp = 0.0;
+            int bb = wiId/dim + val_pointers[wgId];
+            for (unsigned int j = 0; j < dim_wells; ++j){
+                temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
+            }
+            int colIdx = Ccols[bb];
+            y[colIdx*dim + c] -= temp;
+        }
+    }
+    )";
+
 } // end namespace bda
 
 #endif
diff --git a/opm/simulators/linalg/bda/openclSolverBackend.cpp b/opm/simulators/linalg/bda/openclSolverBackend.cpp
index 3c7aad43a..5e10ad8ec 100644
--- a/opm/simulators/linalg/bda/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp
@@ -20,7 +20,6 @@
 #include <config.h>
 #include <cmath>
 #include <sstream>
-#include <iostream>
 
 #include <opm/common/OpmLog/OpmLog.hpp>
 #include <opm/common/ErrorMacros.hpp>
@@ -318,7 +317,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
     double norm, norm_0;
 
     if(wellContribs.getNumWells() > 0){
-        wellContribs.setKernel(stdwell_apply_k.get());
+        wellContribs.setKernel(stdwell_apply_k.get(), stdwell_apply_no_reorder_k.get());
     }
 
     Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
@@ -479,6 +478,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
         source.emplace_back(std::make_pair(ILU_apply1_s, strlen(ILU_apply1_s)));
         source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
         source.emplace_back(std::make_pair(stdwell_apply_s, strlen(stdwell_apply_s)));
+        source.emplace_back(std::make_pair(stdwell_apply_no_reorder_s, strlen(stdwell_apply_no_reorder_s)));
         program = cl::Program(*context, source);
         program.build(devices);
 
@@ -512,7 +512,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
             rb = new double[N];
             d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
         }
-        wcontainer->init(wellContribs, N, Nb, reorder);
 
         // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
         // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
@@ -529,6 +528,10 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
                                                   cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                                   const unsigned int, const unsigned int, cl::Buffer&,
                                                   cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply")));
+        stdwell_apply_no_reorder_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
+                                                  cl::Buffer&, cl::Buffer&, cl::Buffer&,
+                                                  const unsigned int, const unsigned int, cl::Buffer&,
+                                                  cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply_no_reorder")));
 
         prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());
 
@@ -653,14 +656,16 @@ bool openclSolverBackend<block_size>::analyse_matrix() {
 
 
 template <unsigned int block_size>
-void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
+void openclSolverBackend<block_size>::update_system(double *vals, double *b, WellContributions &wellContribs) {
     Timer t;
 
     mat->nnzValues = vals;
     if (opencl_ilu_reorder != ILUReorder::NONE) {
         reorderBlockedVectorByPattern<block_size>(mat->Nb, b, fromOrder, rb);
+        wellContribs.setReordering(toOrder, true);
     } else {
         rb = b;
+        wellContribs.setReordering(nullptr, false);
     }
 
     if (verbosity > 2) {
@@ -743,13 +748,13 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
                 return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
             }
         }
-        update_system(vals, b);
+        update_system(vals, b, wellContribs);
         if (!create_preconditioner()) {
             return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
         }
         copy_system_to_gpu();
     } else {
-        update_system(vals, b);
+        update_system(vals, b, wellContribs);
         if (!create_preconditioner()) {
             return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
         }
diff --git a/opm/simulators/linalg/bda/openclSolverBackend.hpp b/opm/simulators/linalg/bda/openclSolverBackend.hpp
index 1a8dfa73b..aae527643 100644
--- a/opm/simulators/linalg/bda/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp
@@ -76,6 +76,10 @@ private:
                                     cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                     const unsigned int, const unsigned int, cl::Buffer&,
                                     cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > stdwell_apply_k;
+    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
+                                    cl::Buffer&, cl::Buffer&, cl::Buffer&,
+                                    const unsigned int, const unsigned int, cl::Buffer&,
+                                    cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > stdwell_apply_no_reorder_k;
 
     Preconditioner *prec = nullptr;                               // only supported preconditioner is BILU0
     int *toOrder = nullptr, *fromOrder = nullptr;                 // BILU0 reorders rows of the matrix via these mappings
@@ -152,7 +156,8 @@ private:
     /// Reorder the linear system so it corresponds with the coloring
     /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
     /// \param[in] b              input vectors, contains N values
-    void update_system(double *vals, double *b);
+    /// \param[out] wellContribs  WellContributions, to set reordering
+    void update_system(double *vals, double *b, WellContributions &wellContribs);
 
     /// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
     void update_system_on_gpu();

From a8e524fc9d97ddd6bed18551e08e1ce2418bce46 Mon Sep 17 00:00:00 2001
From: tqiu <tqiu@bda-04.domain_not_set.invalid>
Date: Mon, 18 Jan 2021 14:11:07 +0100
Subject: [PATCH 05/14] Minor C to C++ changes

---
 opm/simulators/linalg/bda/BILU0.cpp | 33 +++++++++++++----------------
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp
index e5f95ac87..40449cb52 100644
--- a/opm/simulators/linalg/bda/BILU0.cpp
+++ b/opm/simulators/linalg/bda/BILU0.cpp
@@ -167,7 +167,6 @@ namespace bda
         // split matrix into L and U
         // also convert U into BSC format (Ut)
         // Ut stores diagonal for now
-        int num_blocks_L = 0;
 
         // Ut is actually BSC format
         std::unique_ptr<BlockedMatrix<bs> > Ut = std::make_unique<BlockedMatrix<bs> >(Umat->Nb, Umat->nnzbs + Umat->Nb);
@@ -177,6 +176,7 @@ namespace bda
             Ut->rowPointers[i] = 0;
         }
 
+        int num_blocks_L = 0;
         // for every row
         for (int i = 0; i < Nb; i++) {
             int iRowStart = LUmat->rowPointers[i];
@@ -188,19 +188,16 @@ namespace bda
                     Ut->rowPointers[j+1]++;   // actually colPointers
                 } else {
                     Lmat->colIndices[num_blocks_L] = j;
-                    memcpy(Lmat->nnzValues + num_blocks_L * bs * bs, LUmat->nnzValues + ij * bs * bs, sizeof(double) * bs * bs);
+                    std::copy(LUmat->nnzValues + ij * bs * bs, LUmat->nnzValues + (ij+1) * bs * bs, Lmat->nnzValues + num_blocks_L * bs * bs);
                     num_blocks_L++;
                 }
             }
+            // TODO: copy all blocks for L at once, instead of copying each block individually
             Lmat->rowPointers[i+1] = num_blocks_L;
         }
 
         // prefix sum
-        int sum = 0;
-        for (int i = 1; i < Nb+1; i++) {
-            sum += Ut->rowPointers[i];
-            Ut->rowPointers[i] = sum;
-        }
+        std::partial_sum(Ut->rowPointers, Ut->rowPointers+Nb+1, Ut->rowPointers);
 
         // for every row
         for (int i = 0; i < Nb; i++) {
@@ -317,13 +314,17 @@ namespace bda
 
                 }
             }
-            double *t = Lmat->nnzValues;
-            Lmat->nnzValues = Ltmp;
-            Ltmp = t;
-            t = Ut->nnzValues;
-            Ut->nnzValues = Utmp;
-            Utmp = t;
+            // 1st sweep writes to Ltmp
+            // 2nd sweep writes to Lmat->nnzValues
+            std::swap(Lmat->nnzValues, Ltmp);
+            std::swap(Ut->nnzValues, Utmp);
         } // end sweep
+
+        // if number of sweeps is odd, swap again so data is in Lmat->nnzValues
+        if (num_sweeps % 2 == 1) {
+            std::swap(Lmat->nnzValues, Ltmp);
+            std::swap(Ut->nnzValues, Utmp);
+        }
         delete[] Ltmp;
 #endif
 
@@ -346,11 +347,7 @@ namespace bda
         }
 
         // prefix sum
-        int sumU = 0;
-        for (int i = 1; i < Nb+1; i++) {
-            sumU += ptr[i];
-            ptr[i] = sumU;
-        }
+        std::partial_sum(ptr.begin(), ptr.end(), ptr.begin());
 
         // actually copy nonzero values for U
         for(int i = 0; i < Nb; ++i) {

From 123e3fa89e653ccce5b5382d40a947b230badfbd Mon Sep 17 00:00:00 2001
From: tqiu <tqiu@bda-04.domain_not_set.invalid>
Date: Mon, 18 Jan 2021 17:10:46 +0100
Subject: [PATCH 06/14] Use std::find and added comments

---
 opm/simulators/linalg/bda/BILU0.cpp | 94 +++++++++++++++++------------
 1 file changed, 57 insertions(+), 37 deletions(-)

diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp
index 40449cb52..b79964d65 100644
--- a/opm/simulators/linalg/bda/BILU0.cpp
+++ b/opm/simulators/linalg/bda/BILU0.cpp
@@ -238,80 +238,100 @@ namespace bda
         double *Ltmp = new double[Lmat->nnzbs * block_size * block_size];
         for (int sweep = 0; sweep < num_sweeps; ++sweep) {
 
+            // algorithm
+            // for every block in A (LUmat):
+            //     if i > j:
+            //         Lij = (Aij - sum k=1 to j-1 {Lik*Ukj}) / Ujj
+            //     else:
+            //         Uij = (Aij - sum k=1 to i-1 {Lik*Ukj})
+
             // for every row
             for (int row = 0; row < Nb; row++) {
                 // update U
+                // Uij = (Aij - sum k=1 to i-1 {Lik*Ukj})
                 int jColStart = Ut->rowPointers[row];
                 int jColEnd = Ut->rowPointers[row + 1];
+                int colU = row; // rename for clarity, next row in Ut means next col in U
                 // for every block in this row
                 for (int ij = jColStart; ij < jColEnd; ij++) {
-                    int col = Ut->colIndices[ij];
+                    int rowU1 = Ut->colIndices[ij]; // actually rowIndices for U
                     // refine Uij element (or diagonal)
-                    int i1 = LUmat->rowPointers[col];
-                    int i2 = LUmat->rowPointers[col+1];
-                    int kk = 0;
-                    for(kk = i1; kk < i2; ++kk) {
-                        ptrdiff_t c = LUmat->colIndices[kk];
-                        if (c >= row) {
-                            break;
-                        }
-                    }
+                    int i1 = LUmat->rowPointers[rowU1];
+                    int i2 = LUmat->rowPointers[rowU1+1];
+
+                    // search on row rowU1, find blockIndex in LUmat of block with same col (colU) as Uij
+                    // LUmat->nnzValues[kk] is block Aij
+                    auto candidate = std::find(LUmat->colIndices + i1, LUmat->colIndices + i2, colU);
+                    assert(candidate != LUmat->colIndices + i2);
+                    auto kk = candidate - LUmat->colIndices;
+
                     double aij[bs*bs];
+                    // copy block to Aij so operations can be done on it without affecting LUmat
                     memcpy(&aij[0], LUmat->nnzValues + kk * bs * bs, sizeof(double) * bs * bs);
-                    int jk = Lmat->rowPointers[col];
-                    int ik = (jk < Lmat->rowPointers[col+1]) ? Lmat->colIndices[jk] : Nb;
 
-                    for (int k = jColStart; k < ij; ++k) {
-                        int ki = Ut->colIndices[k];
-                        while (ik < ki) {
-                            ++jk;
-                            ik = Lmat->colIndices[jk];
-                        }
-                        if (ik == ki) {
-                            blockMultSub<bs>(&aij[0], Lmat->nnzValues + jk * bs * bs, Ut->nnzValues + k * bs * bs);
+                    int jk = Lmat->rowPointers[rowU1]; // points to row rowU1 in L
+                    // if row rowU1 is empty, skip row
+                    if (jk < Lmat->rowPointers[rowU1+1]) {
+                        int colL = Lmat->colIndices[jk];
+                        for (int k = jColStart; k < ij; ++k) {
+                            int rowU2 = Ut->colIndices[k];
+                            while (colL < rowU2) {
+                                ++jk; // check next block on row rowU1 of L
+                                colL = Lmat->colIndices[jk];
+                            }
+                            if (colL == rowU2) {
+                                // Aij -= (Lik * Ukj)
+                                blockMultSub<bs>(&aij[0], Lmat->nnzValues + jk * bs * bs, Ut->nnzValues + k * bs * bs);
+                            }
                         }
                     }
 
+                    // Uij_new = Aij - sum
                     memcpy(Utmp + ij * bs * bs, &aij[0], sizeof(double) * bs * bs);
                 }
 
                 // update L
+                // Lij = (Aij - sum k=1 to j-1 {Lik*Ukj}) / Ujj
                 int iRowStart = Lmat->rowPointers[row];
                 int iRowEnd = Lmat->rowPointers[row + 1];
 
                 for (int ij = iRowStart; ij < iRowEnd; ij++) {
                     int j = Lmat->colIndices[ij];
                     // refine Lij element
+                    // search on row 'row', find blockIndex in LUmat of block with same col (j) as Lij
+                    // LUmat->nnzValues[kk] is block Aij
                     int i1 = LUmat->rowPointers[row];
                     int i2 = LUmat->rowPointers[row+1];
-                    int kk = 0;
-                    for(kk = i1; kk < i2; ++kk) {
-                        ptrdiff_t c = LUmat->colIndices[kk];
-                        if (c >= j) {
-                            break;
-                        }
-                    }
+
+                    auto candidate = std::find(LUmat->colIndices + i1, LUmat->colIndices + i2, j);
+                    assert(candidate != LUmat->colIndices + i2);
+                    auto kk = candidate - LUmat->colIndices;
+
                     double aij[bs*bs];
+                    // copy block to Aij so operations can be done on it without affecting LUmat
                     memcpy(&aij[0], LUmat->nnzValues + kk * bs * bs, sizeof(double) * bs * bs);
-                    int jk = Ut->rowPointers[j];
-                    int ik = Ut->colIndices[jk];
+
+                    int jk = Ut->rowPointers[j];  // actually colPointers, jk points to col j in U
+                    int rowU = Ut->colIndices[jk];  // actually rowIndices, rowU is the row of block jk
+                    // check if L has a matching block where colL == rowU
                     for (int k = iRowStart; k < ij; ++k) {
-                        int ki = Lmat->colIndices[k];
-                        while(ik < ki) {
-                            ++jk;
-                            ik = Ut->colIndices[jk];
+                        int colL = Lmat->colIndices[k];
+                        while (rowU < colL) {
+                            ++jk; // check next block on col j of U
+                            rowU = Ut->colIndices[jk];
                         }
 
-                        if(ik == ki) {
+                        if (rowU == colL) {
+                            // Aij -= (Lik * Ukj)
                             blockMultSub<bs>(&aij[0], Lmat->nnzValues + k * bs * bs , Ut->nnzValues + jk * bs * bs);
                         }
                     }
-                    // calculate aij / ujj
+
+                    // calculate (Aij - sum) / Ujj
                     double ujj[bs*bs];
                     inverter(Ut->nnzValues + (Ut->rowPointers[j+1] - 1) * bs * bs, &ujj[0]);
-                    // lij = aij / ujj
+                    // Lij_new = (Aij - sum) / Ujj
                     blockMult<bs>(&aij[0], &ujj[0], Ltmp + ij * bs * bs);
-
                 }
             }
             // 1st sweep writes to Ltmp

From 48ebef7808fbf60326a13cd03a1ede7dde16b81e Mon Sep 17 00:00:00 2001
From: tqiu <tqiu@bda-04.domain_not_set.invalid>
Date: Tue, 19 Jan 2021 11:06:39 +0100
Subject: [PATCH 07/14] Updated comments

---
 opm/simulators/linalg/bda/BILU0.cpp | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp
index b79964d65..b1b11ce01 100644
--- a/opm/simulators/linalg/bda/BILU0.cpp
+++ b/opm/simulators/linalg/bda/BILU0.cpp
@@ -185,7 +185,7 @@ namespace bda
             for (int ij = iRowStart; ij < iRowEnd; ij++) {
                 int j = LUmat->colIndices[ij];
                 if (i <= j) {
-                    Ut->rowPointers[j+1]++;   // actually colPointers
+                    Ut->rowPointers[j+1]++;   // actually colPointers, now simply indicates how many blocks this col holds
                 } else {
                     Lmat->colIndices[num_blocks_L] = j;
                     std::copy(LUmat->nnzValues + ij * bs * bs, LUmat->nnzValues + (ij+1) * bs * bs, Lmat->nnzValues + num_blocks_L * bs * bs);
@@ -196,7 +196,7 @@ namespace bda
             Lmat->rowPointers[i+1] = num_blocks_L;
         }
 
-        // prefix sum
+        // prefix sum to sum rowsizes into colpointers
         std::partial_sum(Ut->rowPointers, Ut->rowPointers+Nb+1, Ut->rowPointers);
 
         // for every row
@@ -207,7 +207,7 @@ namespace bda
             for (int ij = iRowStart; ij < iRowEnd; ij++) {
                 int j = LUmat->colIndices[ij];
                 if (i <= j){
-                    int idx = Ut->rowPointers[j]++;
+                    int idx = Ut->rowPointers[j]++; // rowPointers[i] is (mis)used as the write offset of the current row i
                     Ut->colIndices[idx] = i;     // actually rowIndices
                     memcpy(Ut->nnzValues + idx * bs * bs, LUmat->nnzValues + ij * bs * bs, sizeof(double) * bs * bs);
                 }
@@ -216,6 +216,7 @@ namespace bda
 
         // rotate
         // the Ut->rowPointers were increased in the last loop
+        // now Ut->rowPointers[i+1] is at the same position as Ut->rowPointers[i] should have for a crs matrix. reset to correct expected value
         for (int i = Nb; i > 0; --i) {
             Ut->rowPointers[i] = Ut->rowPointers[i-1];
         }
@@ -223,7 +224,8 @@ namespace bda
 
         Opm::Detail::Inverter<bs> inverter;
 
-        // Utmp is needed for CPU and GPU decomposition, because U is transposed, and reversed at the end
+        // Utmp is needed for CPU and GPU decomposition, because U is transposed, and reversed after decomposition
+        // U will be reversed because it is used with backwards substitution, the last row is used first
         // Ltmp is only needed for CPU decomposition, GPU creates GPU buffer for Ltmp
         double *Utmp = new double[Ut->nnzbs * block_size * block_size];
 
@@ -273,6 +275,7 @@ namespace bda
                     // if row rowU1 is empty, skip row
                     if (jk < Lmat->rowPointers[rowU1+1]) {
                         int colL = Lmat->colIndices[jk];
+                        // for every block on colU
                         for (int k = jColStart; k < ij; ++k) {
                             int rowU2 = Ut->colIndices[k];
                             while (colL < rowU2) {
@@ -385,7 +388,7 @@ namespace bda
         std::rotate(ptr.begin(), ptr.end() - 1, ptr.end());
         ptr.front() = 0;
 
-        // reversing the rows of U, because that is the order they are used in
+        // reversing the rows of U, because that is the order they are used in during ILU apply
         int URowIndex = 0;
         int offsetU = 0;   // number of nnz blocks that are already copied to Umat
         Umat->rowPointers[0] = 0;

From dba20adf0424d5f7dbace67086b6454025a61a19 Mon Sep 17 00:00:00 2001
From: tqiu <tqiu@bda-04.domain_not_set.invalid>
Date: Wed, 20 Jan 2021 11:00:30 +0100
Subject: [PATCH 08/14] Bugfix: extra swap was done on odd number of sweeps

---
 opm/simulators/linalg/bda/BILU0.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp
index b1b11ce01..051f61fbf 100644
--- a/opm/simulators/linalg/bda/BILU0.cpp
+++ b/opm/simulators/linalg/bda/BILU0.cpp
@@ -343,8 +343,8 @@ namespace bda
             std::swap(Ut->nnzValues, Utmp);
         } // end sweep
 
-        // if number of sweeps is odd, swap again so data is in Lmat->nnzValues
-        if (num_sweeps % 2 == 1) {
+        // if number of sweeps is even, swap again so data is in Lmat->nnzValues
+        if (num_sweeps % 2 == 0) {
             std::swap(Lmat->nnzValues, Ltmp);
             std::swap(Ut->nnzValues, Utmp);
         }

From 1e09b1f4d90b3cff3e576e6b40d57df5ad162a1b Mon Sep 17 00:00:00 2001
From: tqiu <tqiu@bda-04.domain_not_set.invalid>
Date: Tue, 26 Jan 2021 13:45:40 +0100
Subject: [PATCH 09/14] Added comments and rewrote if

---
 opm/simulators/linalg/bda/BILU0.cpp  |   4 +-
 opm/simulators/linalg/bda/fgpilu.cpp | 173 +++++++++++++++++----------
 2 files changed, 113 insertions(+), 64 deletions(-)

diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp
index 051f61fbf..bdfdbe8b2 100644
--- a/opm/simulators/linalg/bda/BILU0.cpp
+++ b/opm/simulators/linalg/bda/BILU0.cpp
@@ -275,7 +275,7 @@ namespace bda
                     // if row rowU1 is empty, skip row
                     if (jk < Lmat->rowPointers[rowU1+1]) {
                         int colL = Lmat->colIndices[jk];
-                        // for every block on colU
+                        // only check until block U(i,j) is reached
                         for (int k = jColStart; k < ij; ++k) {
                             int rowU2 = Ut->colIndices[k];
                             while (colL < rowU2) {
@@ -316,7 +316,7 @@ namespace bda
 
                     int jk = Ut->rowPointers[j];  // actually colPointers, jk points to col j in U
                     int rowU = Ut->colIndices[jk];  // actually rowIndices, rowU is the row of block jk
-                    // check if L has a matching block where colL == rowU
+                    // only check until block L(i,j) is reached
                     for (int k = iRowStart; k < ij; ++k) {
                         int colL = Lmat->colIndices[k];
                         while (rowU < colL) {
diff --git a/opm/simulators/linalg/bda/fgpilu.cpp b/opm/simulators/linalg/bda/fgpilu.cpp
index 508fcafe0..70226f44f 100644
--- a/opm/simulators/linalg/bda/fgpilu.cpp
+++ b/opm/simulators/linalg/bda/fgpilu.cpp
@@ -45,6 +45,7 @@ namespace bda
 // PARALLEL 1 should be run with at least 32 workitems per workgroup.
 // The recommended number of workgroups for both options is Nb, which gives every row their own workgroup.
 // PARALLEL 0 is generally faster, despite not having parallelization.
+// only 3x3 blocks are supported
 
 #define PARALLEL 0
 
@@ -54,6 +55,8 @@ inline const char* fgpilu_sweep_s  = R"(
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
+// subtract blocks: a = a - b * c
+// the output block has 9 entries, each entry is calculated by 1 thread
 void blockMultSub(
     __local double * restrict a,
     __global const double * restrict b,
@@ -74,7 +77,8 @@ void blockMultSub(
     }
 }
 
-
+// multiply blocks: resMat = mat1 * mat2
+// the output block has 9 entries, each entry is calculated by 1 thread
 void blockMult(
     __local const double * restrict mat1,
     __local const double * restrict mat2,
@@ -95,13 +99,14 @@ void blockMult(
     }
 }
 
-
+// invert block: inverse = matrix^{-1}
+// the output block has 9 entries, each entry is calculated by 1 thread
 void invert(
     __global const double * restrict matrix,
     __local double * restrict inverse)
 {
     const unsigned int block_size = 3;
-    const unsigned int bs = block_size;
+    const unsigned int bs = block_size;                           // rename to shorter name
     const unsigned int warp_size = 32;
     const unsigned int idx_t = get_local_id(0);                   // thread id in work group
     const unsigned int thread_id_in_warp = idx_t % warp_size;     // thread id in warp (32 threads)
@@ -128,6 +133,10 @@ void invert(
     }
 }
 
+// perform the fixed-point iteration
+// all entries in L and U are updated once
+// output is written to [LU]tmp
+// aij and ujj are local arrays whose size is specified before kernel launch
 __kernel void fgpilu_sweep(
     __global const double * restrict Ut_vals,
     __global const double * restrict L_vals,
@@ -145,26 +154,30 @@ __kernel void fgpilu_sweep(
     __local double *ujj)
 {
     const int bs = 3;
-
-    // for every row
     const unsigned int warp_size = 32;
-    const unsigned int bsize = get_local_size(0);
-    const unsigned int idx_b = get_global_id(0) / bsize;
+    const unsigned int work_group_size = get_local_size(0);
+    const unsigned int idx_b = get_global_id(0) / work_group_size;
     const unsigned int num_groups = get_num_groups(0);
-    const unsigned int warps_per_group = bsize / warp_size;
+    const unsigned int warps_per_group = work_group_size / warp_size;
     const unsigned int idx_t = get_local_id(0);                   // thread id in work group
     const unsigned int thread_id_in_warp = idx_t % warp_size;     // thread id in warp (32 threads)
     const unsigned int warp_id_in_group = idx_t / warp_size;
-    const unsigned int lmem_offset = warp_id_in_group * bs * bs;            // each workgroup gets some lmem, but the workitems have to share it
+    const unsigned int lmem_offset = warp_id_in_group * bs * bs;  // each workgroup gets some lmem, but the workitems have to share it
+                                                                  // every workitem in a warp has the same lmem_offset
+
+    // for every row of L or every col of U
     for (int row = idx_b; row < Nb; row+=num_groups) {
-        int jColStart = Ut_rows[row];
+        // Uij = (Aij - sum k=1 to i-1 {Lik*Ukj})
+        int jColStart = Ut_rows[row];    // actually colPointers to U
         int jColEnd = Ut_rows[row + 1];
+        // for every block on this column
         for (int ij = jColStart + warp_id_in_group; ij < jColEnd; ij+=warps_per_group) {
-            int col = Ut_cols[ij];
+            int rowU1 = Ut_cols[ij]; // actually rowIndices for U
             // refine Uij element (or diagonal)
-            int i1 = LU_rows[col];
-            int i2 = LU_rows[col+1];
+            int i1 = LU_rows[rowU1];
+            int i2 = LU_rows[rowU1+1];
             int kk = 0;
+            // LUmat->nnzValues[kk] is block Aij
             for(kk = i1; kk < i2; ++kk) {
                 int c = LU_cols[kk];
                 if (c >= row) {
@@ -172,40 +185,48 @@ __kernel void fgpilu_sweep(
                 }
             }
 
+            // copy block Aij so operations can be done on it without affecting LUmat
             if(thread_id_in_warp < bs*bs){
                 aij[lmem_offset+thread_id_in_warp] = LU_vals[kk*bs*bs + thread_id_in_warp];
             }
 
-            int jk = L_rows[col];
-            int ik = (jk < L_rows[col+1]) ? L_cols[jk] : Nb;
-
-            for (int k = jColStart; k < ij; ++k) {
-                int ki = Ut_cols[k];
-                while (ik < ki) {
-                    ++jk;
-                    ik = L_cols[jk];
-                }
-                if (ik == ki) {
-                    blockMultSub(aij+lmem_offset, L_vals + jk * bs * bs, Ut_vals + k * bs * bs);
+            int jk = L_rows[rowU1]; // points to row rowU1 in L
+            // if row rowU1 is empty: skip row. The whole warp looks at the same row, so no divergence
+            if (jk < L_rows[rowU1+1]) {
+                int colL = L_cols[jk];
+                // only check until block U(i,j) is reached
+                for (int k = jColStart; k < ij; ++k) {
+                    int rowU2 = Ut_cols[k]; // actually rowIndices for U
+                    while (colL < rowU2) {
+                        ++jk; // check next block on row rowU1 of L
+                        colL = L_cols[jk];
+                    }
+                    if (colL == rowU2) {
+                        // Aij -= (Lik * Ukj)
+                        blockMultSub(aij+lmem_offset, L_vals + jk * bs * bs, Ut_vals + k * bs * bs);
+                    }
                 }
             }
 
+            // Uij_new = Aij - sum
+            // write result of this sweep
             if(thread_id_in_warp < bs*bs){
                 Utmp[ij*bs*bs + thread_id_in_warp] = aij[lmem_offset + thread_id_in_warp];
             }
         }
 
         // update L
+        // Lij = (Aij - sum k=1 to j-1 {Lik*Ukj}) / Ujj
         int iRowStart = L_rows[row];
         int iRowEnd = L_rows[row + 1];
 
         for (int ij = iRowStart + warp_id_in_group; ij < iRowEnd; ij+=warps_per_group) {
-        // for (int ij = iRowStart + idx_t; ij < iRowEnd; ij+=bsize) {
             int j = L_cols[ij];
             // // refine Lij element
             int i1 = LU_rows[row];
             int i2 = LU_rows[row+1];
             int kk = 0;
+            // LUmat->nnzValues[kk] is block Aij
             for(kk = i1; kk < i2; ++kk) {
                 int c = LU_cols[kk];
                 if (c >= j) {
@@ -213,28 +234,32 @@ __kernel void fgpilu_sweep(
                 }
             }
 
+            // copy block Aij so operations can be done on it without affecting LUmat
             if(thread_id_in_warp < bs*bs){
                 aij[lmem_offset+thread_id_in_warp] = LU_vals[kk*bs*bs + thread_id_in_warp];
             }
 
-            int jk = Ut_rows[j];
-            int ik = Ut_cols[jk];
+            int jk = Ut_rows[j];    // actually colPointers, jk points to col j in U
+            int rowU = Ut_cols[jk]; // actually rowIndices, rowU is the row of block jk
+            // only check until block L(i,j) is reached
             for (int k = iRowStart; k < ij; ++k) {
-                int ki = L_cols[k];
-                while(ik < ki) {
-                    ++jk;
-                    ik = Ut_cols[jk];
+                int colL = L_cols[k];
+                while(rowU < colL) {
+                    ++jk; // check next block on col j of U
+                    rowU = Ut_cols[jk];
                 }
 
-                if(ik == ki) {
+                if(rowU == colL) {
+                    // Aij -= (Lik * Ukj)
                     blockMultSub(aij+lmem_offset, L_vals + k * bs * bs , Ut_vals + jk * bs * bs);
                 }
             }
 
-            // calculate aij / ujj
+            // calculate 1 / Ujj
             invert(Ut_vals + (Ut_rows[j+1] - 1) * bs * bs, ujj+lmem_offset);
 
-            // lij = aij / ujj
+            // Lij_new = (Aij - sum) / Ujj
+            // write result of this sweep
             blockMult(aij+lmem_offset, ujj+lmem_offset, Ltmp + ij * bs * bs);
         }
     }
@@ -247,6 +272,8 @@ inline const char* fgpilu_sweep_s  = R"(
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
+// subtract blocks: a = a - b * c
+// only one workitem performs this action
 void blockMultSub(
     __local double * restrict a,
     __global const double * restrict b,
@@ -264,7 +291,8 @@ void blockMultSub(
     }
 }
 
-
+// multiply blocks: resMat = mat1 * mat2
+// only one workitem performs this action
 void blockMult(
     __local const double * restrict mat1,
     __local const double * restrict mat2,
@@ -282,7 +310,8 @@ void blockMult(
     }
 }
 
-
+// invert block: inverse = matrix^{-1}
+// only one workitem performs this action
 __kernel void inverter(
     __global const double * restrict matrix,
     __local double * restrict inverse)
@@ -310,6 +339,10 @@ __kernel void inverter(
     inverse[8] =  (t4 - t8) * t17;
 }
 
+// perform the fixed-point iteration
+// all entries in L and U are updated once
+// output is written to [LU]tmp
+// aij and ujj are local arrays whose size is specified before kernel launch
 __kernel void fgpilu_sweep(
     __global const double * restrict Ut_vals,
     __global const double * restrict L_vals,
@@ -328,24 +361,28 @@ __kernel void fgpilu_sweep(
 {
     const int bs = 3;
 
-    // for every row
     const unsigned int warp_size = 32;
-    const unsigned int bsize = get_local_size(0);
-    const unsigned int idx_b = get_global_id(0) / bsize;
+    const unsigned int work_group_size = get_local_size(0);
+    const unsigned int idx_b = get_global_id(0) / work_group_size;
     const unsigned int num_groups = get_num_groups(0);
-    const unsigned int warps_per_group = bsize / warp_size;
+    const unsigned int warps_per_group = work_group_size / warp_size;
     const unsigned int idx_t = get_local_id(0);                   // thread id in work group
     const unsigned int thread_id_in_warp = idx_t % warp_size;     // thread id in warp (32 threads)
     const unsigned int warp_id_in_group = idx_t / warp_size;
+
+    // for every row of L or every col of U
     for (int row = idx_b; row < Nb; row+=num_groups) {
-        int jColStart = Ut_rows[row];
+        // Uij = (Aij - sum k=1 to i-1 {Lik*Ukj})
+        int jColStart = Ut_rows[row];    // actually colPointers to U
         int jColEnd = Ut_rows[row + 1];
-        for (int ij = jColStart + idx_t; ij < jColEnd; ij+=bsize) {
-            int col = Ut_cols[ij];
+        // for every block on this column
+        for (int ij = jColStart + idx_t; ij < jColEnd; ij+=work_group_size) {
+            int rowU1 = Ut_cols[ij]; // actually rowIndices for U
             // refine Uij element (or diagonal)
-            int i1 = LU_rows[col];
-            int i2 = LU_rows[col+1];
+            int i1 = LU_rows[rowU1];
+            int i2 = LU_rows[rowU1+1];
             int kk = 0;
+            // LUmat->nnzValues[kk] is block Aij
             for(kk = i1; kk < i2; ++kk) {
                 int c = LU_cols[kk];
                 if (c >= row) {
@@ -353,39 +390,47 @@ __kernel void fgpilu_sweep(
                 }
             }
 
+            // copy block Aij so operations can be done on it without affecting LUmat
             for(int z = 0; z < bs*bs; ++z){
                 aij[idx_t*bs*bs+z] = LU_vals[kk*bs*bs + z];
             }
 
-            int jk = L_rows[col];
-            int ik = (jk < L_rows[col+1]) ? L_cols[jk] : Nb;
+            int jk = L_rows[rowU1];
+            // if row rowU1 is empty: do not sum. The workitems have different rowU1 values, divergence is possible
+            int colL = (jk < L_rows[rowU1+1]) ? L_cols[jk] : Nb;
 
+            // only check until block U(i,j) is reached
             for (int k = jColStart; k < ij; ++k) {
-                int ki = Ut_cols[k];
-                while (ik < ki) {
-                    ++jk;
-                    ik = L_cols[jk];
+                int rowU2 = Ut_cols[k]; // actually rowIndices for U
+                while (colL < rowU2) {
+                    ++jk; // check next block on row rowU1 of L
+                    colL = L_cols[jk];
                 }
-                if (ik == ki) {
+                if (colL == rowU2) {
+                    // Aij -= (Lik * Ukj)
                     blockMultSub(aij+idx_t*bs*bs, L_vals + jk * bs * bs, Ut_vals + k * bs * bs);
                 }
             }
 
+            // Uij_new = Aij - sum
+            // write result of this sweep
             for(int z = 0; z < bs*bs; ++z){
                 Utmp[ij*bs*bs + z] = aij[idx_t*bs*bs+z];
             }
         }
 
         // update L
+        // Lij = (Aij - sum k=1 to j-1 {Lik*Ukj}) / Ujj
         int iRowStart = L_rows[row];
         int iRowEnd = L_rows[row + 1];
 
-        for (int ij = iRowStart + idx_t; ij < iRowEnd; ij+=bsize) {
+        for (int ij = iRowStart + idx_t; ij < iRowEnd; ij+=work_group_size) {
             int j = L_cols[ij];
             // // refine Lij element
             int i1 = LU_rows[row];
             int i2 = LU_rows[row+1];
             int kk = 0;
+            // LUmat->nnzValues[kk] is block Aij
             for(kk = i1; kk < i2; ++kk) {
                 int c = LU_cols[kk];
                 if (c >= j) {
@@ -393,27 +438,31 @@ __kernel void fgpilu_sweep(
                 }
             }
 
+            // copy block Aij so operations can be done on it without affecting LUmat
             for(int z = 0; z < bs*bs; ++z){
                 aij[idx_t*bs*bs+z] = LU_vals[kk*bs*bs + z];
             }
 
-            int jk = Ut_rows[j];
-            int ik = Ut_cols[jk];
+            int jk = Ut_rows[j];    // actually colPointers, jk points to col j in U
+            int rowU = Ut_cols[jk]; // actually rowIndices, rowU is the row of block jk
+            // only check until block L(i,j) is reached
             for (int k = iRowStart; k < ij; ++k) {
-                int ki = L_cols[k];
-                while(ik < ki) {
-                    ++jk;
-                    ik = Ut_cols[jk];
+                int colL = L_cols[k];
+                while(rowU < colL) {
+                    ++jk; // check next block on col j of U
+                    rowU = Ut_cols[jk];
                 }
 
-                if(ik == ki) {
+                if(rowU == colL) {
+                    // Aij -= (Lik * Ukj)
                     blockMultSub(aij+idx_t*bs*bs, L_vals + k * bs * bs , Ut_vals + jk * bs * bs);
                 }
             }
-            // calculate aij / ujj
+            // calculate 1 / ujj
             inverter(Ut_vals + (Ut_rows[j+1] - 1) * bs * bs, ujj+idx_t*bs*bs);
 
-            // lij = aij / ujj
+            // Lij_new = (Aij - sum) / Ujj
+            // write result of this sweep
             blockMult(aij+idx_t*bs*bs, ujj+idx_t*bs*bs, Ltmp + ij * bs * bs);
         }
     }

From 837a83fc16b601d4abc68aa859e58c9abdf04430 Mon Sep 17 00:00:00 2001
From: tqiu <tqiu@bda-04.domain_not_set.invalid>
Date: Tue, 26 Jan 2021 17:32:18 +0100
Subject: [PATCH 10/14] Added error checking and std::call_once

---
 opm/simulators/linalg/bda/fgpilu.cpp | 13 +++++++++----
 opm/simulators/linalg/bda/fgpilu.hpp |  6 +++---
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/opm/simulators/linalg/bda/fgpilu.cpp b/opm/simulators/linalg/bda/fgpilu.cpp
index 70226f44f..d57ca981a 100644
--- a/opm/simulators/linalg/bda/fgpilu.cpp
+++ b/opm/simulators/linalg/bda/fgpilu.cpp
@@ -487,9 +487,13 @@ void FGPILU::decomposition(
     const int block_size = 3;
 
     try {
-        if (initialized == false) {
+        // just put everything in the capture list
+        std::call_once(initialize_flag, [&](){
             cl::Program::Sources source(1, std::make_pair(fgpilu_sweep_s, strlen(fgpilu_sweep_s)));  // what does this '1' mean? cl::Program::Sources is of type 'std::vector<std::pair<const char*, long unsigned int> >'
             cl::Program program = cl::Program(*context, source, &err);
+            if (err != CL_SUCCESS) {
+                OPM_THROW(std::logic_error, "FGPILU OpenCL could not create Program");
+            }
 
             std::vector<cl::Device> devices = context->getInfo<CL_CONTEXT_DEVICES>();
             program.build(devices);
@@ -499,6 +503,9 @@ void FGPILU::decomposition(
                                                      cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                                      cl::Buffer&, cl::Buffer&,
                                                      const int, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "fgpilu_sweep", &err)));
+            if (err != CL_SUCCESS) {
+                OPM_THROW(std::logic_error, "FGPILU OpenCL could not create Kernel");
+            }
 
             // allocate GPU memory
             d_Ut_vals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * Ut_nnzbs * block_size * block_size);
@@ -530,9 +537,7 @@ void FGPILU::decomposition(
                 out << "FGPILU PARALLEL: " << PARALLEL;
                 OpmLog::info(out.str());
             }
-
-            initialized = true;
-        }
+        });
 
 
         // copy to GPU
diff --git a/opm/simulators/linalg/bda/fgpilu.hpp b/opm/simulators/linalg/bda/fgpilu.hpp
index d51c21061..880db43b2 100644
--- a/opm/simulators/linalg/bda/fgpilu.hpp
+++ b/opm/simulators/linalg/bda/fgpilu.hpp
@@ -21,8 +21,9 @@
 #define FGPILU_HEADER_INCLUDED
 
 
-#include <opm/simulators/linalg/bda/opencl.hpp>
+#include <mutex>
 
+#include <opm/simulators/linalg/bda/opencl.hpp>
 
 
 namespace bda
@@ -43,6 +44,7 @@ namespace bda
 
         cl::Event event;
         cl_int err;
+        std::once_flag initialize_flag;
 
         std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                         cl::Buffer&, cl::Buffer&, cl::Buffer&,
@@ -50,8 +52,6 @@ namespace bda
                                         cl::Buffer&, cl::Buffer&,
                                         const int, cl::LocalSpaceArg, cl::LocalSpaceArg> > fgpilu_sweep_k;
 
-        bool initialized = false;
-
     public:
         /// Executes the FGPILU sweeps
         /// also copies data from CPU to GPU and GPU to CPU

From d222dffcfdb7d37284007d6398caedae5f5eaf7c Mon Sep 17 00:00:00 2001
From: tqiu <tqiu@bda-04.domain_not_set.invalid>
Date: Thu, 28 Jan 2021 13:46:01 +0100
Subject: [PATCH 11/14] Improved initial guess of L

---
 opm/simulators/linalg/bda/BILU0.cpp | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp
index bdfdbe8b2..d66ea05ab 100644
--- a/opm/simulators/linalg/bda/BILU0.cpp
+++ b/opm/simulators/linalg/bda/BILU0.cpp
@@ -169,15 +169,32 @@ namespace bda
         // Ut stores diagonal for now
 
         // Ut is actually BSC format
-        std::unique_ptr<BlockedMatrix<bs> > Ut = std::make_unique<BlockedMatrix<bs> >(Umat->Nb, Umat->nnzbs + Umat->Nb);
+        std::unique_ptr<BlockedMatrix<bs> > Ut = std::make_unique<BlockedMatrix<bs> >(Nb, (nnzbs + Nb) / 2);
 
         Lmat->rowPointers[0] = 0;
         for (int i = 0; i < Nb+1; i++) {
             Ut->rowPointers[i] = 0;
         }
 
+        Opm::Detail::Inverter<bs> inverter;
+
+        // store inverted diagonal
+        for (int i = 0; i < Nb; i++) {
+            int iRowStart = LUmat->rowPointers[i];
+            int iRowEnd = LUmat->rowPointers[i + 1];
+            // for every block in this row
+            for (int ij = iRowStart; ij < iRowEnd; ij++) {
+                int j = LUmat->colIndices[ij];
+                if (i == j) {
+                    inverter(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs);
+                }
+            }
+        }
+
+        // initialize initial guess for L: L_A * D
+        // L_A is strictly lower triangular part of A
+        // D is inv(diag(A))
         int num_blocks_L = 0;
-        // for every row
         for (int i = 0; i < Nb; i++) {
             int iRowStart = LUmat->rowPointers[i];
             int iRowEnd = LUmat->rowPointers[i + 1];
@@ -188,7 +205,8 @@ namespace bda
                     Ut->rowPointers[j+1]++;   // actually colPointers, now simply indicates how many blocks this col holds
                 } else {
                     Lmat->colIndices[num_blocks_L] = j;
-                    std::copy(LUmat->nnzValues + ij * bs * bs, LUmat->nnzValues + (ij+1) * bs * bs, Lmat->nnzValues + num_blocks_L * bs * bs);
+                    // multiply block of L with corresponding diag block
+                    blockMult<bs>(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, Lmat->nnzValues + num_blocks_L * bs * bs);
                     num_blocks_L++;
                 }
             }
@@ -199,7 +217,7 @@ namespace bda
         // prefix sum to sum rowsizes into colpointers
         std::partial_sum(Ut->rowPointers, Ut->rowPointers+Nb+1, Ut->rowPointers);
 
-        // for every row
+        // initialize initial guess for U
         for (int i = 0; i < Nb; i++) {
             int iRowStart = LUmat->rowPointers[i];
             int iRowEnd = LUmat->rowPointers[i + 1];
@@ -222,7 +240,6 @@ namespace bda
         }
         Ut->rowPointers[0] = 0;
 
-        Opm::Detail::Inverter<bs> inverter;
 
         // Utmp is needed for CPU and GPU decomposition, because U is transposed, and reversed after decomposition
         // U will be reversed because it is used with backwards substitution, the last row is used first

From 363cc3131625a4a8130c549c8650e91147c8f891 Mon Sep 17 00:00:00 2001
From: tqiu <tqiu@bda-04.domain_not_set.invalid>
Date: Thu, 28 Jan 2021 15:40:10 +0100
Subject: [PATCH 12/14] Made waiting for a GPUcopy more explicit

---
 opm/simulators/linalg/bda/fgpilu.cpp | 38 +++++++++++++++++-----------
 opm/simulators/linalg/bda/fgpilu.hpp |  1 +
 2 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/opm/simulators/linalg/bda/fgpilu.cpp b/opm/simulators/linalg/bda/fgpilu.cpp
index d57ca981a..ea009c1cd 100644
--- a/opm/simulators/linalg/bda/fgpilu.cpp
+++ b/opm/simulators/linalg/bda/fgpilu.cpp
@@ -521,12 +521,15 @@ void FGPILU::decomposition(
             d_Utmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * Ut_nnzbs * block_size * block_size);
 
             Dune::Timer t_copy_pattern;
-            err |= queue->enqueueWriteBuffer(d_Ut_ptrs, CL_TRUE, 0, sizeof(int) * (Nb+1), Ut_ptrs);
-            err |= queue->enqueueWriteBuffer(d_L_rows, CL_TRUE, 0, sizeof(int) * (Nb+1), L_rows);
-            err |= queue->enqueueWriteBuffer(d_LU_rows, CL_TRUE, 0, sizeof(int) * (Nb+1), LU_rows);
-            err |= queue->enqueueWriteBuffer(d_Ut_idxs, CL_TRUE, 0, sizeof(int) * Ut_nnzbs, Ut_idxs);
-            err |= queue->enqueueWriteBuffer(d_L_cols, CL_TRUE, 0, sizeof(int) * L_nnzbs, L_cols);
-            err |= queue->enqueueWriteBuffer(d_LU_cols, CL_TRUE, 0, sizeof(int) * LU_nnzbs, LU_cols);
+            events.resize(6);
+            err |= queue->enqueueWriteBuffer(d_Ut_ptrs, CL_FALSE, 0, sizeof(int) * (Nb+1), Ut_ptrs, nullptr, &events[0]);
+            err |= queue->enqueueWriteBuffer(d_L_rows, CL_FALSE, 0, sizeof(int) * (Nb+1), L_rows, nullptr, &events[1]);
+            err |= queue->enqueueWriteBuffer(d_LU_rows, CL_FALSE, 0, sizeof(int) * (Nb+1), LU_rows, nullptr, &events[2]);
+            err |= queue->enqueueWriteBuffer(d_Ut_idxs, CL_FALSE, 0, sizeof(int) * Ut_nnzbs, Ut_idxs, nullptr, &events[3]);
+            err |= queue->enqueueWriteBuffer(d_L_cols, CL_FALSE, 0, sizeof(int) * L_nnzbs, L_cols, nullptr, &events[4]);
+            err |= queue->enqueueWriteBuffer(d_LU_cols, CL_FALSE, 0, sizeof(int) * LU_nnzbs, LU_cols, nullptr, &events[5]);
+            cl::WaitForEvents(events);
+            events.clear();
             if (verbosity >= 3){
                 std::ostringstream out;
                 out << "FGPILU copy sparsity pattern time: " << t_copy_pattern.stop() << " s";
@@ -542,9 +545,12 @@ void FGPILU::decomposition(
 
         // copy to GPU
         Dune::Timer t_copy1;
-        err = queue->enqueueWriteBuffer(d_Ut_vals, CL_TRUE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals);
-        err |= queue->enqueueWriteBuffer(d_L_vals, CL_TRUE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals);
-        err |= queue->enqueueWriteBuffer(d_LU_vals, CL_TRUE, 0, sizeof(double) * LU_nnzbs * block_size * block_size, LU_vals);
+        events.resize(3);
+        err = queue->enqueueWriteBuffer(d_Ut_vals, CL_FALSE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals, nullptr, &events[0]);
+        err |= queue->enqueueWriteBuffer(d_L_vals, CL_FALSE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals, nullptr, &events[1]);
+        err |= queue->enqueueWriteBuffer(d_LU_vals, CL_FALSE, 0, sizeof(double) * LU_nnzbs * block_size * block_size, LU_vals, nullptr, &events[2]);
+        cl::WaitForEvents(events);
+        events.clear();
         if (verbosity >= 3){
             std::ostringstream out;
             out << "FGPILU copy1 time: " << t_copy1.stop() << " s";
@@ -582,21 +588,23 @@ void FGPILU::decomposition(
             event.wait();
             if (verbosity >= 3){
                 std::ostringstream out;
-                out << "FGPILU sweep kernel time: " << t_copy1.stop() << " s";
+                out << "FGPILU sweep kernel time: " << t_kernel.stop() << " s";
                 OpmLog::info(out.str());
             }
         }
 
         // copy back
         Dune::Timer t_copy2;
+        events.resize(2);
         if (num_sweeps % 2 == 0) {
-            err = queue->enqueueReadBuffer(d_Ut_vals, CL_TRUE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals);
-            err |= queue->enqueueReadBuffer(d_L_vals, CL_TRUE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals);
+            err = queue->enqueueReadBuffer(d_Ut_vals, CL_FALSE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals, nullptr, &events[0]);
+            err |= queue->enqueueReadBuffer(d_L_vals, CL_FALSE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals, nullptr, &events[1]);
         } else {
-            err = queue->enqueueReadBuffer(d_Utmp, CL_TRUE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals);
-            err |= queue->enqueueReadBuffer(d_Ltmp, CL_TRUE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals);
+            err = queue->enqueueReadBuffer(d_Utmp, CL_FALSE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals, nullptr, &events[0]);
+            err |= queue->enqueueReadBuffer(d_Ltmp, CL_FALSE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals, nullptr, &events[1]);
         }
-        err |= queue->enqueueBarrierWithWaitList();
+        cl::WaitForEvents(events);
+        events.clear();
         if (verbosity >= 3){
             std::ostringstream out;
             out << "FGPILU copy2 time: " << t_copy2.stop() << " s";
diff --git a/opm/simulators/linalg/bda/fgpilu.hpp b/opm/simulators/linalg/bda/fgpilu.hpp
index 880db43b2..b321e45b1 100644
--- a/opm/simulators/linalg/bda/fgpilu.hpp
+++ b/opm/simulators/linalg/bda/fgpilu.hpp
@@ -43,6 +43,7 @@ namespace bda
         cl::Buffer d_Ltmp, d_Utmp;
 
         cl::Event event;
+        std::vector<cl::Event> events;
         cl_int err;
         std::once_flag initialize_flag;
 

From a64a342104002640b160cf0427814bffa24c7b9a Mon Sep 17 00:00:00 2001
From: tqiu <tqiu@bda-04.domain_not_set.invalid>
Date: Mon, 1 Feb 2021 11:19:38 +0100
Subject: [PATCH 13/14] Added symmetry check in Debug mode

---
 opm/simulators/linalg/bda/BILU0.cpp | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp
index d66ea05ab..fc79ffed2 100644
--- a/opm/simulators/linalg/bda/BILU0.cpp
+++ b/opm/simulators/linalg/bda/BILU0.cpp
@@ -167,6 +167,34 @@ namespace bda
         // split matrix into L and U
         // also convert U into BSC format (Ut)
         // Ut stores diagonal for now
+        // original matrix LUmat is assumed to be symmetric
+
+#ifndef NDEBUG
+        // verify that matrix is symmetric
+        for (int i = 0; i < Nb; ++i){
+            int iRowStart = LUmat->rowPointers[i];
+            int iRowEnd = LUmat->rowPointers[i + 1];
+            // for every block (i, j) in this row, check if (j, i) also exists
+            for (int ij = iRowStart; ij < iRowEnd; ij++) {
+                int j = LUmat->colIndices[ij];
+                int jRowStart = LUmat->rowPointers[j];
+                int jRowEnd = LUmat->rowPointers[j + 1];
+                bool blockFound = false;
+                // check all blocks on row j
+                // binary search is possible
+                for (int ji = jRowStart; ji < jRowEnd; ji++) {
+                    int row = LUmat->colIndices[ji];
+                    if (i == row) {
+                        blockFound = true;
+                        break;
+                    }
+                }
+                if (false == blockFound) {
+                    OPM_THROW(std::logic_error, "Error sparsity pattern must be symmetric when using chow_patel_decomposition()");
+                }
+            }
+        }
+#endif
 
         // Ut is actually BSC format
         std::unique_ptr<BlockedMatrix<bs> > Ut = std::make_unique<BlockedMatrix<bs> >(Nb, (nnzbs + Nb) / 2);

From c8dca99fad615163b22f3857fde0c81b52fcdd1b Mon Sep 17 00:00:00 2001
From: tqiu <tqiu@bda-04.domain_not_set.invalid>
Date: Wed, 3 Feb 2021 17:43:54 +0100
Subject: [PATCH 14/14] Renamed fgpilu to ChowPatelIlu

---
 CMakeLists_files.cmake                        |  4 +-
 opm/simulators/linalg/bda/BILU0.cpp           |  4 +-
 opm/simulators/linalg/bda/BILU0.hpp           |  4 +-
 .../bda/{fgpilu.cpp => ChowPatelIlu.cpp}      | 39 +++++++++----------
 .../bda/{fgpilu.hpp => ChowPatelIlu.hpp}      | 12 +++---
 5 files changed, 31 insertions(+), 32 deletions(-)
 rename opm/simulators/linalg/bda/{fgpilu.cpp => ChowPatelIlu.cpp} (94%)
 rename opm/simulators/linalg/bda/{fgpilu.hpp => ChowPatelIlu.hpp} (93%)

diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake
index 70cb66a2a..5259d65b4 100644
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@@ -58,7 +58,7 @@ if(OPENCL_FOUND)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BlockedMatrix.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BILU0.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Reorder.cpp)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/fgpilu.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/ChowPatelIlu.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
@@ -169,7 +169,7 @@ list (APPEND PUBLIC_HEADER_FILES
   opm/simulators/linalg/bda/BlockedMatrix.hpp
   opm/simulators/linalg/bda/cuda_header.hpp
   opm/simulators/linalg/bda/cusparseSolverBackend.hpp
-  opm/simulators/linalg/bda/fgpilu.hpp
+  opm/simulators/linalg/bda/ChowPatelIlu.hpp
   opm/simulators/linalg/bda/Reorder.hpp
   opm/simulators/linalg/bda/ILUReorder.hpp
   opm/simulators/linalg/bda/opencl.hpp
diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp
index fc79ffed2..991dc14b1 100644
--- a/opm/simulators/linalg/bda/BILU0.cpp
+++ b/opm/simulators/linalg/bda/BILU0.cpp
@@ -25,7 +25,7 @@
 
 #include <opm/simulators/linalg/bda/BdaSolver.hpp>
 #include <opm/simulators/linalg/bda/BILU0.hpp>
-#include <opm/simulators/linalg/bda/fgpilu.hpp>
+#include <opm/simulators/linalg/bda/ChowPatelIlu.hpp>
 #include <opm/simulators/linalg/bda/Reorder.hpp>
 
 
@@ -276,7 +276,7 @@ namespace bda
 
         // actual ILU decomposition
 #if CHOW_PATEL_GPU
-        fgpilu.decomposition(queue, context,
+        chowPatelIlu.decomposition(queue, context,
                     Ut->rowPointers, Ut->colIndices, Ut->nnzValues, Ut->nnzbs,
                     Lmat->rowPointers, Lmat->colIndices, Lmat->nnzValues, Lmat->nnzbs,
                     LUmat->rowPointers, LUmat->colIndices, LUmat->nnzValues, LUmat->nnzbs,
diff --git a/opm/simulators/linalg/bda/BILU0.hpp b/opm/simulators/linalg/bda/BILU0.hpp
index 5a31c57a9..67d945851 100644
--- a/opm/simulators/linalg/bda/BILU0.hpp
+++ b/opm/simulators/linalg/bda/BILU0.hpp
@@ -24,7 +24,7 @@
 #include <opm/simulators/linalg/bda/ILUReorder.hpp>
 
 #include <opm/simulators/linalg/bda/opencl.hpp>
-#include <opm/simulators/linalg/bda/fgpilu.hpp>
+#include <opm/simulators/linalg/bda/ChowPatelIlu.hpp>
 
 namespace bda
 {
@@ -68,7 +68,7 @@ namespace bda
         int lmem_per_work_group = 0;
         bool pattern_uploaded = false;
 
-        FGPILU fgpilu;
+        ChowPatelIlu chowPatelIlu;
 
         void chow_patel_decomposition();
 
diff --git a/opm/simulators/linalg/bda/fgpilu.cpp b/opm/simulators/linalg/bda/ChowPatelIlu.cpp
similarity index 94%
rename from opm/simulators/linalg/bda/fgpilu.cpp
rename to opm/simulators/linalg/bda/ChowPatelIlu.cpp
index ea009c1cd..894b3f05c 100644
--- a/opm/simulators/linalg/bda/fgpilu.cpp
+++ b/opm/simulators/linalg/bda/ChowPatelIlu.cpp
@@ -22,7 +22,7 @@
 #include <opm/common/ErrorMacros.hpp>
 #include <dune/common/timer.hh>
 
-#include <opm/simulators/linalg/bda/fgpilu.hpp>
+#include <opm/simulators/linalg/bda/ChowPatelIlu.hpp>
 
 namespace bda
 {
@@ -50,8 +50,7 @@ namespace bda
 #define PARALLEL 0
 
 #if PARALLEL
-
-inline const char* fgpilu_sweep_s  = R"(
+inline const char* chow_patel_ilu_sweep_s  = R"(
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
@@ -137,7 +136,7 @@ void invert(
 // all entries in L and U are updated once
 // output is written to [LU]tmp
 // aij and ujj are local arrays whose size is specified before kernel launch
-__kernel void fgpilu_sweep(
+__kernel void chow_patel_ilu_sweep(
     __global const double * restrict Ut_vals,
     __global const double * restrict L_vals,
     __global const double * restrict LU_vals,
@@ -268,7 +267,7 @@ __kernel void fgpilu_sweep(
 
 #else
 
-inline const char* fgpilu_sweep_s  = R"(
+inline const char* chow_patel_ilu_sweep_s  = R"(
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
@@ -343,7 +342,7 @@ __kernel void inverter(
 // all entries in L and U are updated once
 // output is written to [LU]tmp
 // aij and ujj are local arrays whose size is specified before kernel launch
-__kernel void fgpilu_sweep(
+__kernel void chow_patel_ilu_sweep(
     __global const double * restrict Ut_vals,
     __global const double * restrict L_vals,
     __global const double * restrict LU_vals,
@@ -477,7 +476,7 @@ __kernel void fgpilu_sweep(
 
 
 
-void FGPILU::decomposition(
+void ChowPatelIlu::decomposition(
     cl::CommandQueue *queue, cl::Context *context,
     int *Ut_ptrs, int *Ut_idxs, double *Ut_vals, int Ut_nnzbs,
     int *L_rows, int *L_cols, double *L_vals, int L_nnzbs,
@@ -489,22 +488,22 @@ void FGPILU::decomposition(
     try {
         // just put everything in the capture list
         std::call_once(initialize_flag, [&](){
-            cl::Program::Sources source(1, std::make_pair(fgpilu_sweep_s, strlen(fgpilu_sweep_s)));  // what does this '1' mean? cl::Program::Sources is of type 'std::vector<std::pair<const char*, long unsigned int> >'
+            cl::Program::Sources source(1, std::make_pair(chow_patel_ilu_sweep_s, strlen(chow_patel_ilu_sweep_s)));  // what does this '1' mean? cl::Program::Sources is of type 'std::vector<std::pair<const char*, long unsigned int> >'
             cl::Program program = cl::Program(*context, source, &err);
             if (err != CL_SUCCESS) {
-                OPM_THROW(std::logic_error, "FGPILU OpenCL could not create Program");
+                OPM_THROW(std::logic_error, "ChowPatelIlu OpenCL could not create Program");
             }
 
             std::vector<cl::Device> devices = context->getInfo<CL_CONTEXT_DEVICES>();
             program.build(devices);
 
-            fgpilu_sweep_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&,
+            chow_patel_ilu_sweep_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                                      cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                                      cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                                      cl::Buffer&, cl::Buffer&,
-                                                     const int, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "fgpilu_sweep", &err)));
+                                                     const int, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "chow_patel_ilu_sweep", &err)));
             if (err != CL_SUCCESS) {
-                OPM_THROW(std::logic_error, "FGPILU OpenCL could not create Kernel");
+                OPM_THROW(std::logic_error, "ChowPatelIlu OpenCL could not create Kernel");
             }
 
             // allocate GPU memory
@@ -532,12 +531,12 @@ void FGPILU::decomposition(
             events.clear();
             if (verbosity >= 3){
                 std::ostringstream out;
-                out << "FGPILU copy sparsity pattern time: " << t_copy_pattern.stop() << " s";
+                out << "ChowPatelIlu copy sparsity pattern time: " << t_copy_pattern.stop() << " s";
                 OpmLog::info(out.str());
             }
             if (verbosity >= 2){
                 std::ostringstream out;
-                out << "FGPILU PARALLEL: " << PARALLEL;
+                out << "ChowPatelIlu PARALLEL: " << PARALLEL;
                 OpmLog::info(out.str());
             }
         });
@@ -553,12 +552,12 @@ void FGPILU::decomposition(
         events.clear();
         if (verbosity >= 3){
             std::ostringstream out;
-            out << "FGPILU copy1 time: " << t_copy1.stop() << " s";
+            out << "ChowPatelIlu copy1 time: " << t_copy1.stop() << " s";
             OpmLog::info(out.str());
         }
         if (err != CL_SUCCESS) {
             // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
-            OPM_THROW(std::logic_error, "FGPILU OpenCL enqueueWriteBuffer error");
+            OPM_THROW(std::logic_error, "ChowPatelIlu OpenCL enqueueWriteBuffer error");
         }
 
         // call kernel
@@ -580,7 +579,7 @@ void FGPILU::decomposition(
             int total_work_items = num_work_groups * work_group_size;
             int lmem_per_work_group = work_group_size * block_size * block_size * sizeof(double);
             Dune::Timer t_kernel;
-            event = (*fgpilu_sweep_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
+            event = (*chow_patel_ilu_sweep_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
                 *Uarg1, *Larg1, d_LU_vals,
                 d_Ut_ptrs, d_L_rows, d_LU_rows,
                 d_Ut_idxs, d_L_cols, d_LU_cols,
@@ -588,7 +587,7 @@ void FGPILU::decomposition(
             event.wait();
             if (verbosity >= 3){
                 std::ostringstream out;
-                out << "FGPILU sweep kernel time: " << t_kernel.stop() << " s";
+                out << "ChowPatelIlu sweep kernel time: " << t_kernel.stop() << " s";
                 OpmLog::info(out.str());
             }
         }
@@ -607,12 +606,12 @@ void FGPILU::decomposition(
         events.clear();
         if (verbosity >= 3){
             std::ostringstream out;
-            out << "FGPILU copy2 time: " << t_copy2.stop() << " s";
+            out << "ChowPatelIlu copy2 time: " << t_copy2.stop() << " s";
             OpmLog::info(out.str());
         }
         if (err != CL_SUCCESS) {
             // enqueueReadBuffer is C and does not throw exceptions like C++ OpenCL
-            OPM_THROW(std::logic_error, "FGPILU OpenCL enqueueReadBuffer error");
+            OPM_THROW(std::logic_error, "ChowPatelIlu OpenCL enqueueReadBuffer error");
         }
 
     } catch (const cl::Error& error) {
diff --git a/opm/simulators/linalg/bda/fgpilu.hpp b/opm/simulators/linalg/bda/ChowPatelIlu.hpp
similarity index 93%
rename from opm/simulators/linalg/bda/fgpilu.hpp
rename to opm/simulators/linalg/bda/ChowPatelIlu.hpp
index b321e45b1..ff58de996 100644
--- a/opm/simulators/linalg/bda/fgpilu.hpp
+++ b/opm/simulators/linalg/bda/ChowPatelIlu.hpp
@@ -17,8 +17,8 @@
   along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
 
-#ifndef FGPILU_HEADER_INCLUDED
-#define FGPILU_HEADER_INCLUDED
+#ifndef CHOW_PATEL_ILU_HEADER_INCLUDED
+#define CHOW_PATEL_ILU_HEADER_INCLUDED
 
 
 #include <mutex>
@@ -33,7 +33,7 @@ namespace bda
     //     FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896
     // only blocksize == 3 is supported
     // decomposition() allocates the cl::Buffers on the first call, these are C++ objects that deallocate automatically
-    class FGPILU
+    class ChowPatelIlu
     {
     private:
         cl::Buffer d_Ut_vals, d_L_vals, d_LU_vals;
@@ -51,10 +51,10 @@ namespace bda
                                         cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                         cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                         cl::Buffer&, cl::Buffer&,
-                                        const int, cl::LocalSpaceArg, cl::LocalSpaceArg> > fgpilu_sweep_k;
+                                        const int, cl::LocalSpaceArg, cl::LocalSpaceArg> > chow_patel_ilu_sweep_k;
 
     public:
-        /// Executes the FGPILU sweeps
+        /// Executes the ChowPatelIlu sweeps
         /// also copies data from CPU to GPU and GPU to CPU
         /// \param[in] queue         OpenCL commandqueue
         /// \param[in] context       OpenCL context
@@ -84,4 +84,4 @@ namespace bda
 
 } // end namespace bda
 
-#endif // FGPILU_HEADER_INCLUDED
+#endif // CHOW_PATEL_ILU_HEADER_INCLUDED