From e0a4d271ea1ea45ce19d959c92ef16e0f2555b62 Mon Sep 17 00:00:00 2001
From: Tong Dong Qiu <tongdong.qiu@bigdataccelerate.com>
Date: Tue, 30 Nov 2021 16:05:58 +0100
Subject: [PATCH] Formatting changes

---
 opm/simulators/linalg/bda/BILU0.cpp | 412 ++++++++++++++--------------
 opm/simulators/linalg/bda/BILU0.hpp | 120 ++++----
 2 files changed, 267 insertions(+), 265 deletions(-)
diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp
index 332797d50..c61db9de2 100644
--- a/opm/simulators/linalg/bda/BILU0.cpp
+++ b/opm/simulators/linalg/bda/BILU0.cpp
@@ -47,6 +47,7 @@ BILU0<block_size>::BILU0(ILUReorder opencl_ilu_reorder_, int verbosity_) :
 #endif
 }
 
+
 template <unsigned int block_size>
 BILU0<block_size>::~BILU0()
 {
@@ -61,247 +62,248 @@ void BILU0<block_size>::init(int Nb, int nnzb, std::shared_ptr<cl::Context>& con
     queue = queue_.get();
 }
 
-    template <unsigned int block_size>
-    bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
-    {
-        const unsigned int bs = block_size;
 
-        this->N = mat->Nb * block_size;
-        this->Nb = mat->Nb;
-        this->nnz = mat->nnzbs * block_size * block_size;
-        this->nnzb = mat->nnzbs;
+template <unsigned int block_size>
+bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
+{
+    const unsigned int bs = block_size;
 
-        int *CSCRowIndices = nullptr;
-        int *CSCColPointers = nullptr;
+    this->N = mat->Nb * block_size;
+    this->Nb = mat->Nb;
+    this->nnz = mat->nnzbs * block_size * block_size;
+    this->nnzb = mat->nnzbs;
 
-        if (opencl_ilu_reorder == ILUReorder::NONE) {
-            LUmat = std::make_unique<BlockedMatrix>(*mat);
-        } else {
-            toOrder.resize(Nb);
-            fromOrder.resize(Nb);
-            CSCRowIndices = new int[nnzb];
-            CSCColPointers = new int[Nb + 1];
-            rmat = std::make_shared<BlockedMatrix>(mat->Nb, mat->nnzbs, block_size);
-            LUmat = std::make_unique<BlockedMatrix>(*rmat);
+    int *CSCRowIndices = nullptr;
+    int *CSCColPointers = nullptr;
 
-            Timer t_convert;
-            csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb);
-            if(verbosity >= 3){
-                std::ostringstream out;
-                out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
-                OpmLog::info(out.str());
-            }
+    if (opencl_ilu_reorder == ILUReorder::NONE) {
+        LUmat = std::make_unique<BlockedMatrix>(*mat);
+    } else {
+        toOrder.resize(Nb);
+        fromOrder.resize(Nb);
+        CSCRowIndices = new int[nnzb];
+        CSCColPointers = new int[Nb + 1];
+        rmat = std::make_shared<BlockedMatrix>(mat->Nb, mat->nnzbs, block_size);
+        LUmat = std::make_unique<BlockedMatrix>(*rmat);
+
+        Timer t_convert;
+        csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb);
+        if (verbosity >= 3) {
+            std::ostringstream out;
+            out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
+            OpmLog::info(out.str());
         }
+    }
 
-        Timer t_analysis;
-        std::ostringstream out;
-        if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
-            out << "BILU0 reordering strategy: " << "level_scheduling\n";
-            findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
-        } else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
-            out << "BILU0 reordering strategy: " << "graph_coloring\n";
-            findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
-        } else if (opencl_ilu_reorder == ILUReorder::NONE) {
-            out << "BILU0 reordering strategy: none\n";
-            // numColors = 1;
-            // rowsPerColor.emplace_back(Nb);
-            numColors = Nb;
-            for(int i = 0; i < Nb; ++i){
-                rowsPerColor.emplace_back(1);
-            }
-        } else {
-            OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
-        }
-        if(verbosity >= 1){
-            out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n";
+    Timer t_analysis;
+    std::ostringstream out;
+    if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
+        out << "BILU0 reordering strategy: " << "level_scheduling\n";
+        findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
+    } else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
+        out << "BILU0 reordering strategy: " << "graph_coloring\n";
+        findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
+    } else if (opencl_ilu_reorder == ILUReorder::NONE) {
+        out << "BILU0 reordering strategy: none\n";
+        // numColors = 1;
+        // rowsPerColor.emplace_back(Nb);
+        numColors = Nb;
+        for (int i = 0; i < Nb; ++i) {
+            rowsPerColor.emplace_back(1);
         }
+    } else {
+        OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
+    }
+    if (verbosity >= 1) {
+        out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n";
+    }
 #if CHOW_PATEL
-        out << "BILU0 CHOW_PATEL: " << CHOW_PATEL << ", CHOW_PATEL_GPU: " << CHOW_PATEL_GPU;
+    out << "BILU0 CHOW_PATEL: " << CHOW_PATEL << ", CHOW_PATEL_GPU: " << CHOW_PATEL_GPU;
 #endif
-        OpmLog::info(out.str());
+    OpmLog::info(out.str());
 
 
-        if (opencl_ilu_reorder != ILUReorder::NONE) {
-            delete[] CSCRowIndices;
-            delete[] CSCColPointers;
-        }
+    if (opencl_ilu_reorder != ILUReorder::NONE) {
+        delete[] CSCRowIndices;
+        delete[] CSCColPointers;
+    }
 
-        diagIndex.resize(mat->Nb);
-        invDiagVals = new double[mat->Nb * bs * bs];
+    diagIndex.resize(mat->Nb);
+    invDiagVals = new double[mat->Nb * bs * bs];
 
 #if CHOW_PATEL
-        Lmat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
-        Umat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
+    Lmat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
+    Umat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
 #endif
 
-        LUmat->nnzValues = new double[mat->nnzbs * bs * bs];
+    LUmat->nnzValues = new double[mat->nnzbs * bs * bs];
 
-        s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
-        s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1));
-        s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb);
+    s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
+    s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1));
+    s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb);
 #if CHOW_PATEL
-        s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
-        s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
-        s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
-        s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
-        s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
-        s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
+    s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
+    s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
+    s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
+    s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
+    s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
+    s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
 #else
-        s.LUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * LUmat->nnzbs);
-        s.LUcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->nnzbs);
-        s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1));
+    s.LUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * LUmat->nnzbs);
+    s.LUcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->nnzbs);
+    s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1));
 #endif
 
-        events.resize(2);
-        err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals, nullptr, &events[0]);
+    events.resize(2);
+    err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals, nullptr, &events[0]);
 
-        rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0
-        for (int i = 0; i < numColors; ++i) {
-            rowsPerColorPrefix[i+1] = rowsPerColorPrefix[i] + rowsPerColor[i];
-        }
-        err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]);
+    rowsPerColorPrefix.resize(numColors + 1); // resize value-initializes the new int entries to 0
+    for (int i = 0; i < numColors; ++i) {
+        rowsPerColorPrefix[i + 1] = rowsPerColorPrefix[i] + rowsPerColor[i];
+    }
+    err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]);
 
-        cl::WaitForEvents(events);
-        events.clear();
-        if (err != CL_SUCCESS) {
-            // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
-            OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
-        }
+    cl::WaitForEvents(events);
+    events.clear();
+    if (err != CL_SUCCESS) {
+        // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
+        OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
+    }
 
     return true;
 } // end init()
 
 
+template <unsigned int block_size>
+bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
+{
+    const unsigned int bs = block_size;
+    auto *m = mat;
 
-    template <unsigned int block_size>
-    bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
-    {
-        const unsigned int bs = block_size;
-        auto *m = mat;
-
-        if (opencl_ilu_reorder != ILUReorder::NONE) {
-            m = rmat.get();
-            Timer t_reorder;
-            reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rmat.get());
-
-            if (verbosity >= 3){
-                std::ostringstream out;
-                out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
-                OpmLog::info(out.str());
-            }
-        }
-
-        // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
-        // this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy
-        Timer t_copy;
-        memcpy(LUmat->nnzValues, m->nnzValues, sizeof(double) * bs * bs * m->nnzbs);
-
-        if (verbosity >= 3){
-            std::ostringstream out;
-            out << "BILU0 memcpy: " << t_copy.stop() << " s";
-            OpmLog::info(out.str());
-        }
-
-#if CHOW_PATEL
-        chowPatelIlu.decomposition(queue, context,
-            LUmat.get(), Lmat.get(), Umat.get(),
-            invDiagVals, diagIndex,
-            s.diagIndex, s.invDiagVals,
-            s.Lvals, s.Lcols, s.Lrows,
-            s.Uvals, s.Ucols, s.Urows);
-#else
-        Timer t_copyToGpu;
-
-        events.resize(1);
-        err = queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]);
-
-        std::call_once(pattern_uploaded, [&](){
-            // find the positions of each diagonal block
-            // must be done after reordering
-            for (int row = 0; row < Nb; ++row) {
-                int rowStart = LUmat->rowPointers[row];
-                int rowEnd = LUmat->rowPointers[row+1];
-
-                auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row);
-                assert(candidate != LUmat->colIndices + rowEnd);
-                diagIndex[row] = candidate - LUmat->colIndices;
-            }
-            events.resize(4);
-            err |= queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
-            err |= queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
-            err |= queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
-        });
-
-        cl::WaitForEvents(events);
-        events.clear();
-        if (err != CL_SUCCESS) {
-            // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
-            OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
-        }
+    if (opencl_ilu_reorder != ILUReorder::NONE) {
+        m = rmat.get();
+        Timer t_reorder;
+        reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rmat.get());
 
         if (verbosity >= 3) {
             std::ostringstream out;
-            out << "BILU0 copy to GPU: " << t_copyToGpu.stop() << " s";
-            OpmLog::info(out.str());
-        }
-
-        Timer t_decomposition;
-        std::ostringstream out;
-        cl::Event event;
-        for (int color = 0; color < numColors; ++color) {
-            const unsigned int firstRow = rowsPerColorPrefix[color];
-            const unsigned int lastRow = rowsPerColorPrefix[color+1];
-            if (verbosity >= 4) {
-                out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
-            }
-            OpenclKernels::ILU_decomp(firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, Nb, block_size);
-        }
-
-        if (verbosity >= 3) {
-            out << "BILU0 decomposition: " << t_decomposition.stop() << " s";
-            OpmLog::info(out.str());
-        }
-#endif // CHOW_PATEL
-
-        return true;
-    } // end create_preconditioner()
-
-    // kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
-    // however, if individual kernel calls are timed, waiting for events is needed
-    // behavior on other GPUs is untested
-    template <unsigned int block_size>
-    void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
-    {
-        const double relaxation = 0.9;
-        cl::Event event;
-        Timer t_apply;
-
-        for(int color = 0; color < numColors; ++color){
-#if CHOW_PATEL
-            OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size);
-#else
-            OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size);
-#endif
-        }
-
-        for(int color = numColors-1; color >= 0; --color){
-#if CHOW_PATEL
-            OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size);
-#else
-            OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size);
-#endif
-        }
-
-        // apply relaxation
-        OpenclKernels::scale(x, relaxation, N);
-
-        if (verbosity >= 4) {
-            std::ostringstream out;
-            out << "BILU0 apply: " << t_apply.stop() << " s";
+            out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
             OpmLog::info(out.str());
         }
     }
 
+    // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
+    // this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy
+    Timer t_copy;
+    memcpy(LUmat->nnzValues, m->nnzValues, sizeof(double) * bs * bs * m->nnzbs);
+
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "BILU0 memcpy: " << t_copy.stop() << " s";
+        OpmLog::info(out.str());
+    }
+
+#if CHOW_PATEL
+    chowPatelIlu.decomposition(queue, context,
+                               LUmat.get(), Lmat.get(), Umat.get(),
+                               invDiagVals, diagIndex,
+                               s.diagIndex, s.invDiagVals,
+                               s.Lvals, s.Lcols, s.Lrows,
+                               s.Uvals, s.Ucols, s.Urows);
+#else
+    Timer t_copyToGpu;
+
+    events.resize(1);
+    err = queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]);
+
+    std::call_once(pattern_uploaded, [&]() {
+        // find the positions of each diagonal block
+        // must be done after reordering
+        for (int row = 0; row < Nb; ++row) {
+            int rowStart = LUmat->rowPointers[row];
+            int rowEnd = LUmat->rowPointers[row + 1];
+
+            auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row);
+            assert(candidate != LUmat->colIndices + rowEnd);
+            diagIndex[row] = candidate - LUmat->colIndices;
+        }
+        events.resize(4);
+        err |= queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
+        err |= queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
+        err |= queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
+    });
+
+    cl::WaitForEvents(events);
+    events.clear();
+    if (err != CL_SUCCESS) {
+        // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
+        OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
+    }
+
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "BILU0 copy to GPU: " << t_copyToGpu.stop() << " s";
+        OpmLog::info(out.str());
+    }
+
+    Timer t_decomposition;
+    std::ostringstream out;
+    cl::Event event;
+    for (int color = 0; color < numColors; ++color) {
+        const unsigned int firstRow = rowsPerColorPrefix[color];
+        const unsigned int lastRow = rowsPerColorPrefix[color + 1];
+        if (verbosity >= 4) {
+            out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
+        }
+        OpenclKernels::ILU_decomp(firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, Nb, block_size);
+    }
+
+    if (verbosity >= 3) {
+        out << "BILU0 decomposition: " << t_decomposition.stop() << " s";
+        OpmLog::info(out.str());
+    }
+#endif // CHOW_PATEL
+
+    return true;
+} // end create_preconditioner()
+
+
+// kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
+// however, if individual kernel calls are timed, waiting for events is needed
+// behavior on other GPUs is untested
+template <unsigned int block_size>
+void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
+{
+    const double relaxation = 0.9;
+    cl::Event event;
+    Timer t_apply;
+
+    for (int color = 0; color < numColors; ++color) {
+#if CHOW_PATEL
+        OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size);
+#else
+        OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size);
+#endif
+    }
+
+    for (int color = numColors - 1; color >= 0; --color) {
+#if CHOW_PATEL
+        OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size);
+#else
+        OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size);
+#endif
+    }
+
+    // apply relaxation
+    OpenclKernels::scale(x, relaxation, N);
+
+    if (verbosity >= 4) {
+        std::ostringstream out;
+        out << "BILU0 apply: " << t_apply.stop() << " s";
+        OpmLog::info(out.str());
+    }
+}
+
 
 
 #define INSTANTIATE_BDA_FUNCTIONS(n) \
diff --git a/opm/simulators/linalg/bda/BILU0.hpp b/opm/simulators/linalg/bda/BILU0.hpp
index a9dcc9bb0..9561b39e5 100644
--- a/opm/simulators/linalg/bda/BILU0.hpp
+++ b/opm/simulators/linalg/bda/BILU0.hpp
@@ -36,90 +36,90 @@ namespace Opm
 namespace Accelerator
 {
 
-    /// This class implements a Blocked ILU0 preconditioner
-    /// The decomposition is done on CPU, and reorders the rows of the matrix
-    template <unsigned int block_size>
-    class BILU0 : public Preconditioner<block_size>
-    {
-        typedef Preconditioner<block_size> Base;
+/// This class implements a Blocked ILU0 preconditioner
+/// Rows are optionally reordered; the decomposition is launched via OpenclKernels::ILU_decomp (or ChowPatelIlu when CHOW_PATEL is set)
+template <unsigned int block_size>
+class BILU0 : public Preconditioner<block_size>
+{
+    typedef Preconditioner<block_size> Base;
 
-        using Base::N;
-        using Base::Nb;
-        using Base::nnz;
-        using Base::nnzb;
-        using Base::verbosity;
+    using Base::N;
+    using Base::Nb;
+    using Base::nnz;
+    using Base::nnzb;
+    using Base::verbosity;
 
-    private:
-        std::unique_ptr<BlockedMatrix> LUmat = nullptr;
-        std::shared_ptr<BlockedMatrix> rmat = nullptr; // only used with PAR_SIM
+private:
+    std::unique_ptr<BlockedMatrix> LUmat = nullptr;
+    std::shared_ptr<BlockedMatrix> rmat = nullptr; // reordered matrix, only allocated when opencl_ilu_reorder != ILUReorder::NONE
 #if CHOW_PATEL
-        std::unique_ptr<BlockedMatrix> Lmat = nullptr, Umat = nullptr;
+    std::unique_ptr<BlockedMatrix> Lmat = nullptr, Umat = nullptr;
 #endif
-        double *invDiagVals = nullptr;
-        std::vector<int> diagIndex;
-        std::vector<int> rowsPerColor;  // color i contains rowsPerColor[i] rows, which are processed in parallel
-        std::vector<int> rowsPerColorPrefix;  // the prefix sum of rowsPerColor
-        std::vector<int> toOrder, fromOrder;
-        int numColors;
-        std::once_flag pattern_uploaded;
+    double *invDiagVals = nullptr;
+    std::vector<int> diagIndex;
+    std::vector<int> rowsPerColor;  // color i contains rowsPerColor[i] rows, which are processed in parallel
+    std::vector<int> rowsPerColorPrefix;  // the prefix sum of rowsPerColor
+    std::vector<int> toOrder, fromOrder;
+    int numColors;
+    std::once_flag pattern_uploaded;
 
-        ILUReorder opencl_ilu_reorder;
+    ILUReorder opencl_ilu_reorder;
 
-        typedef struct {
-            cl::Buffer invDiagVals;
-            cl::Buffer diagIndex;
-            cl::Buffer rowsPerColor;
+    typedef struct {
+        cl::Buffer invDiagVals;
+        cl::Buffer diagIndex;
+        cl::Buffer rowsPerColor;
 #if CHOW_PATEL
-            cl::Buffer Lvals, Lcols, Lrows;
-            cl::Buffer Uvals, Ucols, Urows;
+        cl::Buffer Lvals, Lcols, Lrows;
+        cl::Buffer Uvals, Ucols, Urows;
 #else
-            cl::Buffer LUvals, LUcols, LUrows;
+        cl::Buffer LUvals, LUcols, LUrows;
 #endif
-        } GPU_storage;
+    } GPU_storage;
 
-        GPU_storage s;
-        cl::Context *context;
-        cl::CommandQueue *queue;
-        std::vector<cl::Event> events;
-        cl_int err;
+    GPU_storage s;
+    cl::Context *context;
+    cl::CommandQueue *queue;
+    std::vector<cl::Event> events;
+    cl_int err;
 
 #if CHOW_PATEL
-        ChowPatelIlu<block_size> chowPatelIlu;
+    ChowPatelIlu<block_size> chowPatelIlu;
 #endif
 
-    public:
+public:
 
-        BILU0(ILUReorder opencl_ilu_reorder, int verbosity);
+    BILU0(ILUReorder opencl_ilu_reorder, int verbosity);
 
-        ~BILU0();
+    ~BILU0();
 
-        void init(int Nb, int nnzb, std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue) override;
+    void init(int Nb, int nnzb, std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue) override;
 
-        // analysis, find reordering if specified
-        bool analyze_matrix(BlockedMatrix *mat) override;
+    // analysis, find reordering if specified
+    bool analyze_matrix(BlockedMatrix *mat) override;
 
-        // ilu_decomposition
-        bool create_preconditioner(BlockedMatrix *mat) override;
+    // ilu_decomposition
+    bool create_preconditioner(BlockedMatrix *mat) override;
 
-        // apply preconditioner, x = prec(y)
-        void apply(const cl::Buffer& y, cl::Buffer& x) override;
+    // apply preconditioner, x = prec(y)
+    void apply(const cl::Buffer& y, cl::Buffer& x) override;
 
-        int* getToOrder() override
-        {
-            return toOrder.data();
-        }
+    int* getToOrder() override
+    {
+        return toOrder.data();
+    }
 
-        int* getFromOrder() override
-        {
-            return fromOrder.data();
-        }
+    int* getFromOrder() override
+    {
+        return fromOrder.data();
+    }
 
-        BlockedMatrix* getRMat() override
-        {
-            return rmat.get();
-        }
+    BlockedMatrix* getRMat() override
+    {
+        return rmat.get();
+    }
 
-    };
+};
 
 } // namespace Accelerator
 } // namespace Opm