Merge pull request #3750 from ducbueno/isai-ilu

Adds IncompleteSAI preconditioner to openclSolver
2025-02-25 18:55:30 -06:00 · 2022-01-18 12:38:39 +01:00
parent bf77bbbc10 1df9d2e51f
commit 3db9a4e3e5
18 changed files with 2551 additions and 10 deletions
--- a/opm/simulators/linalg/bda/BILU0.cpp
+++ b/opm/simulators/linalg/bda/BILU0.cpp
@@ -244,7 +244,6 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
    return true;
 } // end create_preconditioner()

-
 // kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
 // however, if individual kernel calls are timed, waiting for events is needed
 // behavior on other GPUs is untested
--- a/opm/simulators/linalg/bda/BILU0.hpp
+++ b/opm/simulators/linalg/bda/BILU0.hpp
@@ -115,6 +115,16 @@ public:
        return rmat.get();
    }

+    std::tuple<std::vector<int>, std::vector<int>, std::vector<int>> get_preconditioner_structure()
+    {
+        return {{LUmat->rowPointers, LUmat->rowPointers + (Nb + 1)}, {LUmat->colIndices, LUmat->colIndices + nnzb}, diagIndex};
+    }
+
+    std::pair<cl::Buffer, cl::Buffer> get_preconditioner_data()
+    {
+        return std::make_pair(s.LUvals, s.invDiagVals);
+    }
+
 };

 } // namespace Accelerator
--- a/opm/simulators/linalg/bda/BISAI.cpp
+++ b/opm/simulators/linalg/bda/BISAI.cpp
@@ -0,0 +1,275 @@
+/*
+  Copyright 2022 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+#include <algorithm>
+#include <iostream>
+
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+#include <dune/common/timer.hh>
+
+#include <opm/simulators/linalg/bda/BdaSolver.hpp>
+#include <opm/simulators/linalg/bda/BILU0.hpp>
+#include <opm/simulators/linalg/bda/BISAI.hpp>
+#include <opm/simulators/linalg/bda/Reorder.hpp>
+
+namespace Opm
+{
+namespace Accelerator
+{
+
+using Opm::OpmLog;
+using Dune::Timer;
+
+template <unsigned int block_size>
+BISAI<block_size>::BISAI(ILUReorder opencl_ilu_reorder_, int verbosity_) :
+    Preconditioner<block_size>(verbosity_)
+{
+    bilu0 = std::make_unique<BILU0<block_size> >(opencl_ilu_reorder_, verbosity_);
+}
+
+template <unsigned int block_size>
+void BISAI<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_)
+{
+    context = context_;
+    queue = queue_;
+
+    bilu0->setOpencl(context, queue);
+}
+
+std::vector<int> buildCsrToCscOffsetMap(std::vector<int> colPointers, std::vector<int> rowIndices){
+    std::vector<int> aux(colPointers); // colPointers must be copied to this vector
+    std::vector<int> csrToCscOffsetMap(rowIndices.size()); // map must have the same size as the indices vector
+
+    for(unsigned int row = 0; row < colPointers.size() - 1; row++){
+        for(int jj = colPointers[row]; jj < colPointers[row+1]; jj++){
+            int col = rowIndices[jj];
+            int dest = aux[col];
+            csrToCscOffsetMap[dest] = jj;
+            aux[col]++;
+        }
+    }
+
+    return csrToCscOffsetMap;
+}
+
+template <unsigned int block_size>
+bool BISAI<block_size>::analyze_matrix(BlockedMatrix *mat)
+{
+    const unsigned int bs = block_size;
+
+    this->N = mat->Nb * bs;
+    this->Nb = mat->Nb;
+    this->nnz = mat->nnzbs * bs * bs;
+    this->nnzb = mat->nnzbs;
+
+    bilu0->analyze_matrix(mat);
+
+    return true;
+}
+
+template <unsigned int block_size>
+void BISAI<block_size>::buildLowerSubsystemsStructures(){
+    lower.subsystemPointers.assign(Nb + 1, 0);
+
+    Dune::Timer t_buildLowerSubsystemsStructures;
+
+    for(int tcol = 0; tcol < Nb; tcol++){
+        int frow = diagIndex[tcol] + 1;
+        int lrow = colPointers[tcol + 1];
+        int nx = lrow - frow;
+        int nv = 0;
+
+        for(int sweep = 0; sweep < nx - 1; sweep++){
+            for(int xid = sweep + 1; xid < nx; xid++){
+                for(int ptr = diagIndex[rowIndices[frow + sweep]] + 1; ptr < colPointers[rowIndices[frow + sweep + 1]]; ptr++){
+                    if(rowIndices[ptr] == rowIndices[frow + xid]){
+                        lower.nzIndices.push_back(csrToCscOffsetMap[ptr]);
+                        lower.knownRhsIndices.push_back(csrToCscOffsetMap[frow + sweep]);
+                        lower.unknownRhsIndices.push_back(csrToCscOffsetMap[frow + xid]);
+                        nv++;
+                    }
+                }
+            }
+        }
+
+        lower.subsystemPointers[tcol + 1] = lower.subsystemPointers[tcol] + nv;
+    }
+
+    if(verbosity >= 4){
+        std::ostringstream out;
+        out << "BISAI buildLowerSubsystemsStructures time: " << t_buildLowerSubsystemsStructures.stop() << " s";
+        OpmLog::info(out.str());
+    }
+}
+
+template <unsigned int block_size>
+void BISAI<block_size>::buildUpperSubsystemsStructures(){
+    upper.subsystemPointers.assign(Nb + 1, 0);
+
+    Dune::Timer t_buildUpperSubsystemsStructures;
+
+    for(int tcol = 0; tcol < Nb; tcol++){
+        int frow = colPointers[tcol];
+        int lrow = diagIndex[tcol];
+        int nx = lrow - frow + 1;
+        int nv = 0;
+
+        for(int sweep = 0; sweep < nx - 1; sweep++){
+            for(int xid = 0; xid < nx; xid++){
+                for(int ptr = colPointers[rowIndices[lrow - sweep]]; ptr < diagIndex[rowIndices[lrow - sweep]]; ptr++){
+                    if(rowIndices[ptr] == rowIndices[lrow - xid]){
+                        upper.nzIndices.push_back(csrToCscOffsetMap[ptr]);
+                        upper.knownRhsIndices.push_back(csrToCscOffsetMap[lrow - sweep]);
+                        upper.unknownRhsIndices.push_back(csrToCscOffsetMap[lrow - xid]);
+                        nv++;
+                    }
+                }
+            }
+        }
+
+        upper.subsystemPointers[tcol + 1] = upper.subsystemPointers[tcol] + nv;
+    }
+
+    if(verbosity >= 4){
+        std::ostringstream out;
+        out << "BISAI buildUpperSubsystemsStructures time: " << t_buildUpperSubsystemsStructures.stop() << " s";
+        OpmLog::info(out.str());
+    }
+}
+
+template <unsigned int block_size>
+bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat)
+{
+    const unsigned int bs = block_size;
+
+    if (bs != 3) {
+        OPM_THROW(std::logic_error, "Creation of ISAI preconditioner on GPU only supports block_size = 3");
+    }
+
+    Dune::Timer t_preconditioner;
+
+    bilu0->create_preconditioner(mat);
+
+    std::call_once(initialize, [&]() {
+        std::tie(colPointers, rowIndices, diagIndex) = bilu0->get_preconditioner_structure();
+
+        csrToCscOffsetMap = buildCsrToCscOffsetMap(colPointers, rowIndices);
+        buildLowerSubsystemsStructures();
+        buildUpperSubsystemsStructures();
+
+        d_colPointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * colPointers.size());
+        d_rowIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * rowIndices.size());
+        d_csrToCscOffsetMap = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * csrToCscOffsetMap.size());
+        d_diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * diagIndex.size());
+        d_invLvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * nnzb * bs * bs);
+        d_invUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * nnzb * bs * bs);
+        d_invL_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * Nb * bs);
+        d_lower.subsystemPointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * lower.subsystemPointers.size());
+        d_upper.subsystemPointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * upper.subsystemPointers.size());
+
+        if(!lower.nzIndices.empty()){ // knownRhsIndices and unknownRhsIndices will also be empty if nzIndices is empty
+            d_lower.nzIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * lower.nzIndices.size());
+            d_lower.knownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * lower.knownRhsIndices.size());
+            d_lower.unknownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * lower.unknownRhsIndices.size());
+        }
+
+        if(!upper.nzIndices.empty()){ // knownRhsIndices and unknownRhsIndices will also be empty if nzIndices is empty
+            d_upper.nzIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * upper.nzIndices.size());
+            d_upper.knownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * upper.knownRhsIndices.size());
+            d_upper.unknownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * upper.unknownRhsIndices.size());
+        }
+
+        events.resize(6);
+        err = queue->enqueueWriteBuffer(d_colPointers, CL_FALSE, 0, colPointers.size() * sizeof(int), colPointers.data(), nullptr, &events[0]);
+        err |= queue->enqueueWriteBuffer(d_rowIndices, CL_FALSE, 0, rowIndices.size() * sizeof(int), rowIndices.data(), nullptr, &events[1]);
+        err |= queue->enqueueWriteBuffer(d_csrToCscOffsetMap, CL_FALSE, 0, csrToCscOffsetMap.size() * sizeof(int), csrToCscOffsetMap.data(), nullptr, &events[2]);
+        err |= queue->enqueueWriteBuffer(d_diagIndex, CL_FALSE, 0, diagIndex.size() * sizeof(int), diagIndex.data(), nullptr, &events[3]);
+        err |= queue->enqueueWriteBuffer(d_lower.subsystemPointers, CL_FALSE, 0, sizeof(int) * lower.subsystemPointers.size(), lower.subsystemPointers.data(), nullptr, &events[4]);
+        err |= queue->enqueueWriteBuffer(d_upper.subsystemPointers, CL_FALSE, 0, sizeof(int) * upper.subsystemPointers.size(), upper.subsystemPointers.data(), nullptr, &events[5]);
+
+        if(!lower.nzIndices.empty()){
+            events.resize(events.size() + 3);
+            err |= queue->enqueueWriteBuffer(d_lower.nzIndices, CL_FALSE, 0, sizeof(int) * lower.nzIndices.size(), lower.nzIndices.data(), nullptr, &events[events.size() - 3]);
+            err |= queue->enqueueWriteBuffer(d_lower.knownRhsIndices, CL_FALSE, 0, sizeof(int) * lower.knownRhsIndices.size(), lower.knownRhsIndices.data(), nullptr, &events[events.size() - 2]);
+            err |= queue->enqueueWriteBuffer(d_lower.unknownRhsIndices, CL_FALSE, 0, sizeof(int) * lower.unknownRhsIndices.size(), lower.unknownRhsIndices.data(), nullptr, &events[events.size() - 1]);
+        }
+
+        if(!upper.nzIndices.empty()){
+            events.resize(events.size() + 3);
+            err |= queue->enqueueWriteBuffer(d_upper.nzIndices, CL_FALSE, 0, sizeof(int) * upper.nzIndices.size(), upper.nzIndices.data(), nullptr, &events[events.size() - 3]);
+            err |= queue->enqueueWriteBuffer(d_upper.knownRhsIndices, CL_FALSE, 0, sizeof(int) * upper.knownRhsIndices.size(), upper.knownRhsIndices.data(), nullptr, &events[events.size() - 2]);
+            err |= queue->enqueueWriteBuffer(d_upper.unknownRhsIndices, CL_FALSE, 0, sizeof(int) * upper.unknownRhsIndices.size(), upper.unknownRhsIndices.data(), nullptr, &events[events.size() - 1]);
+        }
+
+        cl::WaitForEvents(events);
+        events.clear();
+
+        if (err != CL_SUCCESS) {
+            // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
+            OPM_THROW(std::logic_error, "BISAI OpenCL enqueueWriteBuffer error");
+        }
+    });
+
+    std::tie(d_LUvals, d_invDiagVals) = bilu0->get_preconditioner_data();
+
+    events.resize(2);
+    err = queue->enqueueFillBuffer(d_invLvals, 0, 0, sizeof(double) * nnzb * bs * bs, nullptr, &events[0]);
+    err |= queue->enqueueFillBuffer(d_invUvals, 0, 0, sizeof(double) * nnzb * bs * bs, nullptr, &events[1]);
+    cl::WaitForEvents(events);
+    events.clear();
+
+    OpenclKernels::isaiL(d_diagIndex, d_colPointers, d_csrToCscOffsetMap, d_lower.subsystemPointers, d_lower.nzIndices, d_lower.unknownRhsIndices, d_lower.knownRhsIndices, d_LUvals, d_invLvals, Nb);
+    OpenclKernels::isaiU(d_diagIndex, d_colPointers, d_rowIndices, d_csrToCscOffsetMap, d_upper.subsystemPointers, d_upper.nzIndices, d_upper.unknownRhsIndices, d_upper.knownRhsIndices, d_LUvals,
+            d_invDiagVals, d_invUvals, Nb);
+
+    if(verbosity >= 4){
+        std::ostringstream out;
+        out << "BISAI createPreconditioner time: " << t_preconditioner.stop() << " s";
+        OpmLog::info(out.str());
+    }
+
+    return true;
+}
+
+template <unsigned int block_size>
+void BISAI<block_size>::apply(const cl::Buffer& x, cl::Buffer& y){
+    const unsigned int bs = block_size;
+
+    OpenclKernels::spmv(d_invLvals, d_rowIndices, d_colPointers, x, d_invL_x, Nb, bs, true, true); // application of isaiL is a simple spmv with addition
+                                                                                                   // (to compensate for the unitary diagonal that is not
+                                                                                                   // included in isaiL, for simplicity)
+    OpenclKernels::spmv(d_invUvals, d_rowIndices, d_colPointers, d_invL_x, y, Nb, bs); // application of isaiU is a simple spmv
+}
+
+#define INSTANTIATE_BDA_FUNCTIONS(n)  \
+template class BISAI<n>;
+
+INSTANTIATE_BDA_FUNCTIONS(1);
+INSTANTIATE_BDA_FUNCTIONS(2);
+INSTANTIATE_BDA_FUNCTIONS(3);
+INSTANTIATE_BDA_FUNCTIONS(4);
+INSTANTIATE_BDA_FUNCTIONS(5);
+INSTANTIATE_BDA_FUNCTIONS(6);
+
+#undef INSTANTIATE_BDA_FUNCTIONS
+
+}
+}
--- a/opm/simulators/linalg/bda/BISAI.hpp
+++ b/opm/simulators/linalg/bda/BISAI.hpp
@@ -0,0 +1,156 @@
+/*
+  Copyright 2022 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef BISAI_HPP
+#define BISAI_HPP
+
+#include <mutex>
+
+#include <opm/simulators/linalg/bda/BILU0.hpp>
+#include <opm/simulators/linalg/bda/opencl.hpp>
+#include <opm/simulators/linalg/bda/openclKernels.hpp>
+#include <opm/simulators/linalg/bda/openclSolverBackend.hpp>
+#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
+
+namespace Opm
+{
+namespace Accelerator
+{
+
+template <unsigned int block_size>
+class openclSolverBackend;
+
+class BlockedMatrix;
+
+/// This class implements a Blocked version of the Incomplete Sparse Approximate Inverse (ISAI) preconditioner.
+/// Inspired by the paper "Incomplete Sparse Approximate Inverses for Parallel Preconditioning" by Anzt et. al.
+template <unsigned int block_size>
+class BISAI : public Preconditioner<block_size>
+{
+    typedef Preconditioner<block_size> Base;
+
+    using Base::N;
+    using Base::Nb;
+    using Base::nnz;
+    using Base::nnzb;
+    using Base::verbosity;
+    using Base::context;
+    using Base::queue;
+    using Base::events;
+    using Base::err;
+
+private:
+    std::once_flag initialize;
+
+    std::vector<int> colPointers;
+    std::vector<int> rowIndices;
+    std::vector<int> diagIndex;
+    std::vector<int> csrToCscOffsetMap;
+    std::vector<double> invLvals;
+    std::vector<double> invUvals;
+
+    cl::Buffer d_colPointers;
+    cl::Buffer d_rowIndices;
+    cl::Buffer d_csrToCscOffsetMap;
+    cl::Buffer d_diagIndex;
+    cl::Buffer d_LUvals;
+    cl::Buffer d_invDiagVals;
+    cl::Buffer d_invLvals;
+    cl::Buffer d_invUvals;
+    cl::Buffer d_invL_x;
+
+    ILUReorder opencl_ilu_reorder;
+    std::unique_ptr<BILU0<block_size> > bilu0;
+
+    /// Struct that holds the structure of the small subsystems for each column
+    typedef struct{
+        /// This vector holds the cumulative sum for the number of non-zero blocks for each subsystem.
+        /// Works similarly to row and column pointers for the CSR and CSC matrix representations.
+        std::vector<int> subsystemPointers;
+        /// This vector holds the indices of the non-zero blocks for the target subsystem. These blocks are
+        /// the ones that are present in the shadow set of the non-zero blocks of column j of the main matrix,
+        /// as described in section 2.3 of the paper. The amount of non-zero blocks for j-th subsystem is
+        /// given by subsystemPointers[j+1] - subsystemPointers[j].
+        std::vector<int> nzIndices;
+        /// This vector holds the indices of the already known values of the right hand sides of the subsystems.
+        /// Its purpose is to aid in the parallel solution of the subsystems.
+        std::vector<int> knownRhsIndices;
+        /// This vector holds the indices of the unknown values of the right hand sides of the subsystems.
+        std::vector<int> unknownRhsIndices;
+    } subsystemStructure;
+
+    /// GPU version of subsystemStructure
+    typedef struct{
+        cl::Buffer subsystemPointers;
+        cl::Buffer nzIndices;
+        cl::Buffer knownRhsIndices;
+        cl::Buffer unknownRhsIndices;
+    } subsystemStructureGPU;
+
+    subsystemStructure lower, upper;
+    subsystemStructureGPU d_lower, d_upper;
+
+    /// An approximate inverse for L is computed by solving a small lower triangular system for each column of the main matrix.
+    /// This function finds the structure of each of these subsystems and fills the 'lower' struct.
+    void buildLowerSubsystemsStructures();
+
+    /// An approximate inverse for U is computed by solving a small upper triangular system for each column of the main matrix.
+    /// This function finds the structure of each of theses subsystems and fills the 'upper' struct.
+    void buildUpperSubsystemsStructures();
+
+public:
+    BISAI(ILUReorder opencl_ilu_reorder, int verbosity);
+
+    // set own Opencl variables, but also that of the bilu0 preconditioner
+    void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue) override;
+
+    // analysis, find reordering if specified
+    bool analyze_matrix(BlockedMatrix *mat) override;
+
+    // ilu_decomposition
+    bool create_preconditioner(BlockedMatrix *mat) override;
+
+    // apply preconditioner, x = prec(y)
+    void apply(const cl::Buffer& y, cl::Buffer& x) override;
+
+    int* getToOrder() override
+    {
+        return bilu0->getToOrder();
+    }
+
+    int* getFromOrder() override
+    {
+        return bilu0->getFromOrder();
+    }
+
+    BlockedMatrix* getRMat() override
+    {
+        return bilu0->getRMat();
+    }
+};
+
+/// Similar function to csrPatternToCsc. It gives an offset map from CSR to CSC instead of the full CSR to CSC conversion.
+/// The map works as follows: if an element 'e' of the matrix is in the i-th position in the CSR representation, it will be
+/// in the csrToCscOffsetMap[i]-th position in the CSC representation.
+std::vector<int> buildCsrToCscOffsetMap(std::vector<int> colPointers, std::vector<int> rowIndices);
+
+} // namespace Accelerator
+} // namespace Opm
+
+#endif
--- a/opm/simulators/linalg/bda/opencl/Preconditioner.cpp
+++ b/opm/simulators/linalg/bda/opencl/Preconditioner.cpp
@@ -23,6 +23,7 @@
 #include <opm/common/ErrorMacros.hpp>

 #include <opm/simulators/linalg/bda/BILU0.hpp>
+#include <opm/simulators/linalg/bda/BISAI.hpp>
 #include <opm/simulators/linalg/bda/CPR.hpp>
 #include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>

@@ -44,6 +45,8 @@ std::unique_ptr<Preconditioner<block_size> > Preconditioner<block_size>::create(
        return std::make_unique<Opm::Accelerator::BILU0<block_size> >(opencl_ilu_reorder, verbosity);
    } else if (type == PreconditionerType::CPR) {
        return std::make_unique<Opm::Accelerator::CPR<block_size> >(verbosity, opencl_ilu_reorder);
+    } else if (type == PreconditionerType::BISAI) {
+        return std::make_unique<Opm::Accelerator::BISAI<block_size> >(opencl_ilu_reorder, verbosity);
    } else {
        OPM_THROW(std::logic_error, "Invalid PreconditionerType");
    }
--- a/opm/simulators/linalg/bda/opencl/Preconditioner.hpp
+++ b/opm/simulators/linalg/bda/opencl/Preconditioner.hpp
@@ -54,7 +54,8 @@ protected:
 public:
    enum PreconditionerType {
        BILU0,
-        CPR
+        CPR,
+        BISAI
    };

    static std::unique_ptr<Preconditioner> create(PreconditionerType type, int verbosity, ILUReorder opencl_ilu_reorder);
--- a/opm/simulators/linalg/bda/opencl/kernels/isaiL.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/isaiL.cl
@@ -0,0 +1,80 @@
+__kernel void block_sub(__global double *mat1, __global double *mat2, __global double *result)
+{
+    const unsigned int bs = 3;
+    const unsigned int warpsize = 32;
+    const unsigned int num_active_threads = (warpsize / bs / bs) * bs * bs;
+    const unsigned int idx_t = get_local_id(0);
+    const unsigned int lane = idx_t % warpsize;
+
+    if(lane < num_active_threads){
+        const unsigned int row = lane % bs;
+        const unsigned int col = (lane / bs) % bs;
+        result[bs * row + col] = mat1[bs * row + col] - mat2[bs * row + col];
+    }
+}
+
+__kernel void block_mult_sub_isai(__global double *a, __global double *b, __global double *c)
+{
+    const unsigned int bs = 3;
+    const unsigned int warpsize = 32;
+    const unsigned int num_active_threads = (warpsize / bs / bs) * bs * bs;
+    const unsigned int idx_t = get_local_id(0);
+    const unsigned int lane = idx_t % warpsize;
+
+    if(lane < num_active_threads){
+        const unsigned int row = lane % bs;
+        const unsigned int col = (lane / bs) % bs;
+        double temp = 0.0;
+
+        for (unsigned int k = 0; k < bs; k++) {
+            temp += b[bs * row + k] * c[bs * k + col];
+        }
+
+        a[bs * row + col] -= temp;
+    }
+}
+
+__kernel void isaiL(__global const int *diagIndex,
+                    __global const int *colPtr,
+                    __global const int *mapping,
+                    __global const int *nvc,
+                    __global const int *luIdxs,
+                    __global const int *xxIdxs,
+                    __global const int *dxIdxs,
+                    __global const double *LU,
+                    __global double *invL,
+                    const unsigned int Nb)
+{
+    const unsigned int warpsize = 32;
+    const unsigned int idx_b = get_group_id(0);
+    const unsigned int idx_t = get_local_id(0);
+    const unsigned int idx = get_global_id(0);
+    const unsigned int bs = 3;
+    const unsigned int num_threads = get_global_size(0);
+    const unsigned int num_warps_in_grid = num_threads / warpsize;
+    const unsigned int num_active_threads = (warpsize / bs / bs) * bs * bs;
+    const unsigned int num_blocks_per_warp = warpsize / bs / bs;
+    const unsigned int lane = idx_t % warpsize;
+    const unsigned int c = (lane / bs) % bs;
+    const unsigned int r = lane % bs;
+    unsigned int tcol = idx / warpsize;
+
+    while(tcol < Nb){
+        const unsigned int frow = diagIndex[tcol] + 1;
+        const unsigned int lrow = colPtr[tcol + 1];
+        const unsigned int nx = lrow - frow;
+
+        if(lane < num_active_threads){
+            for(unsigned int xid = 0; xid < nx; xid++){
+                unsigned int xpos = mapping[frow + xid];
+                block_sub(invL + xpos * bs * bs, LU + xpos * bs * bs, invL + xpos * bs * bs);
+            }
+
+            for(unsigned int v = nvc[tcol]; v < nvc[tcol + 1]; v++){
+                block_mult_sub_isai(invL + xxIdxs[v] * bs * bs, LU + luIdxs[v] * bs * bs, invL + dxIdxs[v] * bs * bs);
+            }
+        }
+
+        tcol += num_warps_in_grid;
+    }
+}
--- a/opm/simulators/linalg/bda/opencl/kernels/isaiU.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/isaiU.cl
@@ -0,0 +1,84 @@
+__kernel void block_add(__global double *mat1, __global double *mat2, __global double *result)
+{
+    const unsigned int bs = 3;
+    const unsigned int warpsize = 32;
+    const unsigned int num_active_threads = (warpsize / bs / bs) * bs * bs;
+    const unsigned int idx_t = get_local_id(0);
+    const unsigned int lane = idx_t % warpsize;
+
+    if(lane < num_active_threads){
+        const unsigned int row = lane % bs;
+        const unsigned int col = (lane / bs) % bs;
+        result[bs * row + col] = mat1[bs * row + col] + mat2[bs * row + col];
+    }
+}
+
+__kernel void block_mult_isai(__global double *mat1, __global double *mat2, __global double *result)
+{
+    const unsigned int bs = 3;
+    const unsigned int warpsize = 32;
+    const unsigned int num_active_threads = (warpsize / bs / bs) * bs * bs;
+    const unsigned int idx_t = get_local_id(0);
+    const unsigned int lane = idx_t % warpsize;
+
+    if(lane < num_active_threads){
+        const unsigned int row = lane % bs;
+        const unsigned int col = (lane / bs) % bs;
+        double temp = 0.0;
+
+        for (unsigned int k = 0; k < bs; k++) {
+            temp += mat1[bs * row + k] * mat2[bs * k + col];
+        }
+
+        result[bs * row + col] = temp;
+    }
+}
+
+__kernel void isaiU(__global const int *diagIndex,
+                    __global const int *colPtr,
+                    __global const int *rowIndices,
+                    __global const int *mapping,
+                    __global const int *nvc,
+                    __global const int *luIdxs,
+                    __global const int *xxIdxs,
+                    __global const int *dxIdxs,
+                    __global const double *LU,
+                    __global const double *invDiagVals,
+                    __global double *invU,
+                    const unsigned int Nb)
+{
+    const unsigned int warpsize = 32;
+    const unsigned int idx_b = get_group_id(0);
+    const unsigned int idx_t = get_local_id(0);
+    const unsigned int idx = get_global_id(0);
+    const unsigned int bs = 3;
+    const unsigned int num_threads = get_global_size(0);
+    const unsigned int num_warps_in_grid = num_threads / warpsize;
+    const unsigned int num_active_threads = (warpsize / bs / bs) * bs * bs;
+    const unsigned int num_blocks_per_warp = warpsize / bs / bs;
+    const unsigned int lane = idx_t % warpsize;
+    const unsigned int c = (lane / bs) % bs;
+    const unsigned int r = lane % bs;
+    unsigned int tcol = idx / warpsize;
+
+    while(tcol < Nb){
+        const unsigned int frow = colPtr[tcol];
+        const unsigned int lrow = diagIndex[tcol];
+        const unsigned int nx = lrow - frow + 1;
+
+        if(lane < num_active_threads){
+            block_add(invU + lrow * bs * bs, invDiagVals + tcol * bs * bs, invU + lrow * bs * bs);
+
+            for(unsigned int v = nvc[tcol]; v < nvc[tcol + 1]; v++){
+                block_mult_sub_isai(invU + xxIdxs[v] * bs * bs, LU + luIdxs[v] * bs * bs, invU + dxIdxs[v] * bs * bs);
+            }
+
+            for(unsigned int xid = 1; xid < nx; xid++){
+                unsigned int xpos = mapping[lrow - xid];
+                block_mult_isai(invDiagVals + rowIndices[lrow - xid] * bs * bs, invU + xpos * bs * bs, invU + xpos * bs * bs);
+            }
+        }
+
+        tcol += num_warps_in_grid;
+    }
+}
--- a/opm/simulators/linalg/bda/opencl/kernels/spmv_blocked_add.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/spmv_blocked_add.cl
@@ -0,0 +1,68 @@
+/// b = mat * x
+/// algorithm based on:
+/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
+/// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
+__kernel void spmv_blocked_add(
+    __global const double *vals,
+    __global const int *cols,
+    __global const int *rows,
+    const int Nb,
+    __global const double *x,
+    __global double *out,
+    const unsigned int block_size,
+    __local double *tmp)
+{
+    const unsigned int warpsize = 32;
+    const unsigned int bsize = get_local_size(0);
+    const unsigned int idx_b = get_global_id(0) / bsize;
+    const unsigned int idx_t = get_local_id(0);
+    unsigned int idx = idx_b * bsize + idx_t;
+    const unsigned int bs = block_size;
+    const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
+    const unsigned int num_blocks_per_warp = warpsize/bs/bs;
+    const unsigned int NUM_THREADS = get_global_size(0);
+    const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
+    unsigned int target_block_row = idx / warpsize;
+    const unsigned int lane = idx_t % warpsize;
+    const unsigned int c = (lane / bs) % bs;
+    const unsigned int r = lane % bs;
+
+    // for 3x3 blocks:
+    // num_active_threads: 27
+    // num_blocks_per_warp: 3
+
+    while(target_block_row < Nb){
+        unsigned int first_block = rows[target_block_row];
+        unsigned int last_block = rows[target_block_row+1];
+        unsigned int block = first_block + lane / (bs*bs);
+        double local_out = 0.0;
+
+        if(lane < num_active_threads){
+            for(; block < last_block; block += num_blocks_per_warp){
+                double x_elem = x[cols[block]*bs + c];
+                double A_elem = vals[block*bs*bs + c + r*bs];
+                local_out += x_elem * A_elem;
+            }
+        }
+
+        // do reduction in shared mem
+        tmp[lane] = local_out;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        for(unsigned int offset = 3; offset <= 24; offset <<= 1)
+        {
+            if (lane + offset < warpsize)
+            {
+                tmp[lane] += tmp[lane + offset];
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+
+        if(lane < bs){
+            unsigned int row = target_block_row*bs + lane;
+            out[row] = tmp[lane];
+            out[row] += x[row];
+        }
+        target_block_row += num_warps_in_grid;
+    }
+}
--- a/opm/simulators/linalg/bda/openclKernels.cpp
+++ b/opm/simulators/linalg/bda/openclKernels.cpp
@@ -52,6 +52,7 @@ std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, c
 std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > OpenclKernels::add_coarse_pressure_correction_k;
 std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > OpenclKernels::prolongate_vector_k;
 std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels::spmv_blocked_k;
+std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels::spmv_blocked_add_k;
 std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_k;
 std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_noreset_k;
 std::unique_ptr<residual_blocked_kernel_type> OpenclKernels::residual_blocked_k;
@@ -61,7 +62,8 @@ std::unique_ptr<ilu_apply2_kernel_type> OpenclKernels::ILU_apply2_k;
 std::unique_ptr<stdwell_apply_kernel_type> OpenclKernels::stdwell_apply_k;
 std::unique_ptr<stdwell_apply_no_reorder_kernel_type> OpenclKernels::stdwell_apply_no_reorder_k;
 std::unique_ptr<ilu_decomp_kernel_type> OpenclKernels::ilu_decomp_k;
-
+std::unique_ptr<isaiL_kernel_type> OpenclKernels::isaiL_k;
+std::unique_ptr<isaiU_kernel_type> OpenclKernels::isaiU_k;

 // divide A by B, and round up: return (int)ceil(A/B)
 unsigned int ceilDivision(const unsigned int A, const unsigned int B)
@@ -90,6 +92,7 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
    sources.emplace_back(add_coarse_pressure_correction_str);
    sources.emplace_back(prolongate_vector_str);
    sources.emplace_back(spmv_blocked_str);
+    sources.emplace_back(spmv_blocked_add_str);
    sources.emplace_back(spmv_str);
    sources.emplace_back(spmv_noreset_str);
    sources.emplace_back(residual_blocked_str);
@@ -104,6 +107,8 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
    sources.emplace_back(stdwell_apply_str);
    sources.emplace_back(stdwell_apply_no_reorder_str);
    sources.emplace_back(ILU_decomp_str);
+    sources.emplace_back(isaiL_str);
+    sources.emplace_back(isaiU_str);

    cl::Program program = cl::Program(*context, sources);
    program.build(devices);
@@ -122,6 +127,7 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
    add_coarse_pressure_correction_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int>(cl::Kernel(program, "add_coarse_pressure_correction")));
    prolongate_vector_k.reset(new cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int>(cl::Kernel(program, "prolongate_vector")));
    spmv_blocked_k.reset(new spmv_blocked_kernel_type(cl::Kernel(program, "spmv_blocked")));
+    spmv_blocked_add_k.reset(new spmv_blocked_kernel_type(cl::Kernel(program, "spmv_blocked_add")));
    spmv_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv")));
    spmv_noreset_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_noreset")));
    residual_blocked_k.reset(new residual_blocked_kernel_type(cl::Kernel(program, "residual_blocked")));
@@ -131,6 +137,8 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
    stdwell_apply_k.reset(new stdwell_apply_kernel_type(cl::Kernel(program, "stdwell_apply")));
    stdwell_apply_no_reorder_k.reset(new stdwell_apply_no_reorder_kernel_type(cl::Kernel(program, "stdwell_apply_no_reorder")));
    ilu_decomp_k.reset(new ilu_decomp_kernel_type(cl::Kernel(program, "ilu_decomp")));
+    isaiL_k.reset(new isaiL_kernel_type(cl::Kernel(program, "isaiL")));
+    isaiU_k.reset(new isaiU_kernel_type(cl::Kernel(program, "isaiU")));

    initialized = true;
 } // end get_opencl_kernels()
@@ -311,7 +319,7 @@ void OpenclKernels::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, con
    }
 }

-void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset)
+void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, const cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset, bool add)
 {
    const unsigned int work_group_size = 32;
    const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
@@ -321,7 +329,11 @@ void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, c
    cl::Event event;

    if (block_size > 1) {
-        event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
+        if (add) {
+            event = (*spmv_blocked_add_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
+        } else {
+            event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
+        }
    } else {
        if (reset) {
            event = (*spmv_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
@@ -460,5 +472,44 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
    }
 }

+void OpenclKernels::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& mapping, cl::Buffer& nvc,
+    cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals, cl::Buffer& invLvals, unsigned int Nb)
+{
+    const unsigned int work_group_size = 256;
+    const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+
+    Timer t_isaiL;
+    cl::Event event = (*isaiL_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
+            diagIndex, colPointers, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invLvals, Nb);
+
+    if (verbosity >= 4) {
+        event.wait();
+        std::ostringstream oss;
+        oss << std::scientific << "OpenclKernels isaiL() time: " << t_isaiL.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
+void OpenclKernels::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& rowIndices, cl::Buffer& mapping,
+        cl::Buffer& nvc, cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals,
+        cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb)
+{
+    const unsigned int work_group_size = 256;
+    const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+
+    Timer t_isaiU;
+    cl::Event event = (*isaiU_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
+            diagIndex, colPointers, rowIndices, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invDiagVals, invUvals, Nb);
+
+    if (verbosity >= 4) {
+        event.wait();
+        std::ostringstream oss;
+        oss << std::scientific << "OpenclKernels isaiU() time: " << t_isaiU.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
 } // namespace Accelerator
 } // namespace Opm
--- a/opm/simulators/linalg/bda/openclKernels.hpp
+++ b/opm/simulators/linalg/bda/openclKernels.hpp
@@ -31,9 +31,9 @@ namespace Accelerator
 {

 using spmv_blocked_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
-                                         cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
+                                         const cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
 using spmv_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
-                                         cl::Buffer&, cl::Buffer&, cl::LocalSpaceArg>;
+                                         const cl::Buffer&, cl::Buffer&, cl::LocalSpaceArg>;
 using residual_blocked_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
                                         cl::Buffer&, const cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
 using residual_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
@@ -52,6 +52,10 @@ using stdwell_apply_no_reorder_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::
                                                             cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>;
 using ilu_decomp_kernel_type = cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&,
                                               cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg>;
+using isaiL_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
+                                  cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>;
+using isaiU_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
+                                  cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>;

 class OpenclKernels
 {
@@ -71,6 +75,7 @@ private:
    static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > add_coarse_pressure_correction_k;
    static std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > prolongate_vector_k;
    static std::unique_ptr<spmv_blocked_kernel_type> spmv_blocked_k;
+    static std::unique_ptr<spmv_blocked_kernel_type> spmv_blocked_add_k;
    static std::unique_ptr<spmv_kernel_type> spmv_k;
    static std::unique_ptr<spmv_kernel_type> spmv_noreset_k;
    static std::unique_ptr<residual_blocked_kernel_type> residual_blocked_k;
@@ -80,6 +85,8 @@ private:
    static std::unique_ptr<stdwell_apply_kernel_type> stdwell_apply_k;
    static std::unique_ptr<stdwell_apply_no_reorder_kernel_type> stdwell_apply_no_reorder_k;
    static std::unique_ptr<ilu_decomp_kernel_type> ilu_decomp_k;
+    static std::unique_ptr<isaiL_kernel_type> isaiL_k;
+    static std::unique_ptr<isaiU_kernel_type> isaiU_k;

    OpenclKernels(){}; // disable instantiation

@@ -94,6 +101,7 @@ public:
    static const std::string add_coarse_pressure_correction_str;
    static const std::string prolongate_vector_str;
    static const std::string spmv_blocked_str;
+    static const std::string spmv_blocked_add_str;
    static const std::string spmv_str;
    static const std::string spmv_noreset_str;
    static const std::string residual_blocked_str;
@@ -108,6 +116,8 @@ public:
    static const std::string stdwell_apply_str;
    static const std::string stdwell_apply_no_reorder_str;
    static const std::string ILU_decomp_str;
+    static const std::string isaiL_str;
+    static const std::string isaiU_str;

    static void init(cl::Context *context, cl::CommandQueue *queue, std::vector<cl::Device>& devices, int verbosity);

@@ -120,7 +130,7 @@ public:
    static void full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb);
    static void add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb);
    static void prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N);
-    static void spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset = true);
+    static void spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, const cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset = true, bool add = false);
    static void residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, const cl::Buffer& rhs, cl::Buffer& out, int Nb, unsigned int block_size);

    static void ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex,
@@ -139,6 +149,13 @@ public:
    static void apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
        cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
        int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells);
+
+    static void isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& mapping, cl::Buffer& nvc,
+            cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals, cl::Buffer& invLvals, unsigned int Nb);
+
+    static void isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& rowIndices, cl::Buffer& mapping,
+            cl::Buffer& nvc, cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals,
+            cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb);
 };

 } // namespace Accelerator
--- a/opm/simulators/linalg/bda/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp
@@ -47,11 +47,17 @@ using Dune::Timer;
 template <unsigned int block_size>
 openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder_, std::string linsolver) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_reorder(opencl_ilu_reorder_) {

-    bool use_cpr;
+    bool use_cpr, use_isai;
+
    if (linsolver.compare("ilu0") == 0) {
        use_cpr = false;
+        use_isai = false;
    } else if (linsolver.compare("cpr_quasiimpes") == 0) {
        use_cpr = true;
+        use_isai = false;
+    } else if (linsolver.compare("isai") == 0) {
+        use_cpr = false;
+        use_isai = true;
    } else if (linsolver.compare("cpr_trueimpes") == 0) {
        OPM_THROW(std::logic_error, "Error openclSolver does not support --linsolver=cpr_trueimpes");
    } else {
@@ -61,6 +67,8 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
    using PreconditionerType = typename Preconditioner<block_size>::PreconditionerType;
    if (use_cpr) {
        prec = Preconditioner<block_size>::create(PreconditionerType::CPR, verbosity, opencl_ilu_reorder);
+    } else if (use_isai) {
+        prec = Preconditioner<block_size>::create(PreconditionerType::BISAI, verbosity, opencl_ilu_reorder);
    } else {
        prec = Preconditioner<block_size>::create(PreconditionerType::BILU0, verbosity, opencl_ilu_reorder);
    }
--- a/opm/simulators/linalg/setupPropertyTree.cpp
+++ b/opm/simulators/linalg/setupPropertyTree.cpp
@@ -84,10 +84,15 @@ setupPropertyTree(FlowLinearSolverParameters p, // Note: copying the parameters
        return setupILU(conf, p);
    }

+    // Same configuration as ILU0.
+    if (conf == "isai") {
+        return setupISAI(conf, p);
+    }
+
    // No valid configuration option found.
    OPM_THROW(std::invalid_argument,
              conf << " is not a valid setting for --linear-solver-configuration."
-              << " Please use ilu0, cpr, cpr_trueimpes, or cpr_quasiimpes");
+              << " Please use ilu0, cpr, cpr_trueimpes, cpr_quasiimpes or isai");
 }

 PropertyTree
@@ -187,5 +192,20 @@ setupILU([[maybe_unused]] const std::string& conf, const FlowLinearSolverParamet
 }


+PropertyTree
+setupISAI([[maybe_unused]] const std::string& conf, const FlowLinearSolverParameters& p)
+{
+    using namespace std::string_literals;
+    PropertyTree prm;
+    prm.put("tol", p.linear_solver_reduction_);
+    prm.put("maxiter", p.linear_solver_maxiter_);
+    prm.put("verbosity", p.linear_solver_verbosity_);
+    prm.put("solver", "bicgstab"s);
+    prm.put("preconditioner.type", "ParOverILU0"s);
+    prm.put("preconditioner.relaxation", p.ilu_relaxation_);
+    prm.put("preconditioner.ilulevel", p.ilu_fillin_level_);
+    return prm;
+}
+

 } // namespace Opm
--- a/opm/simulators/linalg/setupPropertyTree.hpp
+++ b/opm/simulators/linalg/setupPropertyTree.hpp
@@ -35,6 +35,7 @@ PropertyTree setupPropertyTree(FlowLinearSolverParameters p,
 PropertyTree setupCPR(const std::string& conf, const FlowLinearSolverParameters& p);
 PropertyTree setupAMG(const std::string& conf, const FlowLinearSolverParameters& p);
 PropertyTree setupILU(const std::string& conf, const FlowLinearSolverParameters& p);
+PropertyTree setupISAI(const std::string& conf, const FlowLinearSolverParameters& p);

 } // namespace Opm