Merge pull request #3815 from Tongdongq/subdomain-matrix-for-opencl

Subdomain matrix for opencl ILU preconditioner
This commit is contained in:
Markus Blatt
2022-05-20 16:05:14 +02:00
committed by GitHub
36 changed files with 679 additions and 320 deletions

View File

@@ -161,7 +161,6 @@ namespace Opm
// Set it up manually
ElementMapper elemMapper(simulator_.vanguard().gridView(), Dune::mcmgElementLayout());
detail::findOverlapAndInterior(simulator_.vanguard().grid(), elemMapper, overlapRows_, interiorRows_);
useWellConn_ = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
#if HAVE_FPGA
// check usage of MatrixAddWellContributions: for FPGA they must be included
@@ -212,6 +211,17 @@ namespace Opm
// to the original one with a deleter that does nothing.
// Outch! We need to be able to scale the linear system! Hence const_cast
matrix_ = const_cast<Matrix*>(&M.istlMatrix());
// setup sparsity pattern for jacobi matrix for preconditioner (only used for openclSolver)
numJacobiBlocks_ = EWOMS_GET_PARAM(TypeTag, int, NumJacobiBlocks);
useWellConn_ = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
if (numJacobiBlocks_ > 1) {
const auto wellsForConn = simulator_.vanguard().schedule().getWellsatEnd();
detail::setWellConnections(simulator_.vanguard().grid(), wellsForConn, useWellConn_,
wellConnectionsGraph_, numJacobiBlocks_);
std::cout << "Create block-Jacobi pattern" << std::endl;
blockJacobiAdjacency();
}
} else {
// Pointers should not change
if ( &(M.istlMatrix()) != matrix_ ) {
@@ -264,7 +274,7 @@ namespace Opm
if (use_gpu || use_fpga) {
const std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
auto wellContribs = WellContributions::create(accelerator_mode, useWellConn_);
bdaBridge->initWellContributions(*wellContribs);
bdaBridge->initWellContributions(*wellContribs, x.N() * x[0].N());
// the WellContributions can only be applied separately with CUDA or OpenCL, not with an FPGA or amgcl
#if HAVE_CUDA || HAVE_OPENCL
@@ -273,8 +283,15 @@ namespace Opm
}
#endif
if (numJacobiBlocks_ > 1) {
copyMatToBlockJac(getMatrix(), *blockJacobiForGPUILU0_);
// Const_cast needed since the CUDA stuff overwrites values for better matrix condition..
bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), *rhs_, *wellContribs, result);
bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), &*blockJacobiForGPUILU0_,
numJacobiBlocks_, *rhs_, *wellContribs, result);
}
else
bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), const_cast<Matrix*>(&getMatrix()),
numJacobiBlocks_, *rhs_, *wellContribs, result);
if (result.converged) {
// get result vector x from non-Dune backend, iff solve was successful
bdaBridge->get_result(x);
@@ -497,6 +514,68 @@ namespace Opm
}
}
/// Create sparsity pattern for block-Jacobi matrix based on partitioning of grid.
/// Do not initialize the values, that is done in copyMatToBlockJac()
void blockJacobiAdjacency()
{
const auto& grid = simulator_.vanguard().grid();
std::vector<int> cell_part = simulator_.vanguard().cellPartition();
typedef typename Matrix::size_type size_type;
typedef typename Matrix::CreateIterator Iter;
size_type numCells = grid.size( 0 );
blockJacobiForGPUILU0_.reset(new Matrix(numCells, numCells, getMatrix().nonzeroes(), Matrix::row_wise));
const auto& lid = grid.localIdSet();
const auto& gridView = grid.leafGridView();
auto elemIt = gridView.template begin<0>(); // should never overrun, since blockJacobiForGPUILU0_ is initialized with numCells rows
//Loop over cells
for (Iter row = blockJacobiForGPUILU0_->createbegin(); row != blockJacobiForGPUILU0_->createend(); ++elemIt, ++row)
{
const auto& elem = *elemIt;
size_type idx = lid.id(elem);
row.insert(idx);
// Add well non-zero connections
for (auto wc = wellConnectionsGraph_[idx].begin(); wc!=wellConnectionsGraph_[idx].end(); ++wc) {
row.insert(*wc);
}
int locPart = cell_part[idx];
//Add neighbor if it is on the same part
auto isend = gridView.iend(elem);
for (auto is = gridView.ibegin(elem); is!=isend; ++is)
{
//check if face has neighbor
if (is->neighbor())
{
size_type nid = lid.id(is->outside());
int nabPart = cell_part[nid];
if (locPart == nabPart) {
row.insert(nid);
}
}
}
}
}
/// Copy the nonzero blocks of mat into blockJac, matching rows and column
/// indices. blockJac's sparsity pattern must be a subset of mat's pattern
/// (row by row), which blockJacobiAdjacency() guarantees.
void copyMatToBlockJac(const Matrix& mat, Matrix& blockJac)
{
    auto srcRow = mat.begin();
    for (auto dstRow = blockJac.begin(); dstRow != blockJac.end(); ++dstRow, ++srcRow) {
        auto srcCol = srcRow->begin();
        for (auto dstCol = dstRow->begin(); dstCol != dstRow->end(); ++dstCol) {
            // srcRow is guaranteed to contain every column entry dstRow has,
            // so scanning forward must land exactly on the matching index.
            while (srcCol.index() < dstCol.index()) {
                ++srcCol;
            }
            assert(srcCol.index() == dstCol.index());
            *dstCol = *srcCol; // copy nonzero block
        }
    }
}
Matrix& getMatrix()
{
@@ -517,6 +596,8 @@ namespace Opm
Matrix* matrix_;
Vector *rhs_;
std::unique_ptr<Matrix> blockJacobiForGPUILU0_;
std::unique_ptr<FlexibleSolverType> flexibleSolver_;
std::unique_ptr<AbstractOperatorType> linearOperatorForFlexibleSolver_;
std::unique_ptr<WellModelAsLinearOperator<WellModel, Vector, Vector>> wellOperator_;
@@ -530,6 +611,7 @@ namespace Opm
FlowLinearSolverParameters parameters_;
PropertyTree prm_;
bool scale_variables_;
int numJacobiBlocks_;
std::shared_ptr< CommunicationType > comm_;
}; // end ISTLSolver

View File

@@ -191,23 +191,43 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::copySparsityPatternFromI
}
template <class BridgeMatrix, class BridgeVector, int block_size>
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system([[maybe_unused]] BridgeMatrix* mat,
[[maybe_unused]] BridgeVector& b,
[[maybe_unused]] WellContributions& wellContribs,
[[maybe_unused]] InverseOperatorResult& res)
{
// Check whether the nnz values of the matrix are stored in contiguous memory.
// This is done by checking that the distance between the last value of the
// last block of row 0 and the first value of the first block of row 1 equals 1.
// If the matrix has only 1 row, it is trivially contiguous.
// Throws std::logic_error if the layout is not contiguous; the BdaSolvers
// pass the raw value pointer to the accelerator, so contiguity is required.
template <class BridgeMatrix>
void checkMemoryContiguous(const BridgeMatrix& mat) {
auto block_size = mat[0][0].N();
auto row = mat.begin();
auto last_of_row0 = row->begin();
// last_of_row0 points to the last block of row 0, not to row->end();
// found by a linear scan because the column iterator is only advanced forward here
for(auto tmp = row->begin(); tmp != row->end(); ++tmp) {
last_of_row0 = tmp;
}
// compare the address one-past the last scalar of row 0 with the address of
// the first scalar of row 1; a distance of 1 between the last scalar and the
// first scalar means the two rows are adjacent in memory
bool isContiguous = mat.N() < 2 || std::distance(&((*last_of_row0)[block_size-1][block_size-1]), &(*mat[1].begin())[0][0]) == 1;
if (!isContiguous) {
OPM_THROW(std::logic_error, "Error memory of Matrix looks not contiguous");
}
}
template <class BridgeMatrix, class BridgeVector, int block_size>
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatrix* bridgeMat,
BridgeMatrix* jacMat,
int numJacobiBlocks,
BridgeVector& b,
WellContributions& wellContribs,
InverseOperatorResult& res)
{
if (use_gpu || use_fpga) {
BdaResult result;
result.converged = false;
static std::vector<int> h_rows;
static std::vector<int> h_cols;
const int dim = (*mat)[0][0].N();
const int Nb = mat->N();
const int N = Nb * dim;
const int nnzb = (h_rows.empty()) ? mat->nonzeroes() : h_rows.back();
const int nnz = nnzb * dim * dim;
const int dim = (*bridgeMat)[0][0].N();
const int Nb = bridgeMat->N();
const int nnzb = bridgeMat->nonzeroes();
if (dim != 3) {
OpmLog::warning("BdaSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program");
@@ -215,26 +235,47 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system([[maybe_unu
return;
}
if (h_rows.capacity() == 0) {
if (!matrix) {
h_rows.reserve(Nb+1);
h_cols.reserve(nnzb);
copySparsityPatternFromISTL(*mat, h_rows, h_cols);
copySparsityPatternFromISTL(*bridgeMat, h_rows, h_cols);
checkMemoryContiguous(*bridgeMat);
matrix = std::make_unique<Opm::Accelerator::BlockedMatrix>(Nb, nnzb, block_size, static_cast<double*>(&(((*bridgeMat)[0][0][0][0]))), h_cols.data(), h_rows.data());
}
Dune::Timer t_zeros;
int numZeros = checkZeroDiagonal(*mat);
int numZeros = checkZeroDiagonal(*bridgeMat);
if (verbosity >= 2) {
std::ostringstream out;
out << "Checking zeros took: " << t_zeros.stop() << " s, found " << numZeros << " zeros";
OpmLog::info(out.str());
}
if (numJacobiBlocks >= 2) {
const int jacNnzb = (h_jacRows.empty()) ? jacMat->nonzeroes() : h_jacRows.back();
if (!jacMatrix) {
h_jacRows.reserve(Nb+1);
h_jacCols.reserve(jacNnzb);
copySparsityPatternFromISTL(*jacMat, h_jacRows, h_jacCols);
checkMemoryContiguous(*jacMat);
jacMatrix = std::make_unique<Opm::Accelerator::BlockedMatrix>(Nb, jacNnzb, block_size, static_cast<double*>(&(((*jacMat)[0][0][0][0]))), h_jacCols.data(), h_jacRows.data());
}
Dune::Timer t_zeros2;
int jacNumZeros = checkZeroDiagonal(*jacMat);
if (verbosity >= 2) {
std::ostringstream out;
out << "Checking zeros for jacMat took: " << t_zeros2.stop() << " s, found " << jacNumZeros << " zeros";
OpmLog::info(out.str());
}
}
/////////////////////////
// actually solve
// assume that underlying data (nonzeroes) from b (Dune::BlockVector) are contiguous, if this is not the case, the chosen BdaSolver is expected to perform undefined behaviour
SolverStatus status = backend->solve_system(matrix, static_cast<double*>(&(b[0][0])), jacMatrix, wellContribs, result);
// assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, the chosen BdaSolver is expected to perform undefined behaviour
SolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), wellContribs, result);
switch(status) {
case SolverStatus::BDA_SOLVER_SUCCESS:
//OpmLog::info("BdaSolver converged");
@@ -268,7 +309,8 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::get_result([[maybe_unuse
}
template <class BridgeMatrix, class BridgeVector, int block_size>
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions([[maybe_unused]] WellContributions& wellContribs) {
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions([[maybe_unused]] WellContributions& wellContribs,
[[maybe_unused]] unsigned N) {
if(accelerator_mode.compare("opencl") == 0){
#if HAVE_OPENCL
const auto openclBackend = static_cast<const Opm::Accelerator::openclSolverBackend<block_size>*>(backend.get());
@@ -277,6 +319,7 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions([[
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
#endif
}
wellContribs.setVectorSize(N);
}
// the tests use Dune::FieldMatrix, Flow uses Opm::MatrixBlock

View File

@@ -23,6 +23,7 @@
#include "dune/istl/solver.hh" // for struct InverseOperatorResult
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
namespace Opm
@@ -43,6 +44,10 @@ private:
bool use_fpga = false;
std::string accelerator_mode;
std::unique_ptr<Opm::Accelerator::BdaSolver<block_size> > backend;
std::shared_ptr<Opm::Accelerator::BlockedMatrix> matrix; // 'stores' matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
std::shared_ptr<Opm::Accelerator::BlockedMatrix> jacMatrix; // 'stores' preconditioner matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
std::vector<int> h_rows, h_cols; // store the sparsity pattern of the matrix
std::vector<int> h_jacRows, h_jacCols; // store the sparsity pattern of the jacMatrix
public:
/// Construct a BdaBridge
@@ -61,11 +66,13 @@ public:
/// Solve linear system, A*x = b
/// \warning Values of A might get overwritten!
/// \param[in] mat matrix A, should be of type Dune::BCRSMatrix
/// \param[in] b vector b, should be of type Dune::BlockVector
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] result summary of solver result
void solve_system(BridgeMatrix *mat, BridgeVector &b, WellContributions& wellContribs, InverseOperatorResult &result);
/// \param[in] bridgeMat matrix A, should be of type Dune::BCRSMatrix
/// \param[in] jacMat matrix A, but modified for the preconditioner, should be of type Dune::BCRSMatrix
/// \param[in] numJacobiBlocks number of jacobi blocks in jacMat
/// \param[in] b vector b, should be of type Dune::BlockVector
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] result summary of solver result
void solve_system(BridgeMatrix *bridgeMat, BridgeMatrix *jacMat, int numJacobiBlocks, BridgeVector &b, WellContributions& wellContribs, InverseOperatorResult &result);
/// Get the resulting x vector
/// \param[inout] x vector x, should be of type Dune::BlockVector
@@ -86,7 +93,8 @@ public:
/// Initialize the WellContributions object with opencl context and queue
/// those must be set before calling BlackOilWellModel::getWellContributions() in ISTL
/// \param[in] wellContribs container to hold all WellContributions
void initWellContributions(WellContributions& wellContribs);
/// \param[in] N number of rows in scalar vector that wellContribs will be applied on
void initWellContributions(WellContributions& wellContribs, unsigned N);
/// Return whether the BdaBridge will use the FPGA or not
/// return whether the BdaBridge will use the FPGA or not

View File

@@ -22,6 +22,7 @@
#include <opm/simulators/linalg/bda/BdaResult.hpp>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
#include <string>
@@ -85,9 +86,8 @@ namespace Accelerator {
virtual ~BdaSolver() {};
/// Defined as pure virtual functions, so every derived class must implement them
virtual SolverStatus solve_system(int N, int nnz, int dim,
double *vals, int *rows, int *cols,
double *b, WellContributions& wellContribs, BdaResult &res) = 0;
virtual SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) = 0;
virtual void get_result(double *x) = 0;

View File

@@ -37,14 +37,12 @@ namespace Accelerator
using Opm::OpmLog;
/*Sort a row of matrix elements from a blocked CSR-format.*/
void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned block_size) {
const unsigned int bs = block_size;
void sortRow(int *colIndices, int *data, int left, int right) {
int l = left;
int r = right;
int middle = colIndices[(l + r) >> 1];
double lDatum[bs * bs];
do {
while (colIndices[l] < middle)
l++;
@@ -54,9 +52,9 @@ void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned
int lColIndex = colIndices[l];
colIndices[l] = colIndices[r];
colIndices[r] = lColIndex;
memcpy(lDatum, data + l * bs * bs, sizeof(double) * bs * bs);
memcpy(data + l * bs * bs, data + r * bs * bs, sizeof(double) * bs * bs);
memcpy(data + r * bs * bs, lDatum, sizeof(double) * bs * bs);
int tmp = data[l];
data[l] = data[r];
data[r] = tmp;
l++;
r--;
@@ -64,10 +62,10 @@ void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned
} while (l < r);
if (left < r)
sortBlockedRow(colIndices, data, left, r, bs);
sortRow(colIndices, data, left, r);
if (right > l)
sortBlockedRow(colIndices, data, l, right, bs);
sortRow(colIndices, data, l, right);
}

View File

@@ -127,13 +127,13 @@ public:
};
/// Sort a row of matrix elements from a blocked CSR-format
/// \param[inout] colIndices
/// \param[inout] data
/// Sort a row of matrix elements from a CSR-format, where the nonzeroes are ints
/// These ints aren't actually nonzeroes, but represent a mapping used later
/// \param[inout] colIndices represent keys in sorting
/// \param[inout] data sorted according to the colIndices
/// \param[in] left lower index of data of row
/// \param[in] right upper index of data of row
/// \param[in] block_size size of blocks in the row
void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned block_size);
void sortRow(int *colIndices, int *data, int left, int right);
/// Multiply and subtract blocks
/// a = a - (b * c)

View File

@@ -225,16 +225,20 @@ void FpgaSolverBackend<block_size>::get_result(double *x_)
template <unsigned int block_size>
SolverStatus FpgaSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions&, BdaResult &res)
SolverStatus FpgaSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
double *b,
[[maybe_unused]] std::shared_ptr<BlockedMatrix> jacMatrix,
WellContributions&,
BdaResult &res)
{
if (initialized == false) {
initialize(N_, nnz_, dim, vals, rows, cols);
initialize(matrix->Nb, matrix->nnzbzs, matrix->nnzValues, matrix->rowPointers, matrix->colIndices);
if (!analyse_matrix()) {
return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
}
}
perf_call.emplace_back();
update_system(vals, b);
update_system(matrix->nnzValues, b);
if (!create_preconditioner()) {
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
}
@@ -252,21 +256,21 @@ SolverStatus FpgaSolverBackend<block_size>::solve_system(int N_, int nnz_, int d
template <unsigned int block_size>
void FpgaSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols)
void FpgaSolverBackend<block_size>::initialize(int Nb_, int nnzbs, double *vals, int *rows, int *cols)
{
double start = second();
this->N = N_;
this->nnz = nnz_;
this->nnzb = nnz_ / block_size / block_size;
Nb = (N + dim - 1) / dim;
this->Nb = Nb_;
this->N = Nb * block_size;
this->nnzb = nnzbs;
this->nnz = nnzbs * block_size * block_size;
// allocate host memory for matrices and vectors
// actual data for mat points to std::vector.data() in ISTLSolverEbos, so no alloc/free here
mat.reset(new BlockedMatrix(N_ / block_size, nnz_ / block_size / block_size, block_size, vals, cols, rows));
std::ostringstream oss;
oss << "Initializing FPGA data, matrix size: " << this->N << " blocks, nnz: " << this->nnzb << " blocks, " << \
"block size: " << dim << ", total nnz: " << this->nnz << "\n";
oss << "Initializing FPGA data, matrix size: " << this->Nb << " blockrows, nnz: " << this->nnzb << " blocks, " << \
"block size: " << block_size << ", total nnz: " << this->nnz << "\n";
oss << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance;
OpmLog::info(oss.str());

View File

@@ -201,13 +201,12 @@ private:
unsigned short rst_settle_cycles = 0;
/// Allocate host memory
/// \param[in] N number of rows, divide by dim to get number of blockrows
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] dim size of block
/// \param[in] Nb number of blockrows
/// \param[in] nnzbs number of blocks
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
/// \param[in] cols array of columnIndices, contains nnz values
void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
void initialize(int Nb, int nnzbs, int dim, double *vals, int *rows, int *cols);
/// Reorder the linear system so it corresponds with the coloring
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
@@ -243,18 +242,15 @@ public:
~FpgaSolverBackend();
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
/// \param[in] N number of rows, divide by dim to get number of blockrows
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] dim size of block
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
/// \param[in] cols array of columnIndices, contains nnz values
/// \param[in] matrix matrix A
/// \param[in] b input vector, contains N values
/// \param[in] jacMatrix matrix for preconditioner
/// \param[in] wellContribs WellContributions, not used in FPGA solver because it requires them already added to matrix A
/// \param[inout] res summary of solver result
/// \return status code
SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
/// Get result after linear solve, and perform postprocessing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void get_result(double *x) override;

View File

@@ -29,6 +29,7 @@
#include <random>
#include <sstream>
#include <vector>
#include <cstring>
namespace {
@@ -176,39 +177,58 @@ int colorBlockedNodes(int rows, const int *CSRRowPointers, const int *CSRColIndi
/* Reorder a matrix by a specified input order.
* Both a to order array, which contains for every node from the old matrix where it will move in the new matrix,
* and the from order, which contains for every node in the new matrix where it came from in the old matrix.*/
void reorderBlockedMatrixByPattern(BlockedMatrix *mat, int *toOrder, int *fromOrder, BlockedMatrix *rmat) {
assert(mat->block_size == rmat->block_size);
* and the from order, which contains for every node in the new matrix where it came from in the old matrix.
* reordermapping_nonzeroes is filled with increasing indices, and reordered using the translated colIndices as keys,
* this means the resulting reordermapping_nonzeroes array contains the mapping
*/
void reorderBlockedMatrixByPattern(BlockedMatrix *mat, std::vector<int>& reordermapping_nonzeroes, int *toOrder, int *fromOrder, BlockedMatrix *rmat){
const unsigned int bs = mat->block_size;
int rIndex = 0;
int i, k;
unsigned int j;
std::vector<int> tmp(mat->nnzbs);
reordermapping_nonzeroes.resize(mat->nnzbs);
for(int i = 0; i < mat->nnzbs; ++i){
reordermapping_nonzeroes[i] = i;
}
rmat->rowPointers[0] = 0;
for (i = 0; i < mat->Nb; i++) {
for(int i = 0; i < mat->Nb; i++){
int thisRow = fromOrder[i];
// put thisRow from the old matrix into row i of the new matrix
rmat->rowPointers[i + 1] = rmat->rowPointers[i] + mat->rowPointers[thisRow + 1] - mat->rowPointers[thisRow];
for (k = mat->rowPointers[thisRow]; k < mat->rowPointers[thisRow + 1]; k++) {
for (j = 0; j < bs * bs; j++) {
rmat->nnzValues[rIndex * bs * bs + j] = mat->nnzValues[k * bs * bs + j];
}
rmat->rowPointers[i+1] = rmat->rowPointers[i] + mat->rowPointers[thisRow+1] - mat->rowPointers[thisRow];
for(int k = mat->rowPointers[thisRow]; k < mat->rowPointers[thisRow+1]; k++){
tmp[rIndex] = reordermapping_nonzeroes[k]; // only get 1 entry per block
rmat->colIndices[rIndex] = mat->colIndices[k];
rIndex++;
}
}
// re-assign column indices according to the new positions of the nodes referenced by the column indices
for (i = 0; i < mat->nnzbs; i++) {
for(int i = 0; i < mat->nnzbs; i++){
rmat->colIndices[i] = toOrder[rmat->colIndices[i]];
}
// re-sort the column indices of every row.
for (i = 0; i < mat->Nb; i++) {
sortBlockedRow(rmat->colIndices, rmat->nnzValues, rmat->rowPointers[i], rmat->rowPointers[i + 1] - 1, bs);
for(int i = 0; i < mat->Nb; i++){
sortRow(rmat->colIndices, tmp.data(), rmat->rowPointers[i], rmat->rowPointers[i+1]-1);
}
for(int i = 0; i < mat->nnzbs; i++){
reordermapping_nonzeroes[i] = tmp[i];
}
// std::copy();
}
/* Copy the nonzero blocks of mat into rmat according to a mapping:
 * block i of rmat is taken from block reordermapping_nonzeroes[i] of mat.
 * rmat must already be allocated and have the same block size as mat. */
void reorderNonzeroes(BlockedMatrix *mat, std::vector<int>& reordermapping_nonzeroes, BlockedMatrix *rmat){
    assert(mat->block_size == rmat->block_size);
    const unsigned int bs = mat->block_size;
    const size_t blockBytes = sizeof(double) * bs * bs; // bytes per nnz block
    for (int dst = 0; dst < mat->nnzbs; ++dst) {
        const int src = reordermapping_nonzeroes[dst];
        memcpy(rmat->nnzValues + dst * bs * bs, mat->nnzValues + src * bs * bs, blockBytes);
    }
}
/* Reorder a matrix according to the colors that every node of the matrix has received*/
/* Find a reorder mapping according to the colors that every node of the matrix has received */
void colorsToReordering(int Nb, std::vector<int>& colors, int numColors, int *toOrder, int *fromOrder, std::vector<int>& rowsPerColor) {
int reordered = 0;

View File

@@ -47,13 +47,22 @@ namespace Accelerator
template <unsigned int block_size>
int colorBlockedNodes(int rows, const int *CSRRowPointers, const int *CSRColIndices, const int *CSCColPointers, const int *CSCRowIndices, std::vector<int>& colors, int maxRowsPerColor, int maxColsPerColor);
/// Reorder the rows of the matrix according to the mapping in toOrder and fromOrder
/// rMat must be allocated already
/// \param[in] mat matrix to be reordered
/// \param[in] toOrder reorder pattern that lists for each index in the original order, to which index in the new order it should be moved
/// \param[in] fromOrder reorder pattern that lists for each index in the new order, from which index in the original order it was moved
/// \param[inout] rMat reordered Matrix
void reorderBlockedMatrixByPattern(BlockedMatrix *mat, int *toOrder, int *fromOrder, BlockedMatrix *rmat);
/// Reorder the sparsity pattern of the matrix according to the mapping in toOrder and fromOrder
/// Also find mapping for nnz blocks
/// rmat must be allocated already
/// \param[in] mat matrix to be reordered
/// \param[out] reordermapping_nonzeroes contains new index for every nnz block
/// \param[in] toOrder reorder pattern that lists for each index in the original order, to which index in the new order it should be moved
/// \param[in] fromOrder reorder pattern that lists for each index in the new order, from which index in the original order it was moved
/// \param[out] rmat reordered Matrix
void reorderBlockedMatrixByPattern(BlockedMatrix *mat, std::vector<int>& reordermapping_nonzeroes, int *toOrder, int *fromOrder, BlockedMatrix *rmat);
/// Write nnz blocks from mat to rmat, according to the mapping in reordermapping_nonzeroes
/// rmat must be allocated already
/// \param[in] mat matrix to be reordered
/// \param[in] reordermapping_nonzeroes contains old index for every nnz block, so rmat_nnz[i] == mat_nnz[mapping[i]]
/// \param[inout] rmat reordered Matrix
void reorderNonzeroes(BlockedMatrix *mat, std::vector<int>& reordermapping_nonzeroes, BlockedMatrix *rmat);
/// Compute reorder mapping from the color that each node has received
/// The toOrder, fromOrder and iters arrays must be allocated already

View File

@@ -101,6 +101,10 @@ void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
}
}
// Store the size N (number of scalar rows) of the vector that the well
// contributions will be applied to; set once via BdaBridge::initWellContributions.
void WellContributions::setVectorSize(unsigned N_) {
N = N_;
}
void WellContributions::addNumBlocks(unsigned int numBlocks)
{
if (allocated) {

View File

@@ -101,6 +101,10 @@ public:
/// \param[in] dim_wells number of rows
void setBlockSize(unsigned int dim, unsigned int dim_wells);
/// Set size of vector that the wells are applied to
/// \param[in] N size of vector
void setVectorSize(unsigned N);
/// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
/// \param[in] type indicate if C, D or B is sent
/// \param[in] colIndices columnindices of blocks in C or B, ignored for D

View File

@@ -52,14 +52,14 @@ amgclSolverBackend<block_size>::~amgclSolverBackend() {}
template <unsigned int block_size>
void amgclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim) {
this->N = N_;
this->nnz = nnz_;
this->nnzb = nnz_ / block_size / block_size;
void amgclSolverBackend<block_size>::initialize(int Nb_, int nnzbs) {
this->Nb = Nb_;
this->N = Nb * block_size;
this->nnzb = nnzbs;
this->nnz = nnzbs * block_size * block_size;
Nb = (N + dim - 1) / dim;
std::ostringstream out;
out << "Initializing amgclSolverBackend, matrix size: " << N << " blocks, nnzb: " << nnzb << "\n";
out << "Initializing amgclSolverBackend, matrix size: " << Nb << " blockrows, nnzb: " << nnzb << " blocks\n";
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
out << "DeviceID: " << deviceID << "\n";
OpmLog::info(out.str());
@@ -360,12 +360,17 @@ void amgclSolverBackend<block_size>::get_result(double *x_) {
template <unsigned int block_size>
SolverStatus amgclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions&, BdaResult &res) {
SolverStatus amgclSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
double *b,
[[maybe_unused]] std::shared_ptr<BlockedMatrix> jacMatrix,
[[maybe_unused]] WellContributions& wellContribs,
BdaResult &res)
{
if (initialized == false) {
initialize(N_, nnz_, dim);
convert_sparsity_pattern(rows, cols);
initialize(matrix->Nb, matrix->nnzbs);
convert_sparsity_pattern(matrix->rowPointers, matrix->colIndices);
}
convert_data(vals, rows);
convert_data(matrix->nnzValues, matrix->rowPointers);
solve_system(b, res);
return SolverStatus::BDA_SOLVER_SUCCESS;
}

View File

@@ -96,10 +96,9 @@ private:
std::once_flag vexcl_initialize;
#endif
/// Initialize host memory and determine amgcl parameters
/// \param[in] N number of rows, divide by dim to get number of blockrows
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] dim size of block
void initialize(int N, int nnz, int dim);
/// \param[in] Nb number of blockrows
/// \param[in] nnzbs number of blocks
void initialize(int Nb, int nnzbs);
/// Convert the BCSR sparsity pattern to a CSR one
/// \param[in] rows array of rowPointers, contains N/dim+1 values
@@ -129,22 +128,15 @@ public:
~amgclSolverBackend();
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
/// \param[in] N number of rows, divide by dim to get number of blockrows
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] nnz_prec number of nonzeroes of matrix for ILU0, divide by dim*dim to get number of blocks
/// \param[in] dim size of block
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
/// \param[in] cols array of columnIndices, contains nnz values
/// \param[in] vals_prec array of nonzeroes for preconditioner
/// \param[in] rows_prec array of rowPointers for preconditioner
/// \param[in] cols_prec array of columnIndices for preconditioner
/// \param[in] matrix matrix A
/// \param[in] b input vector, contains N values
/// \param[in] jacMatrix matrix for preconditioner
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
/// \return status code
SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
/// Get result after linear solve, and perform postprocessing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void get_result(double *x) override;

View File

@@ -104,10 +104,10 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
// apply ilu0
cusparseDbsrsv2_solve(cusparseHandle, order, \
operation, Nb, nnzb, &one, \
operation, Nb, nnzbs_prec, &one, \
descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_p, d_t, policy, d_buffer);
cusparseDbsrsv2_solve(cusparseHandle, order, \
operation, Nb, nnzb, &one, \
operation, Nb, nnzbs_prec, &one, \
descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_pw, policy, d_buffer);
// spmv
@@ -135,10 +135,10 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
// apply ilu0
cusparseDbsrsv2_solve(cusparseHandle, order, \
operation, Nb, nnzb, &one, \
operation, Nb, nnzbs_prec, &one, \
descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_r, d_t, policy, d_buffer);
cusparseDbsrsv2_solve(cusparseHandle, order, \
operation, Nb, nnzb, &one, \
operation, Nb, nnzbs_prec, &one, \
descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_s, policy, d_buffer);
// spmv
@@ -188,17 +188,25 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::initialize(int N, int nnz, int dim) {
this->N = N;
this->nnz = nnz;
this->nnzb = nnz / block_size / block_size;
Nb = (N + dim - 1) / dim;
void cusparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix) {
this->Nb = matrix->Nb;
this->N = Nb * block_size;
this->nnzb = matrix->nnzbs;
this->nnz = nnzb * block_size * block_size;
if (jacMatrix) {
useJacMatrix = true;
nnzbs_prec = jacMatrix->nnzbs;
} else {
nnzbs_prec = nnzb;
}
std::ostringstream out;
out << "Initializing GPU, matrix size: " << N << " blocks, nnz: " << nnzb << " blocks";
OpmLog::info(out.str());
out.str("");
out.clear();
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance;
out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnz: " << nnzb << " blocks\n";
if (useJacMatrix) {
out << "Blocks in ILU matrix: " << nnzbs_prec << "\n";
}
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
OpmLog::info(out.str());
cudaSetDevice(deviceID);
@@ -232,7 +240,15 @@ void cusparseSolverBackend<block_size>::initialize(int N, int nnz, int dim) {
cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
cudaMalloc((void**)&d_bCols, sizeof(int) * nnzb);
cudaMalloc((void**)&d_bRows, sizeof(int) * (Nb + 1));
cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
if (useJacMatrix) {
cudaMalloc((void**)&d_mVals, sizeof(double) * nnzbs_prec * block_size * block_size);
cudaMalloc((void**)&d_mCols, sizeof(int) * nnzbs_prec);
cudaMalloc((void**)&d_mRows, sizeof(int) * (Nb + 1));
} else {
cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
d_mCols = d_bCols;
d_mRows = d_bRows;
}
cudaCheckLastError("Could not allocate enough memory on GPU");
cublasSetStream(cublasHandle, stream);
@@ -261,6 +277,10 @@ void cusparseSolverBackend<block_size>::finalize() {
cudaFree(d_t);
cudaFree(d_v);
cudaFree(d_mVals);
if (useJacMatrix) {
cudaFree(d_mCols);
cudaFree(d_mRows);
}
cudaFree(d_bVals);
cudaFree(d_bCols);
cudaFree(d_bRows);
@@ -283,23 +303,32 @@ void cusparseSolverBackend<block_size>::finalize() {
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b) {
void cusparseSolverBackend<block_size>::copy_system_to_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix) {
Timer t;
#if COPY_ROW_BY_ROW
int sum = 0;
for (int i = 0; i < Nb; ++i) {
int size_row = rows[i + 1] - rows[i];
memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
sum += size_row * block_size * block_size;
}
cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
#else
cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
if (useJacMatrix) {
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
} else {
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
}
#endif
cudaMemcpyAsync(d_bCols, cols, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bRows, rows, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
if (useJacMatrix) {
cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
}
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
@@ -314,19 +343,24 @@ void cusparseSolverBackend<block_size>::copy_system_to_gpu(double *vals, int *ro
// don't copy rowpointers and colindices, they stay the same
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::update_system_on_gpu(double *vals, int *rows, double *b) {
void cusparseSolverBackend<block_size>::update_system_on_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix) {
Timer t;
#if COPY_ROW_BY_ROW
int sum = 0;
for (int i = 0; i < Nb; ++i) {
int size_row = rows[i + 1] - rows[i];
memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
sum += size_row * block_size * block_size;
}
cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
#else
cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
if (useJacMatrix) {
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
} else {
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
}
#endif
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
@@ -341,12 +375,6 @@ void cusparseSolverBackend<block_size>::update_system_on_gpu(double *vals, int *
} // end update_system_on_gpu()
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::reset_prec_on_gpu() {
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
}
template <unsigned int block_size>
bool cusparseSolverBackend<block_size>::analyse_matrix() {
@@ -380,12 +408,12 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
cusparseCreateBsrsv2Info(&info_U);
cudaCheckLastError("Could not create analysis info");
cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzb,
descr_M, d_bVals, d_bRows, d_bCols, block_size, info_M, &d_bufferSize_M);
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
descr_L, d_bVals, d_bRows, d_bCols, block_size, info_L, &d_bufferSize_L);
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
descr_U, d_bVals, d_bRows, d_bCols, block_size, info_U, &d_bufferSize_U);
cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzbs_prec,
descr_M, d_mVals, d_mRows, d_mCols, block_size, info_M, &d_bufferSize_M);
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzbs_prec,
descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, &d_bufferSize_L);
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzbs_prec,
descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, &d_bufferSize_U);
cudaCheckLastError();
d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
@@ -393,7 +421,7 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
// analysis of ilu LU decomposition
cusparseDbsrilu02_analysis(cusparseHandle, order, \
Nb, nnzb, descr_B, d_bVals, d_bRows, d_bCols, \
Nb, nnzbs_prec, descr_B, d_mVals, d_mRows, d_mCols, \
block_size, info_M, policy, d_buffer);
int structural_zero;
@@ -404,11 +432,11 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
// analysis of ilu apply
cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, \
Nb, nnzbs_prec, descr_L, d_mVals, d_mRows, d_mCols, \
block_size, info_L, policy, d_buffer);
cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, \
Nb, nnzbs_prec, descr_U, d_mVals, d_mRows, d_mCols, \
block_size, info_U, policy, d_buffer);
cudaCheckLastError("Could not analyse level information");
@@ -428,10 +456,8 @@ template <unsigned int block_size>
bool cusparseSolverBackend<block_size>::create_preconditioner() {
Timer t;
d_mCols = d_bCols;
d_mRows = d_bRows;
cusparseDbsrilu02(cusparseHandle, order, \
Nb, nnzb, descr_M, d_mVals, d_mRows, d_mCols, \
Nb, nnzbs_prec, descr_M, d_mVals, d_mRows, d_mCols, \
block_size, info_M, policy, d_buffer);
cudaCheckLastError("Could not perform ilu decomposition");
@@ -480,19 +506,23 @@ void cusparseSolverBackend<block_size>::get_result(double *x) {
template <unsigned int block_size>
SolverStatus cusparseSolverBackend<block_size>::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
SolverStatus cusparseSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
double *b,
std::shared_ptr<BlockedMatrix> jacMatrix,
WellContributions& wellContribs,
BdaResult &res)
{
if (initialized == false) {
initialize(N, nnz, dim);
copy_system_to_gpu(vals, rows, cols, b);
initialize(matrix, jacMatrix);
copy_system_to_gpu(matrix, b, jacMatrix);
} else {
update_system_on_gpu(vals, rows, b);
update_system_on_gpu(matrix, b, jacMatrix);
}
if (analysis_done == false) {
if (!analyse_matrix()) {
return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
}
}
reset_prec_on_gpu();
if (create_preconditioner()) {
solve_system(wellContribs, res);
} else {

View File

@@ -68,6 +68,9 @@ private:
bool analysis_done = false;
bool useJacMatrix = false;
int nnzbs_prec; // number of nonzero blocks in the matrix for preconditioner
// could be jacMatrix or matrix
/// Solve linear system using ilu0-bicgstab
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
@@ -75,29 +78,26 @@ private:
void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
/// Initialize GPU and allocate memory
/// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] dim size of block
void initialize(int N, int nnz, int dim);
/// \param[in] matrix matrix for spmv
/// \param[in] jacMatrix matrix for preconditioner
void initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix);
/// Clean memory
void finalize();
/// Copy linear system to GPU
/// \param[in] vals array of nonzeroes, each block is stored row-wise, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
/// \param[in] cols array of columnIndices, contains nnzb values
/// \param[in] b input vector, contains N values
void copy_system_to_gpu(double *vals, int *rows, int *cols, double *b);
/// also copy matrix for preconditioner if needed
/// \param[in] matrix matrix for spmv
/// \param[in] b input vector, contains N values
/// \param[in] jacMatrix matrix for preconditioner
void copy_system_to_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix);
// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
/// \param[in] vals array of nonzeroes, each block is stored row-wise, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values, only used if COPY_ROW_BY_ROW is true
/// \param[in] b input vector, contains N values
void update_system_on_gpu(double *vals, int *rows, double *b);
/// Reset preconditioner on GPU, ilu0-decomposition is done inplace by cusparse
void reset_prec_on_gpu();
/// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
/// also copy matrix for preconditioner if needed
/// \param[in] matrix matrix for spmv
/// \param[in] b input vector, contains N values
/// \param[in] jacMatrix matrix for preconditioner
void update_system_on_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix);
/// Analyse sparsity pattern to extract parallelism
/// \return true iff analysis was successful
@@ -126,18 +126,15 @@ public:
~cusparseSolverBackend();
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
/// \param[in] N number of rows, divide by dim to get number of blockrows
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] dim size of block
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
/// \param[in] cols array of columnIndices, contains nnz values
/// \param[in] matrix matrix A
/// \param[in] b input vector, contains N values
/// \param[in] jacMatrix matrix for preconditioner
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
/// \return status code
SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
/// Get resulting vector x after linear solve, also includes post processing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void get_result(double *x) override;

View File

@@ -51,6 +51,13 @@ BILU0<block_size>::BILU0(ILUReorder opencl_ilu_reorder_, int verbosity_) :
template <unsigned int block_size>
bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
{
return analyze_matrix(mat, nullptr);
}
template <unsigned int block_size>
bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat)
{
const unsigned int bs = block_size;
@@ -62,19 +69,27 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
std::vector<int> CSCRowIndices;
std::vector<int> CSCColPointers;
auto *matToDecompose = jacMat ? jacMat : mat; // decompose jacMat if valid, otherwise decompose mat
if (opencl_ilu_reorder == ILUReorder::NONE) {
LUmat = std::make_unique<BlockedMatrix>(*mat);
} else {
toOrder.resize(Nb);
fromOrder.resize(Nb);
CSCRowIndices.resize(nnzb);
CSCRowIndices.resize(matToDecompose->nnzbs);
CSCColPointers.resize(Nb + 1);
rmat = std::make_shared<BlockedMatrix>(mat->Nb, mat->nnzbs, block_size);
LUmat = std::make_unique<BlockedMatrix>(*rmat);
if (jacMat) {
rJacMat = std::make_shared<BlockedMatrix>(jacMat->Nb, jacMat->nnzbs, block_size);
LUmat = std::make_unique<BlockedMatrix>(*rJacMat);
} else {
LUmat = std::make_unique<BlockedMatrix>(*rmat);
}
Timer t_convert;
csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb);
if (verbosity >= 3) {
csrPatternToCsc(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb);
if(verbosity >= 3){
std::ostringstream out;
out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
OpmLog::info(out.str());
@@ -85,24 +100,33 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
std::ostringstream out;
if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
out << "BILU0 reordering strategy: " << "level_scheduling\n";
findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
findLevelScheduling(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
reorderBlockedMatrixByPattern(mat, reordermappingNonzeroes, toOrder.data(), fromOrder.data(), rmat.get());
if (jacMat) {
reorderBlockedMatrixByPattern(jacMat, jacReordermappingNonzeroes, toOrder.data(), fromOrder.data(), rJacMat.get());
}
} else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
out << "BILU0 reordering strategy: " << "graph_coloring\n";
findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
findGraphColoring<block_size>(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb, Nb, Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
reorderBlockedMatrixByPattern(mat, reordermappingNonzeroes, toOrder.data(), fromOrder.data(), rmat.get());
if (jacMat) {
reorderBlockedMatrixByPattern(jacMat, jacReordermappingNonzeroes, toOrder.data(), fromOrder.data(), rJacMat.get());
}
} else if (opencl_ilu_reorder == ILUReorder::NONE) {
out << "BILU0 reordering strategy: none\n";
// numColors = 1;
// rowsPerColor.emplace_back(Nb);
numColors = Nb;
for (int i = 0; i < Nb; ++i) {
for(int i = 0; i < Nb; ++i){
rowsPerColor.emplace_back(1);
}
} else {
OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
}
if (verbosity >= 1) {
if(verbosity >= 1){
out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n";
}
#if CHOW_PATEL
out << "BILU0 CHOW_PATEL: " << CHOW_PATEL << ", CHOW_PATEL_GPU: " << CHOW_PATEL_GPU;
#endif
@@ -112,8 +136,8 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
invDiagVals.resize(mat->Nb * bs * bs);
#if CHOW_PATEL
Lmat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, bs);
Umat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, bs);
Lmat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
Umat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
#endif
s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
@@ -137,7 +161,7 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0
for (int i = 0; i < numColors; ++i) {
rowsPerColorPrefix[i + 1] = rowsPerColorPrefix[i] + rowsPerColor[i];
rowsPerColorPrefix[i+1] = rowsPerColorPrefix[i] + rowsPerColor[i];
}
err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]);
@@ -149,21 +173,38 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
}
return true;
} // end init()
}
template <unsigned int block_size>
bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
{
return create_preconditioner(mat, nullptr);
}
template <unsigned int block_size>
bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat)
{
const unsigned int bs = block_size;
auto *m = mat;
if (opencl_ilu_reorder != ILUReorder::NONE) {
m = rmat.get();
auto *matToDecompose = jacMat;
if (opencl_ilu_reorder == ILUReorder::NONE) { // NONE should only be used in debug
matToDecompose = jacMat ? jacMat : mat;
} else {
Timer t_reorder;
reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rmat.get());
if (jacMat) {
matToDecompose = rJacMat.get();
reorderNonzeroes(mat, reordermappingNonzeroes, rmat.get());
reorderNonzeroes(jacMat, jacReordermappingNonzeroes, rJacMat.get());
} else {
matToDecompose = rmat.get();
reorderNonzeroes(mat, reordermappingNonzeroes, rmat.get());
}
if (verbosity >= 3) {
if (verbosity >= 3){
std::ostringstream out;
out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
OpmLog::info(out.str());
@@ -173,18 +214,18 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
// TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
// this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy
Timer t_copy;
memcpy(LUmat->nnzValues, m->nnzValues, sizeof(double) * bs * bs * m->nnzbs);
memcpy(LUmat->nnzValues, matToDecompose->nnzValues, sizeof(double) * bs * bs * matToDecompose->nnzbs);
if (verbosity >= 3) {
if (verbosity >= 3){
std::ostringstream out;
out << "BILU0 memcpy: " << t_copy.stop() << " s";
OpmLog::info(out.str());
}
#if CHOW_PATEL
chowPatelIlu.decomposition(queue.get(), context.get(),
chowPatelIlu.decomposition(queue, context,
LUmat.get(), Lmat.get(), Umat.get(),
invDiagVals.data(), diagIndex,
invDiagVals, diagIndex,
s.diagIndex, s.invDiagVals,
s.Lvals, s.Lcols, s.Lrows,
s.Uvals, s.Ucols, s.Urows);
@@ -192,23 +233,23 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
Timer t_copyToGpu;
events.resize(1);
err = queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]);
queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]);
std::call_once(pattern_uploaded, [&]() {
std::call_once(pattern_uploaded, [&](){
// find the positions of each diagonal block
// must be done after reordering
for (int row = 0; row < Nb; ++row) {
int rowStart = LUmat->rowPointers[row];
int rowEnd = LUmat->rowPointers[row + 1];
int rowEnd = LUmat->rowPointers[row+1];
auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row);
assert(candidate != LUmat->colIndices + rowEnd);
diagIndex[row] = candidate - LUmat->colIndices;
}
events.resize(4);
err |= queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
});
cl::WaitForEvents(events);
@@ -229,8 +270,8 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
cl::Event event;
for (int color = 0; color < numColors; ++color) {
const unsigned int firstRow = rowsPerColorPrefix[color];
const unsigned int lastRow = rowsPerColorPrefix[color + 1];
if (verbosity >= 4) {
const unsigned int lastRow = rowsPerColorPrefix[color+1];
if (verbosity >= 5) {
out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
}
OpenclKernels::ILU_decomp(firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, Nb, block_size);
@@ -245,6 +286,7 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
return true;
} // end create_preconditioner()
// kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
// however, if individual kernel calls are timed, waiting for events is needed
// behavior on other GPUs is untested

View File

@@ -55,6 +55,7 @@ class BILU0 : public Preconditioner<block_size>
private:
std::unique_ptr<BlockedMatrix> LUmat = nullptr;
std::shared_ptr<BlockedMatrix> rmat = nullptr; // only used with PAR_SIM
std::shared_ptr<BlockedMatrix> rJacMat = nullptr;
#if CHOW_PATEL
std::unique_ptr<BlockedMatrix> Lmat = nullptr, Umat = nullptr;
#endif
@@ -68,6 +69,9 @@ private:
ILUReorder opencl_ilu_reorder;
std::vector<int> reordermappingNonzeroes; // maps nonzero blocks to new location in reordered matrix
std::vector<int> jacReordermappingNonzeroes; // same but for jacMatrix
typedef struct {
cl::Buffer invDiagVals;
cl::Buffer diagIndex;
@@ -92,9 +96,11 @@ public:
// analysis, find reordering if specified
bool analyze_matrix(BlockedMatrix *mat) override;
bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
// ilu_decomposition
bool create_preconditioner(BlockedMatrix *mat) override;
bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
// apply preconditioner, x = prec(y)
void apply(const cl::Buffer& y, cl::Buffer& x) override;
@@ -114,6 +120,11 @@ public:
return rmat.get();
}
BlockedMatrix* getRJacMat()
{
return rJacMat.get();
}
std::tuple<std::vector<int>, std::vector<int>, std::vector<int>> get_preconditioner_structure()
{
return {{LUmat->rowPointers, LUmat->rowPointers + (Nb + 1)}, {LUmat->colIndices, LUmat->colIndices + nnzb}, diagIndex};
@@ -127,7 +138,6 @@ public:
return std::make_pair(s.LUvals, s.invDiagVals);
#endif
}
};
} // namespace Accelerator

View File

@@ -78,17 +78,30 @@ std::vector<int> buildCsrToCscOffsetMap(std::vector<int> colPointers, std::vecto
template <unsigned int block_size>
bool BISAI<block_size>::analyze_matrix(BlockedMatrix *mat)
{
return analyze_matrix(mat, nullptr);
}
template <unsigned int block_size>
bool BISAI<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat)
{
const unsigned int bs = block_size;
auto *m = mat;
this->N = mat->Nb * bs;
this->Nb = mat->Nb;
this->nnz = mat->nnzbs * bs * bs;
this->nnzb = mat->nnzbs;
if (jacMat) {
m = jacMat;
}
bilu0->analyze_matrix(mat);
this->N = m->Nb * bs;
this->Nb = m->Nb;
this->nnz = m->nnzbs * bs * bs;
this->nnzb = m->nnzbs;
return true;
if (jacMat) {
return bilu0->analyze_matrix(mat, jacMat);
} else {
return bilu0->analyze_matrix(mat);
}
}
template <unsigned int block_size>
@@ -162,7 +175,7 @@ void BISAI<block_size>::buildUpperSubsystemsStructures(){
}
template <unsigned int block_size>
bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat)
bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat)
{
const unsigned int bs = block_size;
@@ -172,7 +185,11 @@ bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat)
Dune::Timer t_preconditioner;
bilu0->create_preconditioner(mat);
if (jacMat) {
bilu0->create_preconditioner(mat, jacMat);
} else {
bilu0->create_preconditioner(mat);
}
std::call_once(initialize, [&]() {
std::tie(colPointers, rowIndices, diagIndex) = bilu0->get_preconditioner_structure();
@@ -255,6 +272,12 @@ bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat)
return true;
}
template <unsigned int block_size>
bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat)
{
return create_preconditioner(mat, nullptr);
}
template <unsigned int block_size>
void BISAI<block_size>::apply(const cl::Buffer& x, cl::Buffer& y){
const unsigned int bs = block_size;

View File

@@ -117,9 +117,11 @@ public:
// analysis, find reordering if specified
bool analyze_matrix(BlockedMatrix *mat) override;
bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
// ilu_decomposition
bool create_preconditioner(BlockedMatrix *mat) override;
bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
// apply preconditioner, x = prec(y)
void apply(const cl::Buffer& y, cl::Buffer& x) override;

View File

@@ -80,6 +80,42 @@ bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_) {
return success;
}
template <unsigned int block_size>
bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_, BlockedMatrix *jacMat) {
this->Nb = mat_->Nb;
this->nnzb = mat_->nnzbs;
this->N = Nb * block_size;
this->nnz = nnzb * block_size * block_size;
bool success = bilu0->analyze_matrix(mat_, jacMat);
if (opencl_ilu_reorder == ILUReorder::NONE) {
mat = mat_;
} else {
mat = bilu0->getRMat();
}
return success;
}
template <unsigned int block_size>
bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_, BlockedMatrix *jacMat) {
Dune::Timer t_bilu0;
bool result = bilu0->create_preconditioner(mat_, jacMat);
if (verbosity >= 3) {
std::ostringstream out;
out << "CPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
OpmLog::info(out.str());
}
Dune::Timer t_amg;
create_preconditioner_amg(mat); // already points to bilu0::rmat if needed
if (verbosity >= 3) {
std::ostringstream out;
out << "CPR create_preconditioner_amg(): " << t_amg.stop() << " s";
OpmLog::info(out.str());
}
return result;
}
template <unsigned int block_size>
bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_) {
@@ -101,6 +137,7 @@ bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_) {
return result;
}
// return the absolute value of the N elements for which the absolute value is highest
double get_absmax(const double *data, const int N) {
return std::abs(*std::max_element(data, data + N, [](double a, double b){return std::fabs(a) < std::fabs(b);}));

View File

@@ -125,6 +125,7 @@ public:
CPR(int verbosity, ILUReorder opencl_ilu_reorder);
bool analyze_matrix(BlockedMatrix *mat) override;
bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
// set own Opencl variables, but also that of the bilu0 preconditioner
void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue) override;
@@ -134,7 +135,8 @@ public:
void apply(const cl::Buffer& y, cl::Buffer& x) override;
bool create_preconditioner(BlockedMatrix *mat) override;
bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
int* getToOrder() override
{
return bilu0->getToOrder();

View File

@@ -52,10 +52,21 @@ std::unique_ptr<Preconditioner<block_size> > Preconditioner<block_size>::create(
}
}
template <unsigned int block_size>
bool Preconditioner<block_size>::analyze_matrix(BlockedMatrix *mat, [[maybe_unused]] BlockedMatrix *jacMat) {
return analyze_matrix(mat);
}
template <unsigned int block_size>
bool Preconditioner<block_size>::create_preconditioner(BlockedMatrix *mat, [[maybe_unused]] BlockedMatrix *jacMat) {
return create_preconditioner(mat);
}
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template std::unique_ptr<Preconditioner<n> > Preconditioner<n>::create(PreconditionerType, int, ILUReorder); \
template void Preconditioner<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&);
template std::unique_ptr<Preconditioner<n> > Preconditioner<n>::create(PreconditionerType, int, ILUReorder); \
template void Preconditioner<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&); \
template bool Preconditioner<n>::analyze_matrix(BlockedMatrix *, BlockedMatrix *); \
template bool Preconditioner<n>::create_preconditioner(BlockedMatrix *, BlockedMatrix *);
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);

View File

@@ -68,10 +68,14 @@ public:
// analyze matrix, e.g. the sparsity pattern
// probably only called once
// the version with two params can be overloaded, if not, it will default to using the one param version
virtual bool analyze_matrix(BlockedMatrix *mat) = 0;
virtual bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat);
// create/update preconditioner, probably used every linear solve
// the version with two params can be overloaded, if not, it will default to using the one param version
virtual bool create_preconditioner(BlockedMatrix *mat) = 0;
virtual bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat);
// get reordering mappings
virtual int* getToOrder() = 0;

View File

@@ -370,7 +370,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
", time per iteration: " << res.elapsed / it << ", iterations: " << it;
OpmLog::info(out.str());
}
if (verbosity >= 4) {
if (verbosity >= 3) {
std::ostringstream out;
out << "openclSolver::prec_apply: " << t_prec.elapsed() << " s\n";
out << "wellContributions::apply: " << t_well.elapsed() << " s\n";
@@ -383,14 +383,21 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
template <unsigned int block_size>
void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) {
this->N = N_;
this->nnz = nnz_;
this->nnzb = nnz_ / block_size / block_size;
void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix) {
this->Nb = matrix->Nb;
this->N = Nb * block_size;
this->nnzb = matrix->nnzbs;
this->nnz = nnzb * block_size * block_size;
if (jacMatrix) {
useJacMatrix = true;
}
Nb = (N + dim - 1) / dim;
std::ostringstream out;
out << "Initializing GPU, matrix size: " << N << " blocks, nnzb: " << nnzb << "\n";
out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnzb: " << nnzb << "\n";
if (useJacMatrix) {
out << "Blocks in ILU matrix: " << jacMatrix->nnzbs << "\n";
}
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
out << "PlatformID: " << platformID << ", deviceID: " << deviceID << "\n";
OpmLog::info(out.str());
@@ -401,9 +408,12 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
prec->setOpencl(context, queue);
#if COPY_ROW_BY_ROW
vals_contiguous.resize(nnz);
vals_contiguous = new double[N];
#endif
mat.reset(new BlockedMatrix(Nb, nnzb, block_size, vals, cols, rows));
// mat.reset(new BlockedMatrix(Nb, nnzb, block_size, vals, cols, rows));
// jacMat.reset(new BlockedMatrix(Nb, jac_nnzb, block_size, vals2, cols2, rows2));
mat = matrix;
jacMat = jacMatrix;
d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
@@ -441,7 +451,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
initialized = true;
} // end initialize()
template <unsigned int block_size>
void openclSolverBackend<block_size>::finalize() {
if (opencl_ilu_reorder != ILUReorder::NONE) {
@@ -527,8 +536,11 @@ template <unsigned int block_size>
bool openclSolverBackend<block_size>::analyze_matrix() {
Timer t;
// bool success = bilu0->init(mat.get());
bool success = prec->analyze_matrix(mat.get());
bool success;
if (useJacMatrix)
success = prec->analyze_matrix(mat.get(), jacMat.get());
else
success = prec->analyze_matrix(mat.get());
if (opencl_ilu_reorder == ILUReorder::NONE) {
rmat = mat.get();
@@ -579,7 +591,11 @@ template <unsigned int block_size>
bool openclSolverBackend<block_size>::create_preconditioner() {
Timer t;
bool result = prec->create_preconditioner(mat.get());
bool result;
if (useJacMatrix)
result = prec->create_preconditioner(mat.get(), jacMat.get());
else
result = prec->create_preconditioner(mat.get());
if (verbosity > 2) {
std::ostringstream out;
@@ -639,21 +655,26 @@ void openclSolverBackend<block_size>::get_result(double *x) {
template <unsigned int block_size>
SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
SolverStatus openclSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
double *b,
std::shared_ptr<BlockedMatrix> jacMatrix,
WellContributions& wellContribs,
BdaResult &res)
{
if (initialized == false) {
initialize(N_, nnz_, dim, vals, rows, cols);
initialize(matrix, jacMatrix);
if (analysis_done == false) {
if (!analyze_matrix()) {
return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
}
}
update_system(vals, b, wellContribs);
update_system(matrix->nnzValues, b, wellContribs);
if (!create_preconditioner()) {
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
}
copy_system_to_gpu();
} else {
update_system(vals, b, wellContribs);
update_system(matrix->nnzValues, b, wellContribs);
if (!create_preconditioner()) {
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
}

View File

@@ -63,12 +63,15 @@ private:
std::vector<cl::Device> devices;
bool useJacMatrix = false;
std::unique_ptr<Preconditioner<block_size> > prec;
// can perform blocked ILU0 and AMG on pressure component
bool is_root; // allow for nested solvers, the root solver is called by BdaBridge
int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
bool analysis_done = false;
std::unique_ptr<BlockedMatrix> mat = nullptr; // original matrix
std::shared_ptr<BlockedMatrix> mat = nullptr; // original matrix
std::shared_ptr<BlockedMatrix> jacMat = nullptr; // matrix for preconditioner
BlockedMatrix *rmat = nullptr; // reordered matrix (or original if no reordering), used for spmv
ILUReorder opencl_ilu_reorder; // reordering strategy
std::vector<cl::Event> events;
@@ -130,13 +133,9 @@ private:
void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
/// Initialize GPU and allocate memory
/// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] dim size of block
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
/// \param[in] cols array of columnIndices, contains nnz values
void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
/// \param[in] matrix matrix A
/// \param[in] jacMatrix matrix for preconditioner
void initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix);
/// Clean memory
void finalize();
@@ -189,21 +188,14 @@ public:
~openclSolverBackend();
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
/// \param[in] N number of rows, divide by dim to get number of blockrows
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] nnz_prec number of nonzeroes of matrix for ILU0, divide by dim*dim to get number of blocks
/// \param[in] dim size of block
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
/// \param[in] cols array of columnIndices, contains nnz values
/// \param[in] vals_prec array of nonzeroes for preconditioner
/// \param[in] rows_prec array of rowPointers for preconditioner
/// \param[in] cols_prec array of columnIndices for preconditioner
/// \param[in] matrix matrix A
/// \param[in] b input vector, contains N values
/// \param[in] jacMatrix matrix for preconditioner
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
/// \return status code
SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
/// Solve scalar linear system, for example a coarse system of an AMG preconditioner
/// Data is already on the GPU
@@ -218,7 +210,7 @@ public:
/// \param[in] context the opencl context to be used
/// \param[in] queue the opencl queue to be used
void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue);
}; // end class openclSolverBackend
} // namespace Accelerator

View File

@@ -35,12 +35,6 @@ using Accelerator::OpenclKernels;
void WellContributionsOCL::setOpenCLEnv(cl::Context* context_, cl::CommandQueue* queue_) {
this->context = context_;
this->queue = queue_;
}
void WellContributionsOCL::setKernel(Accelerator::stdwell_apply_kernel_type* kernel_,
Accelerator::stdwell_apply_no_reorder_kernel_type* kernel_no_reorder_) {
this->kernel = kernel_;
this->kernel_no_reorder = kernel_no_reorder_;
}
void WellContributionsOCL::setReordering(int* h_toOrder_, bool reorder_)

View File

@@ -35,8 +35,6 @@ namespace Opm
class WellContributionsOCL : public WellContributions
{
public:
void setKernel(Opm::Accelerator::stdwell_apply_kernel_type *kernel_,
Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_);
void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
/// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
@@ -56,8 +54,6 @@ protected:
cl::Context* context;
cl::CommandQueue* queue;
Opm::Accelerator::stdwell_apply_kernel_type* kernel;
Opm::Accelerator::stdwell_apply_no_reorder_kernel_type* kernel_no_reorder;
std::vector<cl::Event> events;
std::unique_ptr<cl::Buffer> d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;

View File

@@ -39,9 +39,9 @@ namespace detail
/// \param useWellConn Boolean that is true when UseWellContribusion is true
/// \param wellGraph Cell IDs of well cells stored in a graph.
template<class Grid, class W>
void setWellConnections(const Grid& grid, const W& wells, bool useWellConn, std::vector<std::set<int>>& wellGraph)
void setWellConnections(const Grid& grid, const W& wells, bool useWellConn, std::vector<std::set<int>>& wellGraph, int numJacobiBlocks)
{
if ( grid.comm().size() > 1)
if ( grid.comm().size() > 1 || numJacobiBlocks > 1)
{
Dune::CartesianIndexMapper< Grid > cartMapper( grid );
const int numCells = cartMapper.compressedSize(); // grid.numCells()