diff --git a/ebos/eclbasevanguard.hh b/ebos/eclbasevanguard.hh
index 2b29f9282..30a3c1aa8 100644
--- a/ebos/eclbasevanguard.hh
+++ b/ebos/eclbasevanguard.hh
@@ -85,6 +85,10 @@ struct EdgeWeightsMethod {
     using type = UndefinedProperty;
 };
 template<class TypeTag, class MyTypeTag>
+struct NumJacobiBlocks {
+    using type = UndefinedProperty;
+};
+template<class TypeTag, class MyTypeTag>
 struct OwnerCellsFirst {
     using type = UndefinedProperty;
 };
@@ -133,6 +137,10 @@ struct EdgeWeightsMethod<TypeTag, TTag::EclBaseVanguard> {
     static constexpr int value = 1;
 };
 template<class TypeTag>
+struct NumJacobiBlocks<TypeTag, TTag::EclBaseVanguard> {
+    static constexpr int value = 0;
+};
+template<class TypeTag>
 struct OwnerCellsFirst<TypeTag, TTag::EclBaseVanguard> {
     static constexpr bool value = true;
 };
@@ -211,6 +219,8 @@ public:
                              "When restarting: should we try to initialize wells and groups from historical SCHEDULE section.");
         EWOMS_REGISTER_PARAM(TypeTag, int, EdgeWeightsMethod,
                              "Choose edge-weighing strategy: 0=uniform, 1=trans, 2=log(trans).");
+        EWOMS_REGISTER_PARAM(TypeTag, int, NumJacobiBlocks,
+                             "Number of blocks to be created for the Block-Jacobi preconditioner.");
         EWOMS_REGISTER_PARAM(TypeTag, bool, OwnerCellsFirst,
                              "Order cells owned by rank before ghost/overlap cells.");
         EWOMS_REGISTER_PARAM(TypeTag, bool, SerialPartitioning,
@@ -235,6 +245,7 @@ public:
     {
         fileName_ = EWOMS_GET_PARAM(TypeTag, std::string, EclDeckFileName);
         edgeWeightsMethod_   = Dune::EdgeWeightMethod(EWOMS_GET_PARAM(TypeTag, int, EdgeWeightsMethod));
+        numJacobiBlocks_ = EWOMS_GET_PARAM(TypeTag, int, NumJacobiBlocks);
         ownersFirst_ = EWOMS_GET_PARAM(TypeTag, bool, OwnerCellsFirst);
         serialPartitioning_ = EWOMS_GET_PARAM(TypeTag, bool, SerialPartitioning);
         zoltanImbalanceTol_ = EWOMS_GET_PARAM(TypeTag, double, ZoltanImbalanceTol);
diff --git a/ebos/eclcpgridvanguard.hh b/ebos/eclcpgridvanguard.hh
index c35fed0bd..34b0439c8 100644
--- a/ebos/eclcpgridvanguard.hh
+++ b/ebos/eclcpgridvanguard.hh
@@ -136,7 +136,7 @@ public:
                              this->serialPartitioning(), this->enableDistributedWells(),
                              this->zoltanImbalanceTol(), this->gridView(),
                              this->schedule(), this->centroids_,
-                             this->eclState(), this->parallelWells_);
+                             this->eclState(), this->parallelWells_, this->numJacobiBlocks());
 #endif
 
         this->updateGridView_();
diff --git a/ebos/eclgenericcpgridvanguard.cc b/ebos/eclgenericcpgridvanguard.cc
index 27025a882..2692c3cb4 100644
--- a/ebos/eclgenericcpgridvanguard.cc
+++ b/ebos/eclgenericcpgridvanguard.cc
@@ -82,12 +82,13 @@ void EclGenericCpGridVanguard<ElementMapper,GridView,Scalar>::doLoadBalance_(Dun
                                                                              const Schedule& schedule,
                                                                              std::vector<double>& centroids,
                                                                              EclipseState& eclState1,
-                                                                             EclGenericVanguard::ParallelWellStruct& parallelWells)
+                                                                             EclGenericVanguard::ParallelWellStruct& parallelWells,
+                                                                             int numJacobiBlocks)
 {
     int mpiSize = 1;
     MPI_Comm_size(grid_->comm(), &mpiSize);
 
-    if (mpiSize > 1) {
+    if (mpiSize > 1 || numJacobiBlocks > 1) {
         // the CpGrid's loadBalance() method likes to have the transmissibilities as
         // its edge weights. since this is (kind of) a layering violation and
         // transmissibilities are relatively expensive to compute, we only do it if
@@ -131,53 +132,61 @@ void EclGenericCpGridVanguard<ElementMapper,GridView,Scalar>::doLoadBalance_(Dun
         }
 
         //distribute the grid and switch to the distributed view.
-        {
-            const auto wells = schedule.getWellsatEnd();
-
-            try
+        if (mpiSize > 1) {
             {
-                auto& eclState = dynamic_cast<ParallelEclipseState&>(eclState1);
-                const EclipseGrid* eclGrid = nullptr;
+                const auto wells = schedule.getWellsatEnd();
 
-                if (grid_->comm().rank() == 0)
+                try
                 {
-                    eclGrid = &eclState.getInputGrid();
-                }
+                    auto& eclState = dynamic_cast<ParallelEclipseState&>(eclState1);
+                    const EclipseGrid* eclGrid = nullptr;
 
-                PropsCentroidsDataHandle<Dune::CpGrid> handle(*grid_, eclState, eclGrid, centroids,
-                                                              cartesianIndexMapper());
-                if (loadBalancerSet)
-                {
-                    std::vector<int> parts;
                     if (grid_->comm().rank() == 0)
                     {
-                        parts =  (*externalLoadBalancer)(*grid_);
+                        eclGrid = &eclState.getInputGrid();
                     }
-                    parallelWells = std::get<1>(grid_->loadBalance(handle, parts, &wells, ownersFirst, false, 1));
-                }
-                else
-                {
-                    parallelWells =
-                        std::get<1>(grid_->loadBalance(handle, edgeWeightsMethod, &wells, serialPartitioning,
-                                                       faceTrans.data(), ownersFirst, false, 1, true, zoltanImbalanceTol,
-                                                       enableDistributedWells));
-                }
-            }
-            catch(const std::bad_cast& e)
-            {
-                std::ostringstream message;
-                message << "Parallel simulator setup is incorrect as it does not use ParallelEclipseState ("
-                        << e.what() <<")"<<std::flush;
-                OpmLog::error(message.str());
-                std::rethrow_exception(std::current_exception());
-            }
-        }
-        grid_->switchToDistributedView();
 
+                    PropsCentroidsDataHandle<Dune::CpGrid> handle(*grid_, eclState, eclGrid, centroids,
+                                                                  cartesianIndexMapper());
+                    if (loadBalancerSet)
+                    {
+                        std::vector<int> parts;
+                        if (grid_->comm().rank() == 0)
+                        {
+                         parts =  (*externalLoadBalancer)(*grid_);
+                        }
+                        parallelWells = std::get<1>(grid_->loadBalance(handle, parts, &wells, ownersFirst, false, 1));
+                    }
+                    else
+                    {
+                        parallelWells =
+                            std::get<1>(grid_->loadBalance(handle, edgeWeightsMethod, &wells, serialPartitioning,
+                                                           faceTrans.data(), ownersFirst, false, 1, true, zoltanImbalanceTol,
+                                                           enableDistributedWells));
+                    }
+                }
+                catch(const std::bad_cast& e)
+                {
+                    std::ostringstream message;
+                    message << "Parallel simulator setup is incorrect as it does not use ParallelEclipseState ("
+                            << e.what() <<")"<<std::flush;
+                    OpmLog::error(message.str());
+                    std::rethrow_exception(std::current_exception());
+                }
+            }
+            grid_->switchToDistributedView();
+        }
+        
         // Calling Schedule::filterConnections would remove any perforated
         // cells that exist only on other ranks even in the case of distributed wells
         // But we need all connections to figure out the first cell of a well (e.g. for
         // pressure). Hence this is now skipped. Rank 0 had everything even before.
+
+        if (numJacobiBlocks > 1 && mpiSize == 1) {
+            const auto wells = schedule.getWellsatEnd();
+            cell_part_.resize(grid_->numCells());
+            cell_part_ = grid_->zoltanPartitionWithoutScatter(&wells, faceTrans.data(), numJacobiBlocks, zoltanImbalanceTol);
+        }
     }
 }
 
diff --git a/ebos/eclgenericcpgridvanguard.hh b/ebos/eclgenericcpgridvanguard.hh
index 310c2a14c..13a7172e0 100644
--- a/ebos/eclgenericcpgridvanguard.hh
+++ b/ebos/eclgenericcpgridvanguard.hh
@@ -101,6 +101,10 @@ public:
      */
     const CartesianIndexMapper& equilCartesianIndexMapper() const;
 
+    std::vector<int> cellPartition() const
+    {
+        return cell_part_;
+    }
 protected:
     /*!
      * \brief Distribute the simulation grid over multiple processes
@@ -114,7 +118,8 @@ protected:
                         const GridView& gridv, const Schedule& schedule,
                         std::vector<double>& centroids,
                         EclipseState& eclState,
-                        EclGenericVanguard::ParallelWellStruct& parallelWells);
+                        EclGenericVanguard::ParallelWellStruct& parallelWells,
+                        int numJacobiBlocks);
 
     void distributeFieldProps_(EclipseState& eclState);
 #endif
@@ -137,6 +142,7 @@ protected:
     std::unique_ptr<CartesianIndexMapper> equilCartesianIndexMapper_;
 
     int mpiRank;
+    std::vector<int> cell_part_;
 };
 
 } // namespace Opm
diff --git a/ebos/eclgenericvanguard.hh b/ebos/eclgenericvanguard.hh
index 05b441584..200ecb508 100644
--- a/ebos/eclgenericvanguard.hh
+++ b/ebos/eclgenericvanguard.hh
@@ -249,6 +249,12 @@ public:
     Dune::EdgeWeightMethod edgeWeightsMethod() const
     { return edgeWeightsMethod_; }
 
+    /*!
+     * \brief Number of blocks in the Block-Jacobi preconditioner.
+     */
+    int numJacobiBlocks() const
+    { return numJacobiBlocks_; }
+
     /*!
      * \brief Parameter that decide if cells owned by rank are ordered before ghost cells.
      */
@@ -323,6 +329,7 @@ protected:
     std::string caseName_;
     std::string fileName_;
     Dune::EdgeWeightMethod edgeWeightsMethod_;
+    int numJacobiBlocks_;
     bool ownersFirst_;
     bool serialPartitioning_;
     double zoltanImbalanceTol_;
diff --git a/opm/simulators/linalg/ISTLSolverEbos.hpp b/opm/simulators/linalg/ISTLSolverEbos.hpp
index a77737b87..890fcc4ae 100644
--- a/opm/simulators/linalg/ISTLSolverEbos.hpp
+++ b/opm/simulators/linalg/ISTLSolverEbos.hpp
@@ -161,7 +161,6 @@ namespace Opm
             // Set it up manually
             ElementMapper elemMapper(simulator_.vanguard().gridView(), Dune::mcmgElementLayout());
             detail::findOverlapAndInterior(simulator_.vanguard().grid(), elemMapper, overlapRows_, interiorRows_);
-
             useWellConn_ = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
 #if HAVE_FPGA
             // check usage of MatrixAddWellContributions: for FPGA they must be included
@@ -212,6 +211,17 @@ namespace Opm
                 // to the original one with a deleter that does nothing.
                 // Outch! We need to be able to scale the linear system! Hence const_cast
                 matrix_ = const_cast<Matrix*>(&M.istlMatrix());
+
+                // setup sparsity pattern for jacobi matrix for preconditioner (only used for openclSolver)
+                numJacobiBlocks_ = EWOMS_GET_PARAM(TypeTag, int, NumJacobiBlocks);
+                useWellConn_ = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
+                if (numJacobiBlocks_ > 1) {
+                    const auto wellsForConn = simulator_.vanguard().schedule().getWellsatEnd();
+                    detail::setWellConnections(simulator_.vanguard().grid(), wellsForConn, useWellConn_,
+                                               wellConnectionsGraph_, numJacobiBlocks_);
+                    std::cout << "Create block-Jacobi pattern" << std::endl;
+                    blockJacobiAdjacency();
+                }
             } else {
                 // Pointers should not change
                 if ( &(M.istlMatrix()) != matrix_ ) {
@@ -264,7 +274,7 @@ namespace Opm
             if (use_gpu || use_fpga) {
                 const std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
                 auto wellContribs = WellContributions::create(accelerator_mode, useWellConn_);
-                bdaBridge->initWellContributions(*wellContribs);
+                bdaBridge->initWellContributions(*wellContribs, x.N() * x[0].N());
 
                 // the WellContributions can only be applied separately with CUDA or OpenCL, not with an FPGA or amgcl
 #if HAVE_CUDA || HAVE_OPENCL
@@ -273,8 +283,15 @@ namespace Opm
                 }
 #endif
 
+                if (numJacobiBlocks_ > 1) {
+                    copyMatToBlockJac(getMatrix(), *blockJacobiForGPUILU0_);
                 // Const_cast needed since the CUDA stuff overwrites values for better matrix condition..
-                bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), *rhs_, *wellContribs, result);
+                    bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), &*blockJacobiForGPUILU0_,
+                                            numJacobiBlocks_, *rhs_, *wellContribs, result);
+                }
+                else
+                    bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), const_cast<Matrix*>(&getMatrix()),
+                                            numJacobiBlocks_, *rhs_, *wellContribs, result);
                 if (result.converged) {
                     // get result vector x from non-Dune backend, iff solve was successful
                     bdaBridge->get_result(x);
@@ -497,6 +514,68 @@ namespace Opm
                 }
         }
 
+        /// Create sparsity pattern for block-Jacobi matrix based on partitioning of grid.
+        /// Do not initialize the values, that is done in copyMatToBlockJac()
+        void blockJacobiAdjacency()
+        {
+            const auto& grid = simulator_.vanguard().grid();
+            std::vector<int> cell_part = simulator_.vanguard().cellPartition();
+
+            typedef typename Matrix::size_type size_type;
+            typedef typename Matrix::CreateIterator Iter;
+            size_type numCells = grid.size( 0 );
+            blockJacobiForGPUILU0_.reset(new Matrix(numCells, numCells, getMatrix().nonzeroes(), Matrix::row_wise));
+
+            const auto& lid = grid.localIdSet();
+            const auto& gridView = grid.leafGridView();
+            auto elemIt = gridView.template begin<0>(); // should never overrun, since blockJacobiForGPUILU0_ is initialized with numCells rows
+
+            //Loop over cells
+            for (Iter row = blockJacobiForGPUILU0_->createbegin(); row != blockJacobiForGPUILU0_->createend(); ++elemIt, ++row)
+            {
+                const auto& elem = *elemIt;
+                size_type idx = lid.id(elem);
+                row.insert(idx);
+
+                // Add well non-zero connections
+                for (auto wc = wellConnectionsGraph_[idx].begin(); wc!=wellConnectionsGraph_[idx].end(); ++wc) {
+                    row.insert(*wc);
+                }
+
+                int locPart = cell_part[idx];
+
+                //Add neighbor if it is on the same part
+                auto isend = gridView.iend(elem);
+                for (auto is = gridView.ibegin(elem); is!=isend; ++is)
+                {
+                    //check if face has neighbor
+                    if (is->neighbor())
+                    {
+                        size_type nid = lid.id(is->outside());
+                        int nabPart = cell_part[nid];
+                        if (locPart == nabPart) {
+                            row.insert(nid);
+                        }
+                    }
+                }
+            }
+        }
+
+        void copyMatToBlockJac(const Matrix& mat, Matrix& blockJac)
+        {
+            auto rbegin = blockJac.begin();
+            auto rend = blockJac.end();
+            auto outerRow = mat.begin();
+            for (auto row = rbegin; row != rend; ++row, ++outerRow) {
+                auto outerCol = (*outerRow).begin();
+                for (auto col = (*row).begin(); col != (*row).end(); ++col) {
+                    // outerRow is guaranteed to have all column entries that row has!
+                    while(outerCol.index() < col.index()) ++outerCol;
+                    assert(outerCol.index() == col.index());
+                    *col = *outerCol; // copy nonzero block
+                }
+            }
+        }
 
         Matrix& getMatrix()
         {
@@ -517,6 +596,8 @@ namespace Opm
         Matrix* matrix_;
         Vector *rhs_;
 
+        std::unique_ptr<Matrix> blockJacobiForGPUILU0_;
+
         std::unique_ptr<FlexibleSolverType> flexibleSolver_;
         std::unique_ptr<AbstractOperatorType> linearOperatorForFlexibleSolver_;
         std::unique_ptr<WellModelAsLinearOperator<WellModel, Vector, Vector>> wellOperator_;
@@ -530,6 +611,7 @@ namespace Opm
         FlowLinearSolverParameters parameters_;
         PropertyTree prm_;
         bool scale_variables_;
+        int numJacobiBlocks_;
 
         std::shared_ptr< CommunicationType > comm_;
     }; // end ISTLSolver
diff --git a/opm/simulators/linalg/bda/BdaBridge.cpp b/opm/simulators/linalg/bda/BdaBridge.cpp
index ee7c6e7c8..822561773 100644
--- a/opm/simulators/linalg/bda/BdaBridge.cpp
+++ b/opm/simulators/linalg/bda/BdaBridge.cpp
@@ -191,23 +191,43 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::copySparsityPatternFromI
 }
 
 
-template <class BridgeMatrix, class BridgeVector, int block_size>
-void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system([[maybe_unused]] BridgeMatrix* mat,
-                                                                     [[maybe_unused]] BridgeVector& b,
-                                                                     [[maybe_unused]] WellContributions& wellContribs,
-                                                                     [[maybe_unused]] InverseOperatorResult& res)
-{
+// check if the nnz values of the matrix are in contiguous memory
+// this is done by checking if the distance between the last value of the last block of row 0 and
+// the first value of the first row of row 1 is equal to 1
+// if the matrix only has 1 row, it is always contiguous
+template <class BridgeMatrix>
+void checkMemoryContiguous(const BridgeMatrix& mat) {
+    auto block_size = mat[0][0].N();
+    auto row = mat.begin();
+    auto last_of_row0 = row->begin();
 
+    // last_of_row0 points to last block, not to row->end()
+    for(auto tmp = row->begin(); tmp != row->end(); ++tmp) {
+        last_of_row0 = tmp;
+    }
+
+    bool isContiguous = mat.N() < 2 || std::distance(&((*last_of_row0)[block_size-1][block_size-1]), &(*mat[1].begin())[0][0]) == 1;
+
+    if (!isContiguous) {
+        OPM_THROW(std::logic_error, "Error memory of Matrix looks not contiguous");
+    }
+}
+
+
+template <class BridgeMatrix, class BridgeVector, int block_size>
+void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatrix* bridgeMat,
+                                                                     BridgeMatrix* jacMat,
+                                                                     int numJacobiBlocks,
+                                                                     BridgeVector& b,
+                                                                     WellContributions& wellContribs,
+                                                                     InverseOperatorResult& res)
+{
     if (use_gpu || use_fpga) {
         BdaResult result;
         result.converged = false;
-        static std::vector<int> h_rows;
-        static std::vector<int> h_cols;
-        const int dim = (*mat)[0][0].N();
-        const int Nb = mat->N();
-        const int N = Nb * dim;
-        const int nnzb = (h_rows.empty()) ? mat->nonzeroes() : h_rows.back();
-        const int nnz = nnzb * dim * dim;
+        const int dim = (*bridgeMat)[0][0].N();
+        const int Nb = bridgeMat->N();
+        const int nnzb = bridgeMat->nonzeroes();
 
         if (dim != 3) {
             OpmLog::warning("BdaSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program");
@@ -215,26 +235,47 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system([[maybe_unu
             return;
         }
 
-        if (h_rows.capacity() == 0) {
+        if (!matrix) {
             h_rows.reserve(Nb+1);
             h_cols.reserve(nnzb);
-            copySparsityPatternFromISTL(*mat, h_rows, h_cols);
+            copySparsityPatternFromISTL(*bridgeMat, h_rows, h_cols);
+            checkMemoryContiguous(*bridgeMat);
+            matrix = std::make_unique<Opm::Accelerator::BlockedMatrix>(Nb, nnzb, block_size, static_cast<double*>(&(((*bridgeMat)[0][0][0][0]))), h_cols.data(), h_rows.data());
         }
 
         Dune::Timer t_zeros;
-        int numZeros = checkZeroDiagonal(*mat);
+        int numZeros = checkZeroDiagonal(*bridgeMat);
         if (verbosity >= 2) {
             std::ostringstream out;
             out << "Checking zeros took: " << t_zeros.stop() << " s, found " << numZeros << " zeros";
             OpmLog::info(out.str());
         }
 
+        if (numJacobiBlocks >= 2) {
+            const int jacNnzb = (h_jacRows.empty()) ? jacMat->nonzeroes() : h_jacRows.back();
+
+            if (!jacMatrix) {
+                h_jacRows.reserve(Nb+1);
+                h_jacCols.reserve(jacNnzb);
+                copySparsityPatternFromISTL(*jacMat, h_jacRows, h_jacCols);
+                checkMemoryContiguous(*jacMat);
+                jacMatrix = std::make_unique<Opm::Accelerator::BlockedMatrix>(Nb, jacNnzb, block_size, static_cast<double*>(&(((*jacMat)[0][0][0][0]))), h_jacCols.data(), h_jacRows.data());
+            }
+
+            Dune::Timer t_zeros2;
+            int jacNumZeros = checkZeroDiagonal(*jacMat);
+            if (verbosity >= 2) {
+                std::ostringstream out;
+                out << "Checking zeros for jacMat took: " << t_zeros2.stop() << " s, found " << jacNumZeros << " zeros";
+                OpmLog::info(out.str());
+            }
+        }
 
         /////////////////////////
         // actually solve
+        // assume that underlying data (nonzeroes) from b (Dune::BlockVector) are contiguous, if this is not the case, the chosen BdaSolver is expected to perform undefined behaviour
+        SolverStatus status = backend->solve_system(matrix, static_cast<double*>(&(b[0][0])), jacMatrix, wellContribs, result);
 
-        // assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, the chosen BdaSolver is expected to perform undefined behaviour
-        SolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), wellContribs, result);
         switch(status) {
         case SolverStatus::BDA_SOLVER_SUCCESS:
             //OpmLog::info("BdaSolver converged");
@@ -268,7 +309,8 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::get_result([[maybe_unuse
 }
 
 template <class BridgeMatrix, class BridgeVector, int block_size>
-void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions([[maybe_unused]] WellContributions& wellContribs) {
+void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions([[maybe_unused]] WellContributions& wellContribs,
+                                                                              [[maybe_unused]] unsigned N) {
     if(accelerator_mode.compare("opencl") == 0){
 #if HAVE_OPENCL
         const auto openclBackend = static_cast<const Opm::Accelerator::openclSolverBackend<block_size>*>(backend.get());
@@ -277,6 +319,7 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions([[
         OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
 #endif
     }
+    wellContribs.setVectorSize(N);
 }
 
 // the tests use Dune::FieldMatrix, Flow uses Opm::MatrixBlock
diff --git a/opm/simulators/linalg/bda/BdaBridge.hpp b/opm/simulators/linalg/bda/BdaBridge.hpp
index 3d24051bb..4e79b9eae 100644
--- a/opm/simulators/linalg/bda/BdaBridge.hpp
+++ b/opm/simulators/linalg/bda/BdaBridge.hpp
@@ -23,6 +23,7 @@
 #include "dune/istl/solver.hh" // for struct InverseOperatorResult
 
 #include <opm/simulators/linalg/bda/BdaSolver.hpp>
+#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
 #include <opm/simulators/linalg/bda/ILUReorder.hpp>
 
 namespace Opm
@@ -43,6 +44,10 @@ private:
     bool use_fpga = false;
     std::string accelerator_mode;
     std::unique_ptr<Opm::Accelerator::BdaSolver<block_size> > backend;
+    std::shared_ptr<Opm::Accelerator::BlockedMatrix> matrix;  // 'stores' matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
+    std::shared_ptr<Opm::Accelerator::BlockedMatrix> jacMatrix;  // 'stores' preconditioner matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
+    std::vector<int> h_rows, h_cols;  // store the sparsity pattern of the matrix
+    std::vector<int> h_jacRows, h_jacCols;  // store the sparsity pattern of the jacMatrix
 
 public:
     /// Construct a BdaBridge
@@ -61,11 +66,13 @@ public:
 
     /// Solve linear system, A*x = b
     /// \warning Values of A might get overwritten!
-    /// \param[in] mat          matrix A, should be of type Dune::BCRSMatrix
-    /// \param[in] b            vector b, should be of type Dune::BlockVector
-    /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
-    /// \param[inout] result    summary of solver result
-    void solve_system(BridgeMatrix *mat, BridgeVector &b, WellContributions& wellContribs, InverseOperatorResult &result);
+    /// \param[in] bridgeMat       matrix A, should be of type Dune::BCRSMatrix
+    /// \param[in] jacMat          matrix A, but modified for the preconditioner, should be of type Dune::BCRSMatrix
+    /// \param[in] numJacobiBlocks number of jacobi blocks in jacMat
+    /// \param[in] b               vector b, should be of type Dune::BlockVector
+    /// \param[in] wellContribs    contains all WellContributions, to apply them separately, instead of adding them to matrix A
+    /// \param[inout] result       summary of solver result
+    void solve_system(BridgeMatrix *bridgeMat, BridgeMatrix *jacMat, int numJacobiBlocks, BridgeVector &b, WellContributions& wellContribs, InverseOperatorResult &result);
 
     /// Get the resulting x vector
     /// \param[inout] x    vector x, should be of type Dune::BlockVector
@@ -86,7 +93,8 @@ public:
     /// Initialize the WellContributions object with opencl context and queue
     /// those must be set before calling BlackOilWellModel::getWellContributions() in ISTL
     /// \param[in] wellContribs   container to hold all WellContributions
-    void initWellContributions(WellContributions& wellContribs);
+    /// \param[in] N              number of rows in scalar vector that wellContribs will be applied on
+    void initWellContributions(WellContributions& wellContribs, unsigned N);
 
     /// Return whether the BdaBridge will use the FPGA or not
     /// return whether the BdaBridge will use the FPGA or not
diff --git a/opm/simulators/linalg/bda/BdaSolver.hpp b/opm/simulators/linalg/bda/BdaSolver.hpp
index d84652d28..dd5b97094 100644
--- a/opm/simulators/linalg/bda/BdaSolver.hpp
+++ b/opm/simulators/linalg/bda/BdaSolver.hpp
@@ -22,6 +22,7 @@
 
 
 #include <opm/simulators/linalg/bda/BdaResult.hpp>
+#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
 
 #include <string>
 
@@ -85,9 +86,8 @@ namespace Accelerator {
         virtual ~BdaSolver() {};
 
         /// Define as pure virtual functions, so derivedclass must implement them
-        virtual SolverStatus solve_system(int N, int nnz, int dim,
-            double *vals, int *rows, int *cols,
-            double *b, WellContributions& wellContribs, BdaResult &res) = 0;
+        virtual SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
+            std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) = 0;
 
         virtual void get_result(double *x) = 0;
 
diff --git a/opm/simulators/linalg/bda/BlockedMatrix.cpp b/opm/simulators/linalg/bda/BlockedMatrix.cpp
index 5fbb73e19..c5b2074e7 100644
--- a/opm/simulators/linalg/bda/BlockedMatrix.cpp
+++ b/opm/simulators/linalg/bda/BlockedMatrix.cpp
@@ -37,14 +37,12 @@ namespace Accelerator
 
 using Opm::OpmLog;
 
-/*Sort a row of matrix elements from a blocked CSR-format.*/
 
-void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned block_size) {
-    const unsigned int bs = block_size;
+
+void sortRow(int *colIndices, int *data, int left, int right) {
     int l = left;
     int r = right;
     int middle = colIndices[(l + r) >> 1];
-    double lDatum[bs * bs];
     do {
         while (colIndices[l] < middle)
             l++;
@@ -54,9 +52,9 @@ void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned
             int lColIndex = colIndices[l];
             colIndices[l] = colIndices[r];
             colIndices[r] = lColIndex;
-            memcpy(lDatum, data + l * bs * bs, sizeof(double) * bs * bs);
-            memcpy(data + l * bs * bs, data + r * bs * bs, sizeof(double) * bs * bs);
-            memcpy(data + r * bs * bs, lDatum, sizeof(double) * bs * bs);
+            int tmp = data[l];
+            data[l] = data[r];
+            data[r] = tmp;
 
             l++;
             r--;
@@ -64,10 +62,10 @@ void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned
     } while (l < r);
 
     if (left < r)
-        sortBlockedRow(colIndices, data, left, r, bs);
+        sortRow(colIndices, data, left, r);
 
     if (right > l)
-        sortBlockedRow(colIndices, data, l, right, bs);
+        sortRow(colIndices, data, l, right);
 }
 
 
diff --git a/opm/simulators/linalg/bda/BlockedMatrix.hpp b/opm/simulators/linalg/bda/BlockedMatrix.hpp
index eab5c06ee..ba120e523 100644
--- a/opm/simulators/linalg/bda/BlockedMatrix.hpp
+++ b/opm/simulators/linalg/bda/BlockedMatrix.hpp
@@ -127,13 +127,13 @@ public:
 };
 
 
-/// Sort a row of matrix elements from a blocked CSR-format
-/// \param[inout] colIndices     
-/// \param[inout] data           
+/// Sort a row of matrix elements from a CSR-format, where the nonzeroes are ints
+/// These ints aren't actually nonzeroes, but represent a mapping used later
+/// \param[inout] colIndices     represent keys in sorting
+/// \param[inout] data           sorted according to the colIndices
 /// \param[in] left              lower index of data of row
 /// \param[in] right             upper index of data of row
-/// \param[in] block_size        size of blocks in the row
-void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned block_size);
+void sortRow(int *colIndices, int *data, int left, int right);
 
 /// Multiply and subtract blocks
 /// a = a - (b * c)
diff --git a/opm/simulators/linalg/bda/FPGASolverBackend.cpp b/opm/simulators/linalg/bda/FPGASolverBackend.cpp
index 9087d4074..875b96676 100644
--- a/opm/simulators/linalg/bda/FPGASolverBackend.cpp
+++ b/opm/simulators/linalg/bda/FPGASolverBackend.cpp
@@ -225,16 +225,20 @@ void FpgaSolverBackend<block_size>::get_result(double *x_)
 
 
 template <unsigned int block_size>
-SolverStatus FpgaSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions&, BdaResult &res)
+SolverStatus FpgaSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
+                                                         double *b,
+                                                         [[maybe_unused]] std::shared_ptr<BlockedMatrix> jacMatrix,
+                                                         WellContributions&,
+                                                         BdaResult &res)
 {
     if (initialized == false) {
-        initialize(N_, nnz_,  dim, vals, rows, cols);
+        initialize(matrix->Nb, matrix->nnzbzs, matrix->nnzValues, matrix->rowPointers, matrix->colIndices);
         if (!analyse_matrix()) {
             return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
         }
     }
     perf_call.emplace_back();
-    update_system(vals, b);
+    update_system(matrix->nnzValues, b);
     if (!create_preconditioner()) {
         return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
     }
@@ -252,21 +256,21 @@ SolverStatus FpgaSolverBackend<block_size>::solve_system(int N_, int nnz_, int d
 
 
 template <unsigned int block_size>
-void FpgaSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols)
+void FpgaSolverBackend<block_size>::initialize(int Nb_, int nnzbs, double *vals, int *rows, int *cols)
 {
     double start = second();
-    this->N = N_;
-    this->nnz = nnz_;
-    this->nnzb = nnz_ / block_size / block_size;
-    Nb = (N + dim - 1) / dim;
+    this->Nb = Nb_;
+    this->N = Nb * block_size;
+    this->nnzb = nnzbs;
+    this->nnz = nnzbs * block_size * block_size;
 
     // allocate host memory for matrices and vectors
     // actual data for mat points to std::vector.data() in ISTLSolverEbos, so no alloc/free here
     mat.reset(new BlockedMatrix(N_ / block_size, nnz_ / block_size / block_size, block_size, vals, cols, rows));
 
     std::ostringstream oss;
-    oss << "Initializing FPGA data, matrix size: " << this->N << " blocks, nnz: " << this->nnzb << " blocks, " << \
-        "block size: " << dim << ", total nnz: " << this->nnz << "\n";
+    oss << "Initializing FPGA data, matrix size: " << this->Nb << " blockrows, nnz: " << this->nnzb << " blocks, " << \
+        "block size: " << block_size << ", total nnz: " << this->nnz << "\n";
     oss << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance;
     OpmLog::info(oss.str());
 
diff --git a/opm/simulators/linalg/bda/FPGASolverBackend.hpp b/opm/simulators/linalg/bda/FPGASolverBackend.hpp
index 4d1e766f3..05e7c6d33 100644
--- a/opm/simulators/linalg/bda/FPGASolverBackend.hpp
+++ b/opm/simulators/linalg/bda/FPGASolverBackend.hpp
@@ -201,13 +201,12 @@ private:
     unsigned short rst_settle_cycles = 0;
 
     /// Allocate host memory
-    /// \param[in] N              number of nonzeroes, divide by dim*dim to get number of blocks
-    /// \param[in] nnz            number of nonzeroes, divide by dim*dim to get number of blocks
-    /// \param[in] dim            size of block
+    /// \param[in] Nb             number of blockrows
+    /// \param[in] nnzbs          number of blocks
     /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
     /// \param[in] rows           array of rowPointers, contains N/dim+1 values
     /// \param[in] cols           array of columnIndices, contains nnz values
-    void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
+    void initialize(int Nb, int nnzbs, int dim, double *vals, int *rows, int *cols);
 
     /// Reorder the linear system so it corresponds with the coloring
     /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
@@ -243,18 +242,15 @@ public:
     ~FpgaSolverBackend();
 
     /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
-    /// \param[in] N              number of rows, divide by dim to get number of blockrows
-    /// \param[in] nnz            number of nonzeroes, divide by dim*dim to get number of blocks
-    /// \param[in] dim            size of block
-    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
-    /// \param[in] rows           array of rowPointers, contains N/dim+1 values
-    /// \param[in] cols           array of columnIndices, contains nnz values
+    /// \param[in] matrix         matrix A
     /// \param[in] b              input vector, contains N values
+    /// \param[in] jacMatrix      matrix for preconditioner
     /// \param[in] wellContribs   WellContributions, not used in FPGA solver because it requires them already added to matrix A
     /// \param[inout] res         summary of solver result
     /// \return                   status code
-    SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
-
+    SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
+        std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
+    
     /// Get result after linear solve, and peform postprocessing if necessary
     /// \param[inout] x           resulting x vector, caller must guarantee that x points to a valid array
     void get_result(double *x) override;
diff --git a/opm/simulators/linalg/bda/Reorder.cpp b/opm/simulators/linalg/bda/Reorder.cpp
index 3711665b3..d46d2f303 100644
--- a/opm/simulators/linalg/bda/Reorder.cpp
+++ b/opm/simulators/linalg/bda/Reorder.cpp
@@ -29,6 +29,7 @@
 #include <random>
 #include <sstream>
 #include <vector>
+#include <cstring>
 
 namespace {
 
@@ -176,39 +177,58 @@ int colorBlockedNodes(int rows, const int *CSRRowPointers, const int *CSRColIndi
 
 /* Reorder a matrix by a specified input order.
  * Both a to order array, which contains for every node from the old matrix where it will move in the new matrix,
- * and the from order, which contains for every node in the new matrix where it came from in the old matrix.*/
-void reorderBlockedMatrixByPattern(BlockedMatrix *mat, int *toOrder, int *fromOrder, BlockedMatrix *rmat) {
-    assert(mat->block_size == rmat->block_size);
+ * and the from order, which contains for every node in the new matrix where it came from in the old matrix.
+ * reordermapping_nonzeroes is filled with increasing indices, and reordered using the translated colIndices as keys,
+ * this means the resulting reordermapping_nonzeroes array contains the mapping
+ */
+void reorderBlockedMatrixByPattern(BlockedMatrix *mat, std::vector<int>& reordermapping_nonzeroes, int *toOrder, int *fromOrder, BlockedMatrix *rmat){
 
-    const unsigned int bs = mat->block_size;
     int rIndex = 0;
-    int i, k;
-    unsigned int j;
+    std::vector<int> tmp(mat->nnzbs);
+
+    reordermapping_nonzeroes.resize(mat->nnzbs);
+    for(int i = 0; i < mat->nnzbs; ++i){
+        reordermapping_nonzeroes[i] = i;
+    }
 
     rmat->rowPointers[0] = 0;
-    for (i = 0; i < mat->Nb; i++) {
+    for(int i = 0; i < mat->Nb; i++){
         int thisRow = fromOrder[i];
         // put thisRow from the old matrix into row i of the new matrix
-        rmat->rowPointers[i + 1] = rmat->rowPointers[i] + mat->rowPointers[thisRow + 1] - mat->rowPointers[thisRow];
-        for (k = mat->rowPointers[thisRow]; k < mat->rowPointers[thisRow + 1]; k++) {
-            for (j = 0; j < bs * bs; j++) {
-                rmat->nnzValues[rIndex * bs * bs + j] = mat->nnzValues[k * bs * bs + j];
-            }
+        rmat->rowPointers[i+1] = rmat->rowPointers[i] + mat->rowPointers[thisRow+1] - mat->rowPointers[thisRow];
+        for(int k = mat->rowPointers[thisRow]; k < mat->rowPointers[thisRow+1]; k++){
+            tmp[rIndex] = reordermapping_nonzeroes[k]; // only get 1 entry per block
             rmat->colIndices[rIndex] = mat->colIndices[k];
             rIndex++;
         }
     }
     // re-assign column indices according to the new positions of the nodes referenced by the column indices
-    for (i = 0; i < mat->nnzbs; i++) {
+    for(int i = 0; i < mat->nnzbs; i++){
         rmat->colIndices[i] = toOrder[rmat->colIndices[i]];
     }
     // re-sort the column indices of every row.
-    for (i = 0; i < mat->Nb; i++) {
-        sortBlockedRow(rmat->colIndices, rmat->nnzValues, rmat->rowPointers[i], rmat->rowPointers[i + 1] - 1, bs);
+    for(int i = 0; i < mat->Nb; i++){
+        sortRow(rmat->colIndices, tmp.data(), rmat->rowPointers[i], rmat->rowPointers[i+1]-1);
+    }
+    for(int i = 0; i < mat->nnzbs; i++){
+        reordermapping_nonzeroes[i] = tmp[i];
+    }
+    // std::copy();
+}
+
+/* Reorder an array of nonzero blocks into another array, using a mapping */
+void reorderNonzeroes(BlockedMatrix *mat, std::vector<int>& reordermapping_nonzeroes, BlockedMatrix *rmat){
+    assert(mat->block_size == rmat->block_size);
+
+    const unsigned int bs = mat->block_size;
+
+    for(int i = 0; i < mat->nnzbs; i++){
+        int old_idx = reordermapping_nonzeroes[i];
+        memcpy(rmat->nnzValues+i*bs*bs, mat->nnzValues+old_idx*bs*bs, sizeof(double)*bs*bs); // copy nnz block
     }
 }
 
-/* Reorder a matrix according to the colors that every node of the matrix has received*/
+/* Find a reorder mapping according to the colors that every node of the matrix has received */
 
 void colorsToReordering(int Nb, std::vector<int>& colors, int numColors, int *toOrder, int *fromOrder, std::vector<int>& rowsPerColor) {
     int reordered = 0;
diff --git a/opm/simulators/linalg/bda/Reorder.hpp b/opm/simulators/linalg/bda/Reorder.hpp
index 7d90a67d0..fd02e8b4f 100644
--- a/opm/simulators/linalg/bda/Reorder.hpp
+++ b/opm/simulators/linalg/bda/Reorder.hpp
@@ -47,13 +47,22 @@ namespace Accelerator
 template <unsigned int block_size>
 int colorBlockedNodes(int rows, const int *CSRRowPointers, const int *CSRColIndices, const int *CSCColPointers, const int *CSCRowIndices, std::vector<int>& colors, int maxRowsPerColor, int maxColsPerColor);
 
-/// Reorder the rows of the matrix according to the mapping in toOrder and fromOrder
-/// rMat must be allocated already
-/// \param[in] mat           matrix to be reordered
-/// \param[in] toOrder       reorder pattern that lists for each index in the original order, to which index in the new order it should be moved
-/// \param[in] fromOrder     reorder pattern that lists for each index in the new order, from which index in the original order it was moved
-/// \param[inout] rMat       reordered Matrix
-void reorderBlockedMatrixByPattern(BlockedMatrix *mat, int *toOrder, int *fromOrder, BlockedMatrix *rmat);
+/// Reorder the sparsity pattern of the matrix according to the mapping in toOrder and fromOrder
+/// Also find mapping for nnz blocks
+/// rmat must be allocated already
+/// \param[in] mat                        matrix to be reordered
+/// \param[out] reordermapping_nonzeroes  contains new index for every nnz block
+/// \param[in] toOrder                    reorder pattern that lists for each index in the original order, to which index in the new order it should be moved
+/// \param[in] fromOrder                  reorder pattern that lists for each index in the new order, from which index in the original order it was moved
+/// \param[out] rmat                      reordered Matrix
+void reorderBlockedMatrixByPattern(BlockedMatrix *mat, std::vector<int>& reordermapping_nonzeroes, int *toOrder, int *fromOrder, BlockedMatrix *rmat);
+
+/// Write nnz blocks from mat to rmat, according to the mapping in reordermapping_nonzeroes
+/// rmat must be allocated already
+/// \param[in] mat                       matrix to be reordered
+/// \param[in] reordermapping_nonzeroes  contains old index for every nnz block, so rmat_nnz[i] == mat_nnz[mapping[i]]
+/// \param[inout] rmat                   reordered Matrix
+void reorderNonzeroes(BlockedMatrix *mat, std::vector<int>& reordermapping_nonzeroes, BlockedMatrix *rmat);
 
 /// Compute reorder mapping from the color that each node has received
 /// The toOrder, fromOrder and iters arrays must be allocated already
diff --git a/opm/simulators/linalg/bda/WellContributions.cpp b/opm/simulators/linalg/bda/WellContributions.cpp
index b4b170bd0..c92067775 100644
--- a/opm/simulators/linalg/bda/WellContributions.cpp
+++ b/opm/simulators/linalg/bda/WellContributions.cpp
@@ -101,6 +101,10 @@ void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
     }
 }
 
+void WellContributions::setVectorSize(unsigned N_) {
+    N = N_;
+}
+
 void WellContributions::addNumBlocks(unsigned int numBlocks)
 {
     if (allocated) {
diff --git a/opm/simulators/linalg/bda/WellContributions.hpp b/opm/simulators/linalg/bda/WellContributions.hpp
index d78c02111..619ce2466 100644
--- a/opm/simulators/linalg/bda/WellContributions.hpp
+++ b/opm/simulators/linalg/bda/WellContributions.hpp
@@ -101,6 +101,10 @@ public:
     /// \param[in] dim_wells   number of rows
     void setBlockSize(unsigned int dim, unsigned int dim_wells);
 
+    /// Set size of vector that the wells are applied to
+    /// \param[in] N          size of vector
+    void setVectorSize(unsigned N);
+
     /// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
     /// \param[in] type        indicate if C, D or B is sent
     /// \param[in] colIndices  columnindices of blocks in C or B, ignored for D
diff --git a/opm/simulators/linalg/bda/amgclSolverBackend.cpp b/opm/simulators/linalg/bda/amgclSolverBackend.cpp
index 957575fbf..58046abb3 100644
--- a/opm/simulators/linalg/bda/amgclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/amgclSolverBackend.cpp
@@ -52,14 +52,14 @@ amgclSolverBackend<block_size>::~amgclSolverBackend() {}
 
 
 template <unsigned int block_size>
-void amgclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim) {
-    this->N = N_;
-    this->nnz = nnz_;
-    this->nnzb = nnz_ / block_size / block_size;
+void amgclSolverBackend<block_size>::initialize(int Nb_, int nnzbs) {
+    this->Nb = Nb_;
+    this->N = Nb * block_size;
+    this->nnzb = nnzbs;
+    this->nnz = nnzbs * block_size * block_size;
 
-    Nb = (N + dim - 1) / dim;
     std::ostringstream out;
-    out << "Initializing amgclSolverBackend, matrix size: " << N << " blocks, nnzb: " << nnzb << "\n";
+    out << "Initializing amgclSolverBackend, matrix size: " << Nb << " blockrows, nnzb: " << nnzb << " blocks\n";
     out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
     out << "DeviceID: " << deviceID << "\n";
     OpmLog::info(out.str());
@@ -360,12 +360,17 @@ void amgclSolverBackend<block_size>::get_result(double *x_) {
 
 
 template <unsigned int block_size>
-SolverStatus amgclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions&, BdaResult &res) {
+SolverStatus amgclSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
+                                                           double *b,
+                                                           [[maybe_unused]] std::shared_ptr<BlockedMatrix> jacMatrix,
+                                                           [[maybe_unused]] WellContributions& wellContribs,
+                                                           BdaResult &res)
+{
     if (initialized == false) {
-        initialize(N_, nnz_, dim);
-        convert_sparsity_pattern(rows, cols);
+        initialize(matrix->Nb, matrix->nnzbs);
+        convert_sparsity_pattern(matrix->rowPointers, matrix->colIndices);
     }
-    convert_data(vals, rows);
+    convert_data(matrix->nnzValues, matrix->rowPointers);
     solve_system(b, res);
     return SolverStatus::BDA_SOLVER_SUCCESS;
 }
diff --git a/opm/simulators/linalg/bda/amgclSolverBackend.hpp b/opm/simulators/linalg/bda/amgclSolverBackend.hpp
index 930f81ff9..8896ea65e 100644
--- a/opm/simulators/linalg/bda/amgclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/amgclSolverBackend.hpp
@@ -96,10 +96,9 @@ private:
     std::once_flag vexcl_initialize;
 #endif
     /// Initialize host memory and determine amgcl parameters
-    /// \param[in] N              number of nonzeroes, divide by dim*dim to get number of blocks
-    /// \param[in] nnz            number of nonzeroes, divide by dim*dim to get number of blocks
-    /// \param[in] dim            size of block
-    void initialize(int N, int nnz, int dim);
+    /// \param[in] Nb               number of blockrows
+    /// \param[in] nnzbs            number of blocks
+    void initialize(int Nb, int nnzbs);
 
     /// Convert the BCSR sparsity pattern to a CSR one
     /// \param[in] rows           array of rowPointers, contains N/dim+1 values
@@ -129,22 +128,15 @@ public:
     ~amgclSolverBackend();
 
     /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
-    /// \param[in] N              number of rows, divide by dim to get number of blockrows
-    /// \param[in] nnz            number of nonzeroes, divide by dim*dim to get number of blocks
-    /// \param[in] nnz_prec       number of nonzeroes of matrix for ILU0, divide by dim*dim to get number of blocks
-    /// \param[in] dim            size of block
-    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
-    /// \param[in] rows           array of rowPointers, contains N/dim+1 values
-    /// \param[in] cols           array of columnIndices, contains nnz values
-    /// \param[in] vals_prec      array of nonzeroes for preconditioner
-    /// \param[in] rows_prec      array of rowPointers for preconditioner
-    /// \param[in] cols_prec      array of columnIndices for preconditioner
+    /// \param[in] matrix         matrix A
     /// \param[in] b              input vector, contains N values
+    /// \param[in] jacMatrix      matrix for preconditioner
     /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res         summary of solver result
     /// \return                   status code
-    SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
-
+    SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
+        std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
+    
     /// Get result after linear solve, and peform postprocessing if necessary
     /// \param[inout] x          resulting x vector, caller must guarantee that x points to a valid array
     void get_result(double *x) override;
diff --git a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu
index 46f345784..f6728cebb 100644
--- a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu
+++ b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu
@@ -104,10 +104,10 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
 
         // apply ilu0
         cusparseDbsrsv2_solve(cusparseHandle, order, \
-                              operation, Nb, nnzb, &one, \
+                              operation, Nb, nnzbs_prec, &one, \
                               descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_p, d_t, policy, d_buffer);
         cusparseDbsrsv2_solve(cusparseHandle, order, \
-                              operation, Nb, nnzb, &one, \
+                              operation, Nb, nnzbs_prec, &one, \
                               descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_pw, policy, d_buffer);
 
         // spmv
@@ -135,10 +135,10 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
 
         // apply ilu0
         cusparseDbsrsv2_solve(cusparseHandle, order, \
-                              operation, Nb, nnzb, &one, \
+                              operation, Nb, nnzbs_prec, &one, \
                               descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_r, d_t, policy, d_buffer);
         cusparseDbsrsv2_solve(cusparseHandle, order, \
-                              operation, Nb, nnzb, &one, \
+                              operation, Nb, nnzbs_prec, &one, \
                               descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_s, policy, d_buffer);
 
         // spmv
@@ -188,17 +188,25 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
 
 
 template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::initialize(int N, int nnz, int dim) {
-    this->N = N;
-    this->nnz = nnz;
-    this->nnzb = nnz / block_size / block_size;
-    Nb = (N + dim - 1) / dim;
+void cusparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix) {
+    this->Nb = matrix->Nb;
+    this->N = Nb * block_size;
+    this->nnzb = matrix->nnzbs;
+    this->nnz = nnzb * block_size * block_size;
+
+    if (jacMatrix) {
+        useJacMatrix = true;
+        nnzbs_prec = jacMatrix->nnzbs;
+    } else {
+        nnzbs_prec = nnzb;
+    }
+
     std::ostringstream out;
-    out << "Initializing GPU, matrix size: " << N << " blocks, nnz: " << nnzb << " blocks";
-    OpmLog::info(out.str());
-    out.str("");
-    out.clear();
-    out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance;
+    out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnz: " << nnzb << " blocks\n";
+    if (useJacMatrix) {
+        out << "Blocks in ILU matrix: " << nnzbs_prec << "\n";
+    }
+    out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
     OpmLog::info(out.str());
 
     cudaSetDevice(deviceID);
@@ -232,7 +240,15 @@ void cusparseSolverBackend<block_size>::initialize(int N, int nnz, int dim) {
     cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
     cudaMalloc((void**)&d_bCols, sizeof(int) * nnzb);
     cudaMalloc((void**)&d_bRows, sizeof(int) * (Nb + 1));
-    cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
+    if (useJacMatrix) {
+        cudaMalloc((void**)&d_mVals, sizeof(double) * nnzbs_prec * block_size * block_size);
+        cudaMalloc((void**)&d_mCols, sizeof(int) * nnzbs_prec);
+        cudaMalloc((void**)&d_mRows, sizeof(int) * (Nb + 1));
+    } else {
+        cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
+        d_mCols = d_bCols;
+        d_mRows = d_bRows;
+    }
     cudaCheckLastError("Could not allocate enough memory on GPU");
 
     cublasSetStream(cublasHandle, stream);
@@ -261,6 +277,10 @@ void cusparseSolverBackend<block_size>::finalize() {
         cudaFree(d_t);
         cudaFree(d_v);
         cudaFree(d_mVals);
+        if (useJacMatrix) {
+            cudaFree(d_mCols);
+            cudaFree(d_mRows);
+        }
         cudaFree(d_bVals);
         cudaFree(d_bCols);
         cudaFree(d_bRows);
@@ -283,23 +303,32 @@ void cusparseSolverBackend<block_size>::finalize() {
 
 
 template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b) {
+void cusparseSolverBackend<block_size>::copy_system_to_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix) {
     Timer t;
 
 #if COPY_ROW_BY_ROW
     int sum = 0;
     for (int i = 0; i < Nb; ++i) {
-        int size_row = rows[i + 1] - rows[i];
-        memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
+        int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
+        memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
         sum += size_row * block_size * block_size;
     }
     cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
 #else
-    cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    if (useJacMatrix) {
+        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
+    } else {
+        cudaMemcpyAsync(d_mVals, d_bVals, nnz  * sizeof(double), cudaMemcpyDeviceToDevice, stream);
+    }
 #endif
 
-    cudaMemcpyAsync(d_bCols, cols, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(d_bRows, rows, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
+    if (useJacMatrix) {
+        cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int), cudaMemcpyHostToDevice, stream);
+        cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
+    }
     cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
     cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
 
@@ -314,19 +343,24 @@ void cusparseSolverBackend<block_size>::copy_system_to_gpu(double *vals, int *ro
 
 // don't copy rowpointers and colindices, they stay the same
 template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::update_system_on_gpu(double *vals, int *rows, double *b) {
+void cusparseSolverBackend<block_size>::update_system_on_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix) {
     Timer t;
 
 #if COPY_ROW_BY_ROW
     int sum = 0;
     for (int i = 0; i < Nb; ++i) {
-        int size_row = rows[i + 1] - rows[i];
-        memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
+        int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
+        memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
         sum += size_row * block_size * block_size;
     }
     cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
 #else
-    cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    if (useJacMatrix) {
+        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
+    } else {
+        cudaMemcpyAsync(d_mVals, d_bVals, nnz  * sizeof(double), cudaMemcpyDeviceToDevice, stream);
+    }
 #endif
 
     cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
@@ -341,12 +375,6 @@ void cusparseSolverBackend<block_size>::update_system_on_gpu(double *vals, int *
 } // end update_system_on_gpu()
 
 
-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::reset_prec_on_gpu() {
-    cudaMemcpyAsync(d_mVals, d_bVals, nnz  * sizeof(double), cudaMemcpyDeviceToDevice, stream);
-}
-
-
 template <unsigned int block_size>
 bool cusparseSolverBackend<block_size>::analyse_matrix() {
 
@@ -380,12 +408,12 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
     cusparseCreateBsrsv2Info(&info_U);
     cudaCheckLastError("Could not create analysis info");
 
-    cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzb,
-                                 descr_M, d_bVals, d_bRows, d_bCols, block_size, info_M, &d_bufferSize_M);
-    cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
-                               descr_L, d_bVals, d_bRows, d_bCols, block_size, info_L, &d_bufferSize_L);
-    cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
-                               descr_U, d_bVals, d_bRows, d_bCols, block_size, info_U, &d_bufferSize_U);
+    cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzbs_prec,
+                                 descr_M, d_mVals, d_mRows, d_mCols, block_size, info_M, &d_bufferSize_M);
+    cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzbs_prec,
+                               descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, &d_bufferSize_L);
+    cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzbs_prec,
+                               descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, &d_bufferSize_U);
     cudaCheckLastError();
     d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
 
@@ -393,7 +421,7 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
 
     // analysis of ilu LU decomposition
     cusparseDbsrilu02_analysis(cusparseHandle, order, \
-                               Nb, nnzb, descr_B, d_bVals, d_bRows, d_bCols, \
+                               Nb, nnzbs_prec, descr_B, d_mVals, d_mRows, d_mCols, \
                                block_size, info_M, policy, d_buffer);
 
     int structural_zero;
@@ -404,11 +432,11 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
 
     // analysis of ilu apply
     cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
-                             Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, \
+                             Nb, nnzbs_prec, descr_L, d_mVals, d_mRows, d_mCols, \
                              block_size, info_L, policy, d_buffer);
 
     cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
-                             Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, \
+                             Nb, nnzbs_prec, descr_U, d_mVals, d_mRows, d_mCols, \
                              block_size, info_U, policy, d_buffer);
     cudaCheckLastError("Could not analyse level information");
 
@@ -428,10 +456,8 @@ template <unsigned int block_size>
 bool cusparseSolverBackend<block_size>::create_preconditioner() {
     Timer t;
 
-    d_mCols = d_bCols;
-    d_mRows = d_bRows;
     cusparseDbsrilu02(cusparseHandle, order, \
-                      Nb, nnzb, descr_M, d_mVals, d_mRows, d_mCols, \
+                      Nb, nnzbs_prec, descr_M, d_mVals, d_mRows, d_mCols, \
                       block_size, info_M, policy, d_buffer);
     cudaCheckLastError("Could not perform ilu decomposition");
 
@@ -480,19 +506,23 @@ void cusparseSolverBackend<block_size>::get_result(double *x) {
 
 
 template <unsigned int block_size>
-SolverStatus cusparseSolverBackend<block_size>::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
+SolverStatus cusparseSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
+                                                              double *b,
+                                                              std::shared_ptr<BlockedMatrix> jacMatrix,
+                                                              WellContributions& wellContribs,
+                                                              BdaResult &res)
+{
     if (initialized == false) {
-        initialize(N, nnz, dim);
-        copy_system_to_gpu(vals, rows, cols, b);
+        initialize(matrix, jacMatrix);
+        copy_system_to_gpu(matrix, b, jacMatrix);
     } else {
-        update_system_on_gpu(vals, rows, b);
+        update_system_on_gpu(matrix, b, jacMatrix);
     }
     if (analysis_done == false) {
         if (!analyse_matrix()) {
             return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
         }
     }
-    reset_prec_on_gpu();
     if (create_preconditioner()) {
         solve_system(wellContribs, res);
     } else {
diff --git a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp
index 791bf32fc..54a99f608 100644
--- a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp
@@ -68,6 +68,9 @@ private:
 
     bool analysis_done = false;
 
+    bool useJacMatrix = false;
+    int nnzbs_prec;             // number of nonzero blocks in the matrix for preconditioner
+                                // could be jacMatrix or matrix
 
     /// Solve linear system using ilu0-bicgstab
     /// \param[in] wellContribs   contains all WellContributions, to apply them separately, instead of adding them to matrix A
@@ -75,29 +78,26 @@ private:
     void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
 
     /// Initialize GPU and allocate memory
-    /// \param[in] N                number of nonzeroes, divide by dim*dim to get number of blocks
-    /// \param[in] nnz              number of nonzeroes, divide by dim*dim to get number of blocks
-    /// \param[in] dim              size of block
-    void initialize(int N, int nnz, int dim);
+    /// \param[in] matrix         matrix for spmv
+    /// \param[in] jacMatrix      matrix for preconditioner
+    void initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix);
 
     /// Clean memory
     void finalize();
 
     /// Copy linear system to GPU
-    /// \param[in] vals        array of nonzeroes, each block is stored row-wise, contains nnz values
-    /// \param[in] rows        array of rowPointers, contains N/dim+1 values
-    /// \param[in] cols        array of columnIndices, contains nnzb values
-    /// \param[in] b           input vector, contains N values
-    void copy_system_to_gpu(double *vals, int *rows, int *cols, double *b);
+    /// also copy matrix for preconditioner if needed
+    /// \param[in] matrix         matrix for spmv
+    /// \param[in] b              input vector, contains N values
+    /// \param[in] jacMatrix      matrix for preconditioner
+    void copy_system_to_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix);
 
-    // Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
-    /// \param[in] vals        array of nonzeroes, each block is stored row-wise, contains nnz values
-    /// \param[in] rows        array of rowPointers, contains N/dim+1 values, only used if COPY_ROW_BY_ROW is true
-    /// \param[in] b           input vector, contains N values
-    void update_system_on_gpu(double *vals, int *rows, double *b);
-
-    /// Reset preconditioner on GPU, ilu0-decomposition is done inplace by cusparse
-    void reset_prec_on_gpu();
+    /// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
+    /// also copy matrix for preconditioner if needed
+    /// \param[in] matrix         matrix for spmv
+    /// \param[in] b              input vector, contains N values
+    /// \param[in] jacMatrix      matrix for preconditioner
+    void update_system_on_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix);
 
     /// Analyse sparsity pattern to extract parallelism
     /// \return true iff analysis was successful
@@ -126,18 +126,15 @@ public:
     ~cusparseSolverBackend();
 
     /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
-    /// \param[in] N              number of rows, divide by dim to get number of blockrows
-    /// \param[in] nnz            number of nonzeroes, divide by dim*dim to get number of blocks
-    /// \param[in] dim            size of block
-    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
-    /// \param[in] rows           array of rowPointers, contains N/dim+1 values
-    /// \param[in] cols           array of columnIndices, contains nnz values
+    /// \param[in] matrix         matrix A
     /// \param[in] b              input vector, contains N values
+    /// \param[in] jacMatrix      matrix for preconditioner
     /// \param[in] wellContribs   contains all WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res         summary of solver result
     /// \return                   status code
-    SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
-
+    SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
+        std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
+    
     /// Get resulting vector x after linear solve, also includes post processing if necessary
     /// \param[inout] x        resulting x vector, caller must guarantee that x points to a valid array
     void get_result(double *x) override;
diff --git a/opm/simulators/linalg/bda/opencl/BILU0.cpp b/opm/simulators/linalg/bda/opencl/BILU0.cpp
index 0c2bc6364..b63f5959b 100644
--- a/opm/simulators/linalg/bda/opencl/BILU0.cpp
+++ b/opm/simulators/linalg/bda/opencl/BILU0.cpp
@@ -51,6 +51,13 @@ BILU0<block_size>::BILU0(ILUReorder opencl_ilu_reorder_, int verbosity_) :
 
 template <unsigned int block_size>
 bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
+{
+    return analyze_matrix(mat, nullptr);
+}
+
+
+template <unsigned int block_size>
+bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat)
 {
     const unsigned int bs = block_size;
 
@@ -62,19 +69,27 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
     std::vector<int> CSCRowIndices;
     std::vector<int> CSCColPointers;
 
+    auto *matToDecompose = jacMat ? jacMat : mat; // decompose jacMat if valid, otherwise decompose mat
+
     if (opencl_ilu_reorder == ILUReorder::NONE) {
         LUmat = std::make_unique<BlockedMatrix>(*mat);
     } else {
         toOrder.resize(Nb);
         fromOrder.resize(Nb);
-        CSCRowIndices.resize(nnzb);
+        CSCRowIndices.resize(matToDecompose->nnzbs);
         CSCColPointers.resize(Nb + 1);
         rmat = std::make_shared<BlockedMatrix>(mat->Nb, mat->nnzbs, block_size);
-        LUmat = std::make_unique<BlockedMatrix>(*rmat);
+
+        if (jacMat) {
+            rJacMat = std::make_shared<BlockedMatrix>(jacMat->Nb, jacMat->nnzbs, block_size);
+            LUmat = std::make_unique<BlockedMatrix>(*rJacMat);
+        } else {
+            LUmat = std::make_unique<BlockedMatrix>(*rmat);
+        }
 
         Timer t_convert;
-        csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb);
-        if (verbosity >= 3) {
+        csrPatternToCsc(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb);
+        if(verbosity >= 3){
             std::ostringstream out;
             out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
             OpmLog::info(out.str());
@@ -85,24 +100,33 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
     std::ostringstream out;
     if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
         out << "BILU0 reordering strategy: " << "level_scheduling\n";
-        findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
+        findLevelScheduling(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
+        reorderBlockedMatrixByPattern(mat, reordermappingNonzeroes, toOrder.data(), fromOrder.data(), rmat.get());
+        if (jacMat) {
+            reorderBlockedMatrixByPattern(jacMat, jacReordermappingNonzeroes, toOrder.data(), fromOrder.data(), rJacMat.get());
+        }
     } else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
         out << "BILU0 reordering strategy: " << "graph_coloring\n";
-        findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
+        findGraphColoring<block_size>(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb, Nb, Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
+        reorderBlockedMatrixByPattern(mat, reordermappingNonzeroes, toOrder.data(), fromOrder.data(), rmat.get());
+        if (jacMat) {
+            reorderBlockedMatrixByPattern(jacMat, jacReordermappingNonzeroes, toOrder.data(), fromOrder.data(), rJacMat.get());
+        }
     } else if (opencl_ilu_reorder == ILUReorder::NONE) {
         out << "BILU0 reordering strategy: none\n";
         // numColors = 1;
         // rowsPerColor.emplace_back(Nb);
         numColors = Nb;
-        for (int i = 0; i < Nb; ++i) {
+        for(int i = 0; i < Nb; ++i){
             rowsPerColor.emplace_back(1);
         }
     } else {
         OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
     }
-    if (verbosity >= 1) {
+    if(verbosity >= 1){
         out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n";
     }
+
 #if CHOW_PATEL
     out << "BILU0 CHOW_PATEL: " << CHOW_PATEL << ", CHOW_PATEL_GPU: " << CHOW_PATEL_GPU;
 #endif
@@ -112,8 +136,8 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
     invDiagVals.resize(mat->Nb * bs * bs);
 
 #if CHOW_PATEL
-    Lmat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, bs);
-    Umat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, bs);
+    Lmat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
+    Umat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
 #endif
 
     s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
@@ -137,7 +161,7 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
 
     rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0
     for (int i = 0; i < numColors; ++i) {
-        rowsPerColorPrefix[i + 1] = rowsPerColorPrefix[i] + rowsPerColor[i];
+        rowsPerColorPrefix[i+1] = rowsPerColorPrefix[i] + rowsPerColor[i];
     }
     err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]);
 
@@ -149,21 +173,38 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
     }
 
     return true;
-} // end init()
+}
+
 
 
 template <unsigned int block_size>
 bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
+{
+    return create_preconditioner(mat, nullptr);
+}
+
+
+template <unsigned int block_size>
+bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat)
 {
     const unsigned int bs = block_size;
-    auto *m = mat;
 
-    if (opencl_ilu_reorder != ILUReorder::NONE) {
-        m = rmat.get();
+    auto *matToDecompose = jacMat;
+
+    if (opencl_ilu_reorder == ILUReorder::NONE) { // NONE should only be used in debug
+        matToDecompose = jacMat ? jacMat : mat;
+    } else {
         Timer t_reorder;
-        reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rmat.get());
+        if (jacMat) {
+            matToDecompose = rJacMat.get();
+            reorderNonzeroes(mat, reordermappingNonzeroes, rmat.get());
+            reorderNonzeroes(jacMat, jacReordermappingNonzeroes, rJacMat.get());
+        } else {
+            matToDecompose = rmat.get();
+            reorderNonzeroes(mat, reordermappingNonzeroes, rmat.get());
+        }
 
-        if (verbosity >= 3) {
+        if (verbosity >= 3){
             std::ostringstream out;
             out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
             OpmLog::info(out.str());
@@ -173,18 +214,18 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
     // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
     // this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy
     Timer t_copy;
-    memcpy(LUmat->nnzValues, m->nnzValues, sizeof(double) * bs * bs * m->nnzbs);
+    memcpy(LUmat->nnzValues, matToDecompose->nnzValues, sizeof(double) * bs * bs * matToDecompose->nnzbs);
 
-    if (verbosity >= 3) {
+    if (verbosity >= 3){
         std::ostringstream out;
         out << "BILU0 memcpy: " << t_copy.stop() << " s";
         OpmLog::info(out.str());
     }
 
 #if CHOW_PATEL
-    chowPatelIlu.decomposition(queue.get(), context.get(),
+    chowPatelIlu.decomposition(queue, context,
                                LUmat.get(), Lmat.get(), Umat.get(),
-                               invDiagVals.data(), diagIndex,
+                               invDiagVals, diagIndex,
                                s.diagIndex, s.invDiagVals,
                                s.Lvals, s.Lcols, s.Lrows,
                                s.Uvals, s.Ucols, s.Urows);
@@ -192,23 +233,23 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
     Timer t_copyToGpu;
 
     events.resize(1);
-    err = queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]);
+    queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]);
 
-    std::call_once(pattern_uploaded, [&]() {
+    std::call_once(pattern_uploaded, [&](){
         // find the positions of each diagonal block
         // must be done after reordering
         for (int row = 0; row < Nb; ++row) {
             int rowStart = LUmat->rowPointers[row];
-            int rowEnd = LUmat->rowPointers[row + 1];
+            int rowEnd = LUmat->rowPointers[row+1];
 
             auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row);
             assert(candidate != LUmat->colIndices + rowEnd);
             diagIndex[row] = candidate - LUmat->colIndices;
         }
         events.resize(4);
-        err |= queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
-        err |= queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
-        err |= queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
+        queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
+        queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
+        queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
     });
 
     cl::WaitForEvents(events);
@@ -229,8 +270,8 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
     cl::Event event;
     for (int color = 0; color < numColors; ++color) {
         const unsigned int firstRow = rowsPerColorPrefix[color];
-        const unsigned int lastRow = rowsPerColorPrefix[color + 1];
-        if (verbosity >= 4) {
+        const unsigned int lastRow = rowsPerColorPrefix[color+1];
+        if (verbosity >= 5) {
             out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
         }
         OpenclKernels::ILU_decomp(firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, Nb, block_size);
@@ -245,6 +286,7 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
     return true;
 } // end create_preconditioner()
 
+
 // kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
 // however, if individual kernel calls are timed, waiting for events is needed
 // behavior on other GPUs is untested
diff --git a/opm/simulators/linalg/bda/opencl/BILU0.hpp b/opm/simulators/linalg/bda/opencl/BILU0.hpp
index b3b9ce125..c7d217eb0 100644
--- a/opm/simulators/linalg/bda/opencl/BILU0.hpp
+++ b/opm/simulators/linalg/bda/opencl/BILU0.hpp
@@ -55,6 +55,7 @@ class BILU0 : public Preconditioner<block_size>
 private:
     std::unique_ptr<BlockedMatrix> LUmat = nullptr;
     std::shared_ptr<BlockedMatrix> rmat = nullptr; // only used with PAR_SIM
+    std::shared_ptr<BlockedMatrix> rJacMat = nullptr; 
 #if CHOW_PATEL
     std::unique_ptr<BlockedMatrix> Lmat = nullptr, Umat = nullptr;
 #endif
@@ -68,6 +69,9 @@ private:
 
     ILUReorder opencl_ilu_reorder;
 
+    std::vector<int> reordermappingNonzeroes;    // maps nonzero blocks to new location in reordered matrix
+    std::vector<int> jacReordermappingNonzeroes; // same but for jacMatrix
+
     typedef struct {
         cl::Buffer invDiagVals;
         cl::Buffer diagIndex;
@@ -92,9 +96,11 @@ public:
 
     // analysis, find reordering if specified
     bool analyze_matrix(BlockedMatrix *mat) override;
+    bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
 
     // ilu_decomposition
     bool create_preconditioner(BlockedMatrix *mat) override;
+    bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
 
     // apply preconditioner, x = prec(y)
     void apply(const cl::Buffer& y, cl::Buffer& x) override;
@@ -114,6 +120,11 @@ public:
         return rmat.get();
     }
 
+    BlockedMatrix* getRJacMat()
+    {
+        return rJacMat.get();
+    }
+
     std::tuple<std::vector<int>, std::vector<int>, std::vector<int>> get_preconditioner_structure()
     {
         return {{LUmat->rowPointers, LUmat->rowPointers + (Nb + 1)}, {LUmat->colIndices, LUmat->colIndices + nnzb}, diagIndex};
@@ -127,7 +138,6 @@ public:
         return std::make_pair(s.LUvals, s.invDiagVals);
 #endif
     }
-
 };
 
 } // namespace Accelerator
diff --git a/opm/simulators/linalg/bda/opencl/BISAI.cpp b/opm/simulators/linalg/bda/opencl/BISAI.cpp
index 0febeebd4..695045ca5 100644
--- a/opm/simulators/linalg/bda/opencl/BISAI.cpp
+++ b/opm/simulators/linalg/bda/opencl/BISAI.cpp
@@ -78,17 +78,30 @@ std::vector<int> buildCsrToCscOffsetMap(std::vector<int> colPointers, std::vecto
 
 template <unsigned int block_size>
 bool BISAI<block_size>::analyze_matrix(BlockedMatrix *mat)
+{
+    return analyze_matrix(mat, nullptr);
+}
+
+template <unsigned int block_size>
+bool BISAI<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat)
 {
     const unsigned int bs = block_size;
+    auto *m = mat;
 
-    this->N = mat->Nb * bs;
-    this->Nb = mat->Nb;
-    this->nnz = mat->nnzbs * bs * bs;
-    this->nnzb = mat->nnzbs;
+    if (jacMat) {
+        m = jacMat;
+    }
 
-    bilu0->analyze_matrix(mat);
+    this->N = m->Nb * bs;
+    this->Nb = m->Nb;
+    this->nnz = m->nnzbs * bs * bs;
+    this->nnzb = m->nnzbs;
 
-    return true;
+    if (jacMat) {
+        return bilu0->analyze_matrix(mat, jacMat);
+    } else {
+        return bilu0->analyze_matrix(mat);
+    }
 }
 
 template <unsigned int block_size>
@@ -162,7 +175,7 @@ void BISAI<block_size>::buildUpperSubsystemsStructures(){
 }
 
 template <unsigned int block_size>
-bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat)
+bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat)
 {
     const unsigned int bs = block_size;
 
@@ -172,7 +185,11 @@ bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat)
 
     Dune::Timer t_preconditioner;
 
-    bilu0->create_preconditioner(mat);
+    if (jacMat) {
+        bilu0->create_preconditioner(mat, jacMat);
+    } else {
+        bilu0->create_preconditioner(mat);
+    }
 
     std::call_once(initialize, [&]() {
         std::tie(colPointers, rowIndices, diagIndex) = bilu0->get_preconditioner_structure();
@@ -255,6 +272,12 @@ bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat)
     return true;
 }
 
+template <unsigned int block_size>
+bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat)
+{
+    return create_preconditioner(mat, nullptr);
+}
+
 template <unsigned int block_size>
 void BISAI<block_size>::apply(const cl::Buffer& x, cl::Buffer& y){
     const unsigned int bs = block_size;
diff --git a/opm/simulators/linalg/bda/opencl/BISAI.hpp b/opm/simulators/linalg/bda/opencl/BISAI.hpp
index c73b1fd9d..f0fe85242 100644
--- a/opm/simulators/linalg/bda/opencl/BISAI.hpp
+++ b/opm/simulators/linalg/bda/opencl/BISAI.hpp
@@ -117,9 +117,11 @@ public:
 
     // analysis, find reordering if specified
     bool analyze_matrix(BlockedMatrix *mat) override;
+    bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
 
     // ilu_decomposition
     bool create_preconditioner(BlockedMatrix *mat) override;
+    bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
 
     // apply preconditioner, x = prec(y)
     void apply(const cl::Buffer& y, cl::Buffer& x) override;
diff --git a/opm/simulators/linalg/bda/opencl/CPR.cpp b/opm/simulators/linalg/bda/opencl/CPR.cpp
index 8de00dab5..9ae847c99 100644
--- a/opm/simulators/linalg/bda/opencl/CPR.cpp
+++ b/opm/simulators/linalg/bda/opencl/CPR.cpp
@@ -80,6 +80,42 @@ bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_) {
     return success;
 }
 
+template <unsigned int block_size>
+bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_, BlockedMatrix *jacMat) {
+    this->Nb = mat_->Nb;
+    this->nnzb = mat_->nnzbs;
+    this->N = Nb * block_size;
+    this->nnz = nnzb * block_size * block_size;
+
+    bool success = bilu0->analyze_matrix(mat_, jacMat);
+
+    if (opencl_ilu_reorder == ILUReorder::NONE) {
+        mat = mat_;
+    } else {
+        mat = bilu0->getRMat();
+    }
+    return success;
+}
+
+template <unsigned int block_size>
+bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_, BlockedMatrix *jacMat) {
+    Dune::Timer t_bilu0;
+    bool result = bilu0->create_preconditioner(mat_, jacMat);
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "CPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
+        OpmLog::info(out.str());
+    }
+
+    Dune::Timer t_amg;
+    create_preconditioner_amg(mat); // already points to bilu0::rmat if needed
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "CPR create_preconditioner_amg(): " << t_amg.stop() << " s";
+        OpmLog::info(out.str());
+    }
+    return result;
+}
 
 template <unsigned int block_size>
 bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_) {
@@ -101,6 +137,7 @@ bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_) {
     return result;
 }
 
+
 // return the absolute value of the N elements for which the absolute value is highest
 double get_absmax(const double *data, const int N) {
     return std::abs(*std::max_element(data, data + N, [](double a, double b){return std::fabs(a) < std::fabs(b);}));
diff --git a/opm/simulators/linalg/bda/opencl/CPR.hpp b/opm/simulators/linalg/bda/opencl/CPR.hpp
index 949148dd0..c81cee725 100644
--- a/opm/simulators/linalg/bda/opencl/CPR.hpp
+++ b/opm/simulators/linalg/bda/opencl/CPR.hpp
@@ -125,6 +125,7 @@ public:
     CPR(int verbosity, ILUReorder opencl_ilu_reorder);
 
     bool analyze_matrix(BlockedMatrix *mat) override;
+    bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
 
     // set own Opencl variables, but also that of the bilu0 preconditioner
     void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue) override;
@@ -134,7 +135,8 @@ public:
     void apply(const cl::Buffer& y, cl::Buffer& x) override;
 
     bool create_preconditioner(BlockedMatrix *mat) override;
-
+    bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
+    
     int* getToOrder() override
     {
         return bilu0->getToOrder();
diff --git a/opm/simulators/linalg/bda/opencl/Preconditioner.cpp b/opm/simulators/linalg/bda/opencl/Preconditioner.cpp
index f7256039b..8bf670b8c 100644
--- a/opm/simulators/linalg/bda/opencl/Preconditioner.cpp
+++ b/opm/simulators/linalg/bda/opencl/Preconditioner.cpp
@@ -52,10 +52,21 @@ std::unique_ptr<Preconditioner<block_size> > Preconditioner<block_size>::create(
     }
 }
 
+template <unsigned int block_size>
+bool Preconditioner<block_size>::analyze_matrix(BlockedMatrix *mat, [[maybe_unused]] BlockedMatrix *jacMat) {
+    return analyze_matrix(mat);
+}
+
+template <unsigned int block_size>
+bool Preconditioner<block_size>::create_preconditioner(BlockedMatrix *mat, [[maybe_unused]] BlockedMatrix *jacMat) {
+    return create_preconditioner(mat);
+}
 
 #define INSTANTIATE_BDA_FUNCTIONS(n)  \
-template std::unique_ptr<Preconditioner<n> > Preconditioner<n>::create(PreconditionerType, int, ILUReorder);  \
-template void Preconditioner<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&);
+template std::unique_ptr<Preconditioner<n> > Preconditioner<n>::create(PreconditionerType, int, ILUReorder);   \
+template void Preconditioner<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&); \
+template bool Preconditioner<n>::analyze_matrix(BlockedMatrix *, BlockedMatrix *);                             \
+template bool Preconditioner<n>::create_preconditioner(BlockedMatrix *, BlockedMatrix *);
 
 INSTANTIATE_BDA_FUNCTIONS(1);
 INSTANTIATE_BDA_FUNCTIONS(2);
diff --git a/opm/simulators/linalg/bda/opencl/Preconditioner.hpp b/opm/simulators/linalg/bda/opencl/Preconditioner.hpp
index a288f952d..f0af75d25 100644
--- a/opm/simulators/linalg/bda/opencl/Preconditioner.hpp
+++ b/opm/simulators/linalg/bda/opencl/Preconditioner.hpp
@@ -68,10 +68,14 @@ public:
 
     // analyze matrix, e.g. the sparsity pattern
     // probably only called once
+    // the version with two params can be overloaded, if not, it will default to using the one param version
     virtual bool analyze_matrix(BlockedMatrix *mat) = 0;
+    virtual bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat);
 
     // create/update preconditioner, probably used every linear solve
+    // the version with two params can be overloaded, if not, it will default to using the one param version
     virtual bool create_preconditioner(BlockedMatrix *mat) = 0;
+    virtual bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat);
 
     // get reordering mappings
     virtual int* getToOrder() = 0;
diff --git a/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp b/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp
index 855631887..dec83ce4c 100644
--- a/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp
@@ -370,7 +370,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
             ", time per iteration: " << res.elapsed / it << ", iterations: " << it;
         OpmLog::info(out.str());
     }
-    if (verbosity >= 4) {
+    if (verbosity >= 3) {
         std::ostringstream out;
         out << "openclSolver::prec_apply:  " << t_prec.elapsed() << " s\n";
         out << "wellContributions::apply:  " << t_well.elapsed() << " s\n";
@@ -383,14 +383,21 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
 
 
 template <unsigned int block_size>
-void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) {
-    this->N = N_;
-    this->nnz = nnz_;
-    this->nnzb = nnz_ / block_size / block_size;
+void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix) {
+    this->Nb = matrix->Nb;
+    this->N = Nb * block_size;
+    this->nnzb = matrix->nnzbs;
+    this->nnz = nnzb * block_size * block_size;
+
+    if (jacMatrix) {
+        useJacMatrix = true;
+    }
 
-    Nb = (N + dim - 1) / dim;
     std::ostringstream out;
-    out << "Initializing GPU, matrix size: " << N << " blocks, nnzb: " << nnzb << "\n";
+    out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnzb: " << nnzb << "\n";
+    if (useJacMatrix) {
+        out << "Blocks in ILU matrix: " << jacMatrix->nnzbs << "\n";
+    }
     out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
     out << "PlatformID: " << platformID << ", deviceID: " << deviceID << "\n";
     OpmLog::info(out.str());
@@ -401,9 +408,12 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
         prec->setOpencl(context, queue);
 
 #if COPY_ROW_BY_ROW
-        vals_contiguous.resize(nnz);
+        vals_contiguous = new double[N];
 #endif
-        mat.reset(new BlockedMatrix(Nb, nnzb, block_size, vals, cols, rows));
+        // mat.reset(new BlockedMatrix(Nb, nnzb, block_size, vals, cols, rows));
+        // jacMat.reset(new BlockedMatrix(Nb, jac_nnzb, block_size, vals2, cols2, rows2));
+        mat = matrix;
+        jacMat = jacMatrix;
 
         d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
         d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
@@ -441,7 +451,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
     initialized = true;
 } // end initialize()
 
-
 template <unsigned int block_size>
 void openclSolverBackend<block_size>::finalize() {
     if (opencl_ilu_reorder != ILUReorder::NONE) {
@@ -527,8 +536,11 @@ template <unsigned int block_size>
 bool openclSolverBackend<block_size>::analyze_matrix() {
     Timer t;
 
-    // bool success = bilu0->init(mat.get());
-    bool success = prec->analyze_matrix(mat.get());
+    bool success;
+    if (useJacMatrix)
+        success = prec->analyze_matrix(mat.get(), jacMat.get());
+    else
+        success = prec->analyze_matrix(mat.get());
 
     if (opencl_ilu_reorder == ILUReorder::NONE) {
         rmat = mat.get();
@@ -579,7 +591,11 @@ template <unsigned int block_size>
 bool openclSolverBackend<block_size>::create_preconditioner() {
     Timer t;
 
-    bool result = prec->create_preconditioner(mat.get());
+    bool result;
+    if (useJacMatrix)
+        result = prec->create_preconditioner(mat.get(), jacMat.get());
+    else
+        result = prec->create_preconditioner(mat.get());
 
     if (verbosity > 2) {
         std::ostringstream out;
@@ -639,21 +655,26 @@ void openclSolverBackend<block_size>::get_result(double *x) {
 
 
 template <unsigned int block_size>
-SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
+SolverStatus openclSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
+                                                           double *b,
+                                                           std::shared_ptr<BlockedMatrix> jacMatrix,
+                                                           WellContributions& wellContribs,
+                                                           BdaResult &res)
+{
     if (initialized == false) {
-        initialize(N_, nnz_,  dim, vals, rows, cols);
+        initialize(matrix, jacMatrix);
         if (analysis_done == false) {
             if (!analyze_matrix()) {
                 return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
             }
         }
-        update_system(vals, b, wellContribs);
+        update_system(matrix->nnzValues, b, wellContribs);
         if (!create_preconditioner()) {
             return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
         }
         copy_system_to_gpu();
     } else {
-        update_system(vals, b, wellContribs);
+        update_system(matrix->nnzValues, b, wellContribs);
         if (!create_preconditioner()) {
             return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
         }
diff --git a/opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp b/opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp
index d356520e6..cc9eb4fe0 100644
--- a/opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp
@@ -63,12 +63,15 @@ private:
 
     std::vector<cl::Device> devices;
 
+    bool useJacMatrix = false;
+
     std::unique_ptr<Preconditioner<block_size> > prec;
                                                                   // can perform blocked ILU0 and AMG on pressure component
     bool is_root;                                                 // allow for nested solvers, the root solver is called by BdaBridge
     int *toOrder = nullptr, *fromOrder = nullptr;                 // BILU0 reorders rows of the matrix via these mappings
     bool analysis_done = false;
-    std::unique_ptr<BlockedMatrix> mat = nullptr;                 // original matrix
+    std::shared_ptr<BlockedMatrix> mat = nullptr;                 // original matrix
+    std::shared_ptr<BlockedMatrix> jacMat = nullptr;              // matrix for preconditioner
     BlockedMatrix *rmat = nullptr;                                // reordered matrix (or original if no reordering), used for spmv
     ILUReorder opencl_ilu_reorder;                                // reordering strategy
     std::vector<cl::Event> events;
@@ -130,13 +133,9 @@ private:
     void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
 
     /// Initialize GPU and allocate memory
-    /// \param[in] N              number of nonzeroes, divide by dim*dim to get number of blocks
-    /// \param[in] nnz            number of nonzeroes, divide by dim*dim to get number of blocks
-    /// \param[in] dim            size of block
-    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
-    /// \param[in] rows           array of rowPointers, contains N/dim+1 values
-    /// \param[in] cols           array of columnIndices, contains nnz values
-    void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
+    /// \param[in] matrix     matrix A
+    /// \param[in] jacMatrix  matrix for preconditioner
+    void initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix);
 
     /// Clean memory
     void finalize();
@@ -189,21 +188,14 @@ public:
     ~openclSolverBackend();
 
     /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
-    /// \param[in] N              number of rows, divide by dim to get number of blockrows
-    /// \param[in] nnz            number of nonzeroes, divide by dim*dim to get number of blocks
-    /// \param[in] nnz_prec       number of nonzeroes of matrix for ILU0, divide by dim*dim to get number of blocks
-    /// \param[in] dim            size of block
-    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
-    /// \param[in] rows           array of rowPointers, contains N/dim+1 values
-    /// \param[in] cols           array of columnIndices, contains nnz values
-    /// \param[in] vals_prec      array of nonzeroes for preconditioner
-    /// \param[in] rows_prec      array of rowPointers for preconditioner
-    /// \param[in] cols_prec      array of columnIndices for preconditioner
+    /// \param[in] matrix         matrix A
     /// \param[in] b              input vector, contains N values
+    /// \param[in] jacMatrix      matrix for preconditioner
     /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res         summary of solver result
     /// \return                   status code
-    SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
+    SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
+        std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
 
     /// Solve scalar linear system, for example a coarse system of an AMG preconditioner
     /// Data is already on the GPU
@@ -218,7 +210,7 @@ public:
     /// \param[in] context   the opencl context to be used
     /// \param[in] queue     the opencl queue to be used
     void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue);
-
+    
 }; // end class openclSolverBackend
 
 } // namespace Accelerator
diff --git a/opm/simulators/linalg/bda/opencl/openclWellContributions.cpp b/opm/simulators/linalg/bda/opencl/openclWellContributions.cpp
index c6314c54a..6076a088e 100644
--- a/opm/simulators/linalg/bda/opencl/openclWellContributions.cpp
+++ b/opm/simulators/linalg/bda/opencl/openclWellContributions.cpp
@@ -35,12 +35,6 @@ using Accelerator::OpenclKernels;
 void WellContributionsOCL::setOpenCLEnv(cl::Context* context_, cl::CommandQueue* queue_) {
     this->context = context_;
     this->queue = queue_;
- }
-
-void WellContributionsOCL::setKernel(Accelerator::stdwell_apply_kernel_type* kernel_,
-                                     Accelerator::stdwell_apply_no_reorder_kernel_type* kernel_no_reorder_) {
-    this->kernel = kernel_;
-    this->kernel_no_reorder = kernel_no_reorder_;
 }
 
 void WellContributionsOCL::setReordering(int* h_toOrder_, bool reorder_)
diff --git a/opm/simulators/linalg/bda/opencl/openclWellContributions.hpp b/opm/simulators/linalg/bda/opencl/openclWellContributions.hpp
index c2be1a843..82215563c 100644
--- a/opm/simulators/linalg/bda/opencl/openclWellContributions.hpp
+++ b/opm/simulators/linalg/bda/opencl/openclWellContributions.hpp
@@ -35,8 +35,6 @@ namespace Opm
 class WellContributionsOCL : public WellContributions
 {
 public:
-    void setKernel(Opm::Accelerator::stdwell_apply_kernel_type *kernel_,
-                   Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_);
     void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
 
     /// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
@@ -56,8 +54,6 @@ protected:
 
     cl::Context* context;
     cl::CommandQueue* queue;
-    Opm::Accelerator::stdwell_apply_kernel_type* kernel;
-    Opm::Accelerator::stdwell_apply_no_reorder_kernel_type* kernel_no_reorder;
     std::vector<cl::Event> events;
 
     std::unique_ptr<cl::Buffer> d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
diff --git a/opm/simulators/linalg/findOverlapRowsAndColumns.hpp b/opm/simulators/linalg/findOverlapRowsAndColumns.hpp
index e8d37237f..21d512172 100644
--- a/opm/simulators/linalg/findOverlapRowsAndColumns.hpp
+++ b/opm/simulators/linalg/findOverlapRowsAndColumns.hpp
@@ -39,9 +39,9 @@ namespace detail
     /// \param useWellConn Boolean that is true when UseWellContribusion is true
     /// \param wellGraph Cell IDs of well cells stored in a graph.
     template<class Grid, class W>
-    void setWellConnections(const Grid& grid, const W& wells, bool useWellConn, std::vector<std::set<int>>& wellGraph)
+    void setWellConnections(const Grid& grid, const W& wells, bool useWellConn, std::vector<std::set<int>>& wellGraph, int numJacobiBlocks)
     {
-        if ( grid.comm().size() > 1)
+        if ( grid.comm().size() > 1 || numJacobiBlocks > 1)
         {
             Dune::CartesianIndexMapper< Grid > cartMapper( grid );
             const int numCells = cartMapper.compressedSize(); // grid.numCells()
diff --git a/tests/test_cusparseSolver.cpp b/tests/test_cusparseSolver.cpp
index 6c533134e..132989cd2 100644
--- a/tests/test_cusparseSolver.cpp
+++ b/tests/test_cusparseSolver.cpp
@@ -116,7 +116,7 @@ testCusparseSolver(const boost::property_tree::ptree& prm, Matrix<bz>& matrix, V
         bridge = std::make_unique<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder, linsolver);
         auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
                             // matrix created by readMatrixMarket() did not have contiguous memory
-        bridge->solve_system(&mat2, rhs, *wellContribs, result);
+        bridge->solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result);
         bridge->get_result(x);
 
         return x;
diff --git a/tests/test_openclSolver.cpp b/tests/test_openclSolver.cpp
index 94709ce3c..7705d2ec8 100644
--- a/tests/test_openclSolver.cpp
+++ b/tests/test_openclSolver.cpp
@@ -118,7 +118,7 @@ testOpenclSolver(const boost::property_tree::ptree& prm, Matrix<bz>& matrix, Vec
     }
     auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
                         // matrix created by readMatrixMarket() did not have contiguous memory
-    bridge->solve_system(&mat2, rhs, *wellContribs, result);
+    bridge->solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result);
     bridge->get_result(x);
 
     return x;