From e360c00b73dca89be9f1716c9a704905576b7792 Mon Sep 17 00:00:00 2001
From: Tong Dong Qiu <tongdong.qiu@bigdataccelerate.com>
Date: Thu, 21 Apr 2022 17:18:32 +0200
Subject: [PATCH] add block-jacobi partitioner option. Add block-jacobi matrix
 for use in OpenCL preconditioner Rebased

---
 ebos/eclbasevanguard.hh                       |  11 +
 ebos/eclcpgridvanguard.hh                     |   3 +-
 ebos/eclgenericcpgridvanguard.cc              |  84 ++++---
 ebos/eclgenericcpgridvanguard.hh              |   8 +-
 ebos/eclgenericvanguard.hh                    |   7 +
 opm/simulators/linalg/ISTLSolverEbos.hpp      |  94 +++++++-
 opm/simulators/linalg/bda/BdaBridge.cpp       |  24 +-
 opm/simulators/linalg/bda/BdaBridge.hpp       |   2 +-
 opm/simulators/linalg/bda/BdaSolver.hpp       |   5 +
 .../linalg/bda/FPGASolverBackend.cpp          |   9 +-
 .../linalg/bda/FPGASolverBackend.hpp          |   5 +
 .../linalg/bda/amgclSolverBackend.cpp         |  10 +
 .../linalg/bda/amgclSolverBackend.hpp         |   5 +
 .../linalg/bda/cuda/cusparseSolverBackend.cu  |   9 +-
 .../linalg/bda/cuda/cusparseSolverBackend.hpp |   5 +
 opm/simulators/linalg/bda/opencl/BILU0.cpp    | 220 +++++++++++++++++-
 opm/simulators/linalg/bda/opencl/BILU0.hpp    |   5 +
 .../linalg/bda/opencl/Preconditioner.cpp      |  15 +-
 .../linalg/bda/opencl/Preconditioner.hpp      |   4 +
 .../linalg/bda/opencl/openclSolverBackend.cpp |  96 +++++++-
 .../linalg/bda/opencl/openclSolverBackend.hpp |  12 +-
 .../linalg/findOverlapRowsAndColumns.hpp      |   4 +-
 tests/test_openclSolver.cpp                   |   2 +-
 23 files changed, 582 insertions(+), 57 deletions(-)
diff --git a/ebos/eclbasevanguard.hh b/ebos/eclbasevanguard.hh
index 2b29f9282..30a3c1aa8 100644
--- a/ebos/eclbasevanguard.hh
+++ b/ebos/eclbasevanguard.hh
@@ -85,6 +85,10 @@ struct EdgeWeightsMethod {
     using type = UndefinedProperty;
 };
 template<class TypeTag, class MyTypeTag>
+struct NumJacobiBlocks {
+    using type = UndefinedProperty;
+};
+template<class TypeTag, class MyTypeTag>
 struct OwnerCellsFirst {
     using type = UndefinedProperty;
 };
@@ -133,6 +137,10 @@ struct EdgeWeightsMethod<TypeTag, TTag::EclBaseVanguard> {
     static constexpr int value = 1;
 };
 template<class TypeTag>
+struct NumJacobiBlocks<TypeTag, TTag::EclBaseVanguard> {
+    static constexpr int value = 0;
+};
+template<class TypeTag>
 struct OwnerCellsFirst<TypeTag, TTag::EclBaseVanguard> {
     static constexpr bool value = true;
 };
@@ -211,6 +219,8 @@ public:
                              "When restarting: should we try to initialize wells and groups from historical SCHEDULE section.");
         EWOMS_REGISTER_PARAM(TypeTag, int, EdgeWeightsMethod,
                              "Choose edge-weighing strategy: 0=uniform, 1=trans, 2=log(trans).");
+        EWOMS_REGISTER_PARAM(TypeTag, int, NumJacobiBlocks,
+                             "Number of blocks to be created for the Block-Jacobi preconditioner.");
         EWOMS_REGISTER_PARAM(TypeTag, bool, OwnerCellsFirst,
                              "Order cells owned by rank before ghost/overlap cells.");
         EWOMS_REGISTER_PARAM(TypeTag, bool, SerialPartitioning,
@@ -235,6 +245,7 @@ public:
     {
         fileName_ = EWOMS_GET_PARAM(TypeTag, std::string, EclDeckFileName);
         edgeWeightsMethod_   = Dune::EdgeWeightMethod(EWOMS_GET_PARAM(TypeTag, int, EdgeWeightsMethod));
+        numJacobiBlocks_ = EWOMS_GET_PARAM(TypeTag, int, NumJacobiBlocks);
         ownersFirst_ = EWOMS_GET_PARAM(TypeTag, bool, OwnerCellsFirst);
         serialPartitioning_ = EWOMS_GET_PARAM(TypeTag, bool, SerialPartitioning);
         zoltanImbalanceTol_ = EWOMS_GET_PARAM(TypeTag, double, ZoltanImbalanceTol);
diff --git a/ebos/eclcpgridvanguard.hh b/ebos/eclcpgridvanguard.hh
index c35fed0bd..d399327f1 100644
--- a/ebos/eclcpgridvanguard.hh
+++ b/ebos/eclcpgridvanguard.hh
@@ -136,7 +136,7 @@ public:
                              this->serialPartitioning(), this->enableDistributedWells(),
                              this->zoltanImbalanceTol(), this->gridView(),
                              this->schedule(), this->centroids_,
-                             this->eclState(), this->parallelWells_);
+                             this->eclState(), this->parallelWells_, this->numJacobiBlocks());
 #endif
 
         this->updateGridView_();
@@ -192,6 +192,7 @@ protected:
     }
 
     std::unique_ptr<TransmissibilityType> globalTrans_;
+    //std::vector<int> cell_part_;
 };
 
 } // namespace Opm
diff --git a/ebos/eclgenericcpgridvanguard.cc b/ebos/eclgenericcpgridvanguard.cc
index 27025a882..57ead7bcc 100644
--- a/ebos/eclgenericcpgridvanguard.cc
+++ b/ebos/eclgenericcpgridvanguard.cc
@@ -82,12 +82,13 @@ void EclGenericCpGridVanguard<ElementMapper,GridView,Scalar>::doLoadBalance_(Dun
                                                                              const Schedule& schedule,
                                                                              std::vector<double>& centroids,
                                                                              EclipseState& eclState1,
-                                                                             EclGenericVanguard::ParallelWellStruct& parallelWells)
+                                                                             EclGenericVanguard::ParallelWellStruct& parallelWells,
+                                                                             int numJacobiBlocks)
 {
     int mpiSize = 1;
     MPI_Comm_size(grid_->comm(), &mpiSize);
-
-    if (mpiSize > 1) {
+    
+    if (mpiSize > 1 || numJacobiBlocks > 0) {
         // the CpGrid's loadBalance() method likes to have the transmissibilities as
         // its edge weights. since this is (kind of) a layering violation and
         // transmissibilities are relatively expensive to compute, we only do it if
@@ -131,53 +132,60 @@ void EclGenericCpGridVanguard<ElementMapper,GridView,Scalar>::doLoadBalance_(Dun
         }
 
         //distribute the grid and switch to the distributed view.
-        {
-            const auto wells = schedule.getWellsatEnd();
-
-            try
+        if (mpiSize > 1) {
             {
-                auto& eclState = dynamic_cast<ParallelEclipseState&>(eclState1);
-                const EclipseGrid* eclGrid = nullptr;
+                const auto wells = schedule.getWellsatEnd();
 
-                if (grid_->comm().rank() == 0)
+                try
                 {
-                    eclGrid = &eclState.getInputGrid();
-                }
+                    auto& eclState = dynamic_cast<ParallelEclipseState&>(eclState1);
+                    const EclipseGrid* eclGrid = nullptr;
 
-                PropsCentroidsDataHandle<Dune::CpGrid> handle(*grid_, eclState, eclGrid, centroids,
-                                                              cartesianIndexMapper());
-                if (loadBalancerSet)
-                {
-                    std::vector<int> parts;
                     if (grid_->comm().rank() == 0)
                     {
-                        parts =  (*externalLoadBalancer)(*grid_);
+                        eclGrid = &eclState.getInputGrid();
                     }
-                    parallelWells = std::get<1>(grid_->loadBalance(handle, parts, &wells, ownersFirst, false, 1));
-                }
-                else
-                {
-                    parallelWells =
-                        std::get<1>(grid_->loadBalance(handle, edgeWeightsMethod, &wells, serialPartitioning,
-                                                       faceTrans.data(), ownersFirst, false, 1, true, zoltanImbalanceTol,
-                                                       enableDistributedWells));
-                }
-            }
-            catch(const std::bad_cast& e)
-            {
-                std::ostringstream message;
-                message << "Parallel simulator setup is incorrect as it does not use ParallelEclipseState ("
-                        << e.what() <<")"<<std::flush;
-                OpmLog::error(message.str());
-                std::rethrow_exception(std::current_exception());
-            }
-        }
-        grid_->switchToDistributedView();
 
+                    PropsCentroidsDataHandle<Dune::CpGrid> handle(*grid_, eclState, eclGrid, centroids,
+                                                                  cartesianIndexMapper());
+                    if (loadBalancerSet)
+                    {
+                        std::vector<int> parts;
+                        if (grid_->comm().rank() == 0)
+                        {
+                         parts =  (*externalLoadBalancer)(*grid_);
+                        }
+                        parallelWells = std::get<1>(grid_->loadBalance(handle, parts, &wells, ownersFirst, false, 1));
+                    }
+                    else
+                    {
+                        parallelWells =
+                            std::get<1>(grid_->loadBalance(handle, edgeWeightsMethod, &wells, serialPartitioning,
+                                                           faceTrans.data(), ownersFirst, false, 1, true, zoltanImbalanceTol,
+                                                           enableDistributedWells));
+                    }
+                }
+                catch(const std::bad_cast& e)
+                {
+                    std::ostringstream message;
+                    message << "Parallel simulator setup is incorrect as it does not use ParallelEclipseState ("
+                            << e.what() <<")"<<std::flush;
+                    OpmLog::error(message.str());
+                    std::rethrow_exception(std::current_exception());
+                }
+            }
+            grid_->switchToDistributedView();
+        }
+        
         // Calling Schedule::filterConnections would remove any perforated
         // cells that exist only on other ranks even in the case of distributed wells
         // But we need all connections to figure out the first cell of a well (e.g. for
         // pressure). Hence this is now skipped. Rank 0 had everything even before.
+        if (numJacobiBlocks > 0 && mpiSize == 1) {
+            const auto wells = schedule.getWellsatEnd();
+            cell_part_.resize(grid_->numCells());
+            cell_part_ = grid_->zoltanPartitionWithoutScatter(&wells, faceTrans.data(), numJacobiBlocks, zoltanImbalanceTol);
+        }
     }
 }
 
diff --git a/ebos/eclgenericcpgridvanguard.hh b/ebos/eclgenericcpgridvanguard.hh
index 310c2a14c..13a7172e0 100644
--- a/ebos/eclgenericcpgridvanguard.hh
+++ b/ebos/eclgenericcpgridvanguard.hh
@@ -101,6 +101,10 @@ public:
      */
     const CartesianIndexMapper& equilCartesianIndexMapper() const;
 
+    std::vector<int> cellPartition() const
+    {
+        return cell_part_;
+    }
 protected:
     /*!
      * \brief Distribute the simulation grid over multiple processes
@@ -114,7 +118,8 @@ protected:
                         const GridView& gridv, const Schedule& schedule,
                         std::vector<double>& centroids,
                         EclipseState& eclState,
-                        EclGenericVanguard::ParallelWellStruct& parallelWells);
+                        EclGenericVanguard::ParallelWellStruct& parallelWells,
+                        int numJacobiBlocks);
 
     void distributeFieldProps_(EclipseState& eclState);
 #endif
@@ -137,6 +142,7 @@ protected:
     std::unique_ptr<CartesianIndexMapper> equilCartesianIndexMapper_;
 
     int mpiRank;
+    std::vector<int> cell_part_;
 };
 
 } // namespace Opm
diff --git a/ebos/eclgenericvanguard.hh b/ebos/eclgenericvanguard.hh
index 05b441584..200ecb508 100644
--- a/ebos/eclgenericvanguard.hh
+++ b/ebos/eclgenericvanguard.hh
@@ -249,6 +249,12 @@ public:
     Dune::EdgeWeightMethod edgeWeightsMethod() const
     { return edgeWeightsMethod_; }
 
+    /*!
+     * \brief Number of blocks in the Block-Jacobi preconditioner.
+     */
+    int numJacobiBlocks() const
+    { return numJacobiBlocks_; }
+
     /*!
      * \brief Parameter that decide if cells owned by rank are ordered before ghost cells.
      */
@@ -323,6 +329,7 @@ protected:
     std::string caseName_;
     std::string fileName_;
     Dune::EdgeWeightMethod edgeWeightsMethod_;
+    int numJacobiBlocks_;
     bool ownersFirst_;
     bool serialPartitioning_;
     double zoltanImbalanceTol_;
diff --git a/opm/simulators/linalg/ISTLSolverEbos.hpp b/opm/simulators/linalg/ISTLSolverEbos.hpp
index a77737b87..6fade71a4 100644
--- a/opm/simulators/linalg/ISTLSolverEbos.hpp
+++ b/opm/simulators/linalg/ISTLSolverEbos.hpp
@@ -161,7 +161,15 @@ namespace Opm
             // Set it up manually
             ElementMapper elemMapper(simulator_.vanguard().gridView(), Dune::mcmgElementLayout());
             detail::findOverlapAndInterior(simulator_.vanguard().grid(), elemMapper, overlapRows_, interiorRows_);
-
+	    numJacobiBlocks_ = EWOMS_GET_PARAM(TypeTag, int, NumJacobiBlocks);
+	    useWellConn_ = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
+	    if (numJacobiBlocks_ > 1) {
+		const auto wellsForConn = simulator_.vanguard().schedule().getWellsatEnd();
+		detail::setWellConnections(simulator_.vanguard().grid(), wellsForConn, useWellConn_,
+					   wellConnectionsGraph_, numJacobiBlocks_);
+		std::cout << "Create block-Jacobi pattern" << std::endl;
+		blockJacobiAdjacency();
+	    }
             useWellConn_ = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
 #if HAVE_FPGA
             // check usage of MatrixAddWellContributions: for FPGA they must be included
@@ -273,8 +281,15 @@ namespace Opm
                 }
 #endif
 
+		if (numJacobiBlocks_ > 1) {
+		    copyMatToBlockJac(getMatrix(), *blockJacobiForGPUILU0_);
                 // Const_cast needed since the CUDA stuff overwrites values for better matrix condition..
-                bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), *rhs_, *wellContribs, result);
+		    bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), &*blockJacobiForGPUILU0_,
+					    numJacobiBlocks_, *rhs_, *wellContribs, result);
+		}
+		else
+		    bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), const_cast<Matrix*>(&getMatrix()),
+					    numJacobiBlocks_, *rhs_, *wellContribs, result);
                 if (result.converged) {
                     // get result vector x from non-Dune backend, iff solve was successful
                     bdaBridge->get_result(x);
@@ -497,6 +512,78 @@ namespace Opm
                 }
         }
 
+	/// Create sparsity pattern for block-Jacobi matrix based on partitioning of grid.
+	void blockJacobiAdjacency()
+	{
+	    const auto& grid = simulator_.vanguard().grid();
+	    std::vector<int> cell_part = simulator_.vanguard().cellPartition();
+
+	    typedef typename Matrix::size_type size_type;
+	    size_type numCells = grid.size( 0 );
+	    blockJacobiForGPUILU0_.reset(new Matrix(numCells, numCells, Matrix::random));
+
+	    std::vector<std::set<size_type>> pattern;
+	    pattern.resize(numCells);
+
+	    const auto& lid = grid.localIdSet();
+	    const auto& gridView = grid.leafGridView();
+	    auto elemIt = gridView.template begin<0>();
+	    const auto& elemEndIt = gridView.template end<0>();
+
+	    //Loop over cells
+	    for (; elemIt != elemEndIt; ++elemIt)
+            {
+
+		const auto& elem = *elemIt;
+		size_type idx = lid.id(elem);
+		pattern[idx].insert(idx);
+
+		// Add well non-zero connections
+		for (auto wc = wellConnectionsGraph_[idx].begin(); wc!=wellConnectionsGraph_[idx].end(); ++wc)
+		    pattern[idx].insert(*wc);
+
+		int locPart = cell_part[idx];
+
+		//Add neighbor if it is on the same part
+		auto isend = gridView.iend(elem);
+		for (auto is = gridView.ibegin(elem); is!=isend; ++is)
+                {
+		    //check if face has neighbor
+		    if (is->neighbor())
+                    {
+			size_type nid = lid.id(is->outside());
+			int nabPart = cell_part[nid];
+			if (locPart == nabPart) 
+			    pattern[idx].insert(nid);
+		    }
+
+		    blockJacobiForGPUILU0_->setrowsize(idx, pattern[idx].size());
+		}
+	    }
+	    blockJacobiForGPUILU0_->endrowsizes();
+	    for (size_type dofId = 0; dofId < numCells; ++dofId)
+	    {
+		auto nabIdx = pattern[dofId].begin();
+		auto endNab = pattern[dofId].end();
+		for (; nabIdx != endNab; ++nabIdx)
+		{
+		    blockJacobiForGPUILU0_->addindex(dofId, *nabIdx);
+		}
+	    }
+
+	    blockJacobiForGPUILU0_->endindices();
+	}
+
+	void copyMatToBlockJac(Matrix& mat, Matrix& blockJac)
+	{
+	    auto rbegin = blockJac.begin();
+	    auto rend = blockJac.end();
+	    for (auto row = rbegin; row != rend; ++row) {
+		for (auto col = (*row).begin(); col != (*row).end(); ++col) {
+		    blockJac[row.index()][col.index()] = mat[row.index()][col.index()];
+		}
+	    }
+	}
 
         Matrix& getMatrix()
         {
@@ -517,6 +604,8 @@ namespace Opm
         Matrix* matrix_;
         Vector *rhs_;
 
+        std::unique_ptr<Matrix> blockJacobiForGPUILU0_;
+
         std::unique_ptr<FlexibleSolverType> flexibleSolver_;
         std::unique_ptr<AbstractOperatorType> linearOperatorForFlexibleSolver_;
         std::unique_ptr<WellModelAsLinearOperator<WellModel, Vector, Vector>> wellOperator_;
@@ -530,6 +619,7 @@ namespace Opm
         FlowLinearSolverParameters parameters_;
         PropertyTree prm_;
         bool scale_variables_;
+	int numJacobiBlocks_;
 
         std::shared_ptr< CommunicationType > comm_;
     }; // end ISTLSolver
diff --git a/opm/simulators/linalg/bda/BdaBridge.cpp b/opm/simulators/linalg/bda/BdaBridge.cpp
index ee7c6e7c8..11080b99c 100644
--- a/opm/simulators/linalg/bda/BdaBridge.cpp
+++ b/opm/simulators/linalg/bda/BdaBridge.cpp
@@ -193,6 +193,8 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::copySparsityPatternFromI
 
 template <class BridgeMatrix, class BridgeVector, int block_size>
 void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system([[maybe_unused]] BridgeMatrix* mat,
+                                                                     [[maybe_unused]] BridgeMatrix* blockMat,
+                                                                     [[maybe_unused]] int numJacobiBlocks,
                                                                      [[maybe_unused]] BridgeVector& b,
                                                                      [[maybe_unused]] WellContributions& wellContribs,
                                                                      [[maybe_unused]] InverseOperatorResult& res)
@@ -209,6 +211,11 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system([[maybe_unu
         const int nnzb = (h_rows.empty()) ? mat->nonzeroes() : h_rows.back();
         const int nnz = nnzb * dim * dim;
 
+        static std::vector<int> bm_h_rows;
+        static std::vector<int> bm_h_cols;
+        const int bm_nnzb = (bm_h_rows.empty()) ? blockMat->nonzeroes() : bm_h_rows.back();
+        const int bm_nnz = bm_nnzb * dim * dim;
+        
         if (dim != 3) {
             OpmLog::warning("BdaSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program");
             use_gpu = use_fpga = false;
@@ -221,8 +228,15 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system([[maybe_unu
             copySparsityPatternFromISTL(*mat, h_rows, h_cols);
         }
 
+        if (bm_h_rows.capacity() == 0) {
+            bm_h_rows.reserve(Nb+1);
+            bm_h_cols.reserve(bm_nnzb);
+            copySparsityPatternFromISTL(*blockMat, bm_h_rows, bm_h_cols);
+        }
+
         Dune::Timer t_zeros;
         int numZeros = checkZeroDiagonal(*mat);
+        int bm_numZeros = checkZeroDiagonal(*blockMat);
         if (verbosity >= 2) {
             std::ostringstream out;
             out << "Checking zeros took: " << t_zeros.stop() << " s, found " << numZeros << " zeros";
@@ -232,9 +246,17 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system([[maybe_unu
 
         /////////////////////////
         // actually solve
+        SolverStatus status; 
+        if (numJacobiBlocks > 1)
+            status = backend->solve_system2(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])),
+                                            bm_nnz, static_cast<double*>(&(((*blockMat)[0][0][0][0]))), bm_h_rows.data(), bm_h_cols.data(),
+                                            wellContribs, result);
+        else
+            status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), wellContribs, result);
 
+            
         // assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, the chosen BdaSolver is expected to perform undefined behaviour
-        SolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), wellContribs, result);
+        //SolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), wellContribs, result);
         switch(status) {
         case SolverStatus::BDA_SOLVER_SUCCESS:
             //OpmLog::info("BdaSolver converged");
diff --git a/opm/simulators/linalg/bda/BdaBridge.hpp b/opm/simulators/linalg/bda/BdaBridge.hpp
index 3d24051bb..29e09aa46 100644
--- a/opm/simulators/linalg/bda/BdaBridge.hpp
+++ b/opm/simulators/linalg/bda/BdaBridge.hpp
@@ -65,7 +65,7 @@ public:
     /// \param[in] b            vector b, should be of type Dune::BlockVector
     /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] result    summary of solver result
-    void solve_system(BridgeMatrix *mat, BridgeVector &b, WellContributions& wellContribs, InverseOperatorResult &result);
+    void solve_system(BridgeMatrix *mat, BridgeMatrix *blockMat, int numJacobiBlocks, BridgeVector &b, WellContributions& wellContribs, InverseOperatorResult &result);
 
     /// Get the resulting x vector
     /// \param[inout] x    vector x, should be of type Dune::BlockVector
diff --git a/opm/simulators/linalg/bda/BdaSolver.hpp b/opm/simulators/linalg/bda/BdaSolver.hpp
index d84652d28..7a65d458b 100644
--- a/opm/simulators/linalg/bda/BdaSolver.hpp
+++ b/opm/simulators/linalg/bda/BdaSolver.hpp
@@ -89,6 +89,11 @@ namespace Accelerator {
             double *vals, int *rows, int *cols,
             double *b, WellContributions& wellContribs, BdaResult &res) = 0;
 
+	virtual SolverStatus solve_system2(int N_, int nnz_, int dim,
+					   double *vals, int *rows, int *cols, double *b,
+					   int nnz2, double *vals2, int *rows2, int *cols2,
+					   WellContributions& wellContribs, BdaResult &res) = 0;
+
         virtual void get_result(double *x) = 0;
 
     }; // end class BdaSolver
diff --git a/opm/simulators/linalg/bda/FPGASolverBackend.cpp b/opm/simulators/linalg/bda/FPGASolverBackend.cpp
index 9087d4074..6ce712a9d 100644
--- a/opm/simulators/linalg/bda/FPGASolverBackend.cpp
+++ b/opm/simulators/linalg/bda/FPGASolverBackend.cpp
@@ -223,7 +223,6 @@ void FpgaSolverBackend<block_size>::get_result(double *x_)
     }
 } // end get_result()
 
-
 template <unsigned int block_size>
 SolverStatus FpgaSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions&, BdaResult &res)
 {
@@ -250,6 +249,14 @@ SolverStatus FpgaSolverBackend<block_size>::solve_system(int N_, int nnz_, int d
     return SolverStatus::BDA_SOLVER_SUCCESS;
 }
 
+template <unsigned int block_size>
+SolverStatus FpgaSolverBackend<block_size>::solve_system2(int N_, int nnz_, int dim,
+					   double *vals, int *rows, int *cols, double *b,
+					   int nnz2, double *vals2, int *rows2, int *cols2,
+					   WellContributions& wellContribs, BdaResult &res)
+{
+    return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
+}
 
 template <unsigned int block_size>
 void FpgaSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols)
diff --git a/opm/simulators/linalg/bda/FPGASolverBackend.hpp b/opm/simulators/linalg/bda/FPGASolverBackend.hpp
index 4d1e766f3..12293e2e7 100644
--- a/opm/simulators/linalg/bda/FPGASolverBackend.hpp
+++ b/opm/simulators/linalg/bda/FPGASolverBackend.hpp
@@ -255,6 +255,11 @@ public:
     /// \return                   status code
     SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
 
+    SolverStatus solve_system2(int N_, int nnz_, int dim,
+			       double *vals, int *rows, int *cols, double *b,
+			       int nnz2, double *vals2, int *rows2, int *cols2,
+			       WellContributions& wellContribs, BdaResult &res) override;
+    
     /// Get result after linear solve, and peform postprocessing if necessary
     /// \param[inout] x           resulting x vector, caller must guarantee that x points to a valid array
     void get_result(double *x) override;
diff --git a/opm/simulators/linalg/bda/amgclSolverBackend.cpp b/opm/simulators/linalg/bda/amgclSolverBackend.cpp
index 957575fbf..859320163 100644
--- a/opm/simulators/linalg/bda/amgclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/amgclSolverBackend.cpp
@@ -359,6 +359,15 @@ void amgclSolverBackend<block_size>::get_result(double *x_) {
 } // end get_result()
 
 
+template <unsigned int block_size>
+SolverStatus amgclSolverBackend<block_size>::solve_system2(int N_, int nnz_, int dim,
+							   double *vals, int *rows, int *cols, double *b,
+							   int nnz2, double *vals2, int *rows2, int *cols2,
+							   WellContributions& wellContribs, BdaResult &res)
+{
+    return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
+}
+    
 template <unsigned int block_size>
 SolverStatus amgclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions&, BdaResult &res) {
     if (initialized == false) {
@@ -371,6 +380,7 @@ SolverStatus amgclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
 }
 
 
+
 #define INSTANTIATE_BDA_FUNCTIONS(n)                                                                \
 template amgclSolverBackend<n>::amgclSolverBackend(int, int, double, unsigned int, unsigned int);   \
 
diff --git a/opm/simulators/linalg/bda/amgclSolverBackend.hpp b/opm/simulators/linalg/bda/amgclSolverBackend.hpp
index 930f81ff9..95f3c928f 100644
--- a/opm/simulators/linalg/bda/amgclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/amgclSolverBackend.hpp
@@ -145,6 +145,11 @@ public:
     /// \return                   status code
     SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
 
+    SolverStatus solve_system2(int N_, int nnz_, int dim,
+			       double *vals, int *rows, int *cols, double *b,
+			       int nnz2, double *vals2, int *rows2, int *cols2,
+			       WellContributions& wellContribs, BdaResult &res) override;
+    
     /// Get result after linear solve, and peform postprocessing if necessary
     /// \param[inout] x          resulting x vector, caller must guarantee that x points to a valid array
     void get_result(double *x) override;
diff --git a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu
index 46f345784..7439e39c5 100644
--- a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu
+++ b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu
@@ -500,7 +500,14 @@ SolverStatus cusparseSolverBackend<block_size>::solve_system(int N, int nnz, int
     }
     return SolverStatus::BDA_SOLVER_SUCCESS;
 }
-
+template <unsigned int block_size>
+SolverStatus cusparseSolverBackend<block_size>::solve_system2(int N_, int nnz_, int dim,
+					   double *vals, int *rows, int *cols, double *b,
+					   int nnz2, double *vals2, int *rows2, int *cols2,
+					   WellContributions& wellContribs, BdaResult &res)
+{
+    return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
+}
 
 #define INSTANTIATE_BDA_FUNCTIONS(n)                                                       \
 template cusparseSolverBackend<n>::cusparseSolverBackend(int, int, double, unsigned int);  \
diff --git a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp
index 791bf32fc..8b05e1a8a 100644
--- a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp
@@ -138,6 +138,11 @@ public:
     /// \return                   status code
     SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
 
+    SolverStatus solve_system2(int N_, int nnz_, int dim,
+			       double *vals, int *rows, int *cols, double *b,
+			       int nnz2, double *vals2, int *rows2, int *cols2,
+			       WellContributions& wellContribs, BdaResult &res) override;
+    
     /// Get resulting vector x after linear solve, also includes post processing if necessary
     /// \param[inout] x        resulting x vector, caller must guarantee that x points to a valid array
     void get_result(double *x) override;
diff --git a/opm/simulators/linalg/bda/opencl/BILU0.cpp b/opm/simulators/linalg/bda/opencl/BILU0.cpp
index 0c2bc6364..f10d025c8 100644
--- a/opm/simulators/linalg/bda/opencl/BILU0.cpp
+++ b/opm/simulators/linalg/bda/opencl/BILU0.cpp
@@ -149,7 +149,127 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
     }
 
     return true;
-} // end init()
+}
+
+
+template <unsigned int block_size>
+bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat)
+{
+    const unsigned int bs = block_size;
+
+    this->N = mat->Nb * block_size;
+    this->Nb = mat->Nb;
+    this->nnz = mat->nnzbs * block_size * block_size;
+    this->nnzb = mat->nnzbs;
+
+    this->nnz_jm = jacMat->nnzbs * block_size * block_size;
+    this->nnzbs_jm = jacMat->nnzbs;
+
+    int *CSCRowIndices = nullptr;
+    int *CSCColPointers = nullptr;
+
+    if (opencl_ilu_reorder == ILUReorder::NONE) {
+        LUmat = std::make_unique<BlockedMatrix>(*mat);
+    } else {
+        toOrder.resize(Nb);
+        fromOrder.resize(Nb);
+        CSCRowIndices = new int[nnzbs_jm];
+        CSCColPointers = new int[Nb + 1];
+        rmat = std::make_shared<BlockedMatrix>(mat->Nb, mat->nnzbs, block_size);
+        rJacMat = std::make_shared<BlockedMatrix>(jacMat->Nb, jacMat->nnzbs, block_size);
+        LUmat = std::make_unique<BlockedMatrix>(*rJacMat);
+        
+        Timer t_convert;
+        csrPatternToCsc(jacMat->colIndices, jacMat->rowPointers, CSCRowIndices, CSCColPointers, jacMat->Nb);
+        if(verbosity >= 3){
+        std::ostringstream out;
+        out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
+        OpmLog::info(out.str());
+        }
+    }
+
+    Timer t_analysis;
+    std::ostringstream out;
+    if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
+        out << "BILU0 reordering strategy: " << "level_scheduling\n";
+        findLevelScheduling(jacMat->colIndices, jacMat->rowPointers, CSCRowIndices, CSCColPointers, jacMat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
+        for (int iii = 0; iii < numColors; ++iii) { out << "rpc: "<< rowsPerColor[iii] << "\n";} 
+        out << "numColors: "<< numColors << "\n";
+    } else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
+        out << "BILU0 reordering strategy: " << "graph_coloring\n";
+        findGraphColoring<block_size>(jacMat->colIndices, jacMat->rowPointers, CSCRowIndices, CSCColPointers, jacMat->Nb, jacMat->Nb, jacMat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
+        for (int iii = 0; iii < numColors; ++iii) { out << "rpc: "<< rowsPerColor[iii] << "\n";} 
+        out << "numColors: "<< numColors << "\n";
+    } else if (opencl_ilu_reorder == ILUReorder::NONE) {
+        out << "BILU0 reordering strategy: none\n";
+        // numColors = 1;
+        // rowsPerColor.emplace_back(Nb);
+        numColors = Nb;
+        for(int i = 0; i < Nb; ++i){
+        rowsPerColor.emplace_back(1);
+        }
+    } else {
+        OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
+    }
+    if(verbosity >= 1){
+        out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n";
+    }
+#if CHOW_PATEL
+    out << "BILU0 CHOW_PATEL: " << CHOW_PATEL << ", CHOW_PATEL_GPU: " << CHOW_PATEL_GPU;
+#endif
+    OpmLog::info(out.str());
+
+
+    if (opencl_ilu_reorder != ILUReorder::NONE) {
+        delete[] CSCRowIndices;
+        delete[] CSCColPointers;
+    }
+
+    diagIndex.resize(mat->Nb);
+    invDiagVals.resize(mat->Nb * bs * bs);
+
+#if CHOW_PATEL
+    Lmat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
+    Umat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
+#endif
+
+    LUmat->nnzValues = new double[jacMat->nnzbs * bs * bs];
+
+    s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
+    s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1));
+    s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb);
+#if CHOW_PATEL
+    s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
+    s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
+    s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
+    s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
+    s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
+    s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
+#else
+    s.LUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * LUmat->nnzbs);
+    s.LUcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->nnzbs);
+    s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1));
+#endif
+
+    events.resize(2);
+    err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals.data(), nullptr, &events[0]);
+
+    rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0
+    for (int i = 0; i < numColors; ++i) {
+        rowsPerColorPrefix[i+1] = rowsPerColorPrefix[i] + rowsPerColor[i];
+    }
+    err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]);
+
+    cl::WaitForEvents(events);
+    events.clear();
+    if (err != CL_SUCCESS) {
+        // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
+        OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
+    }
+
+    return true;
+}
+
 
 
 template <unsigned int block_size>
@@ -245,6 +365,104 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
     return true;
 } // end create_preconditioner()
 
+
+template <unsigned int block_size>
+bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat)
+{
+    const unsigned int bs = block_size;
+    auto *m = mat;
+    auto *jm = jacMat;
+
+    if (opencl_ilu_reorder != ILUReorder::NONE) {
+        m = rmat.get();
+        jm = rJacMat.get();
+        Timer t_reorder;
+        reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rmat.get());
+        reorderBlockedMatrixByPattern(jacMat, toOrder.data(), fromOrder.data(), rJacMat.get());
+
+        if (verbosity >= 3){
+            std::ostringstream out;
+            out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
+            OpmLog::info(out.str());
+        }
+    }
+
+    // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
+    // this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy
+    Timer t_copy;
+    memcpy(LUmat->nnzValues, jm->nnzValues, sizeof(double) * bs * bs * jm->nnzbs);
+
+    if (verbosity >= 3){
+        std::ostringstream out;
+        out << "BILU0 memcpy: " << t_copy.stop() << " s";
+        OpmLog::info(out.str());
+    }
+
+#if CHOW_PATEL
+    chowPatelIlu.decomposition(queue, context,
+        LUmat.get(), Lmat.get(), Umat.get(),
+        invDiagVals, diagIndex,
+        s.diagIndex, s.invDiagVals,
+        s.Lvals, s.Lcols, s.Lrows,
+        s.Uvals, s.Ucols, s.Urows);
+#else
+    Timer t_copyToGpu;
+
+    events.resize(1);
+    queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]);
+
+    std::call_once(pattern_uploaded, [&](){
+        // find the positions of each diagonal block
+        // must be done after reordering
+        for (int row = 0; row < Nb; ++row) {
+            int rowStart = LUmat->rowPointers[row];
+            int rowEnd = LUmat->rowPointers[row+1];
+
+            auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row);
+            assert(candidate != LUmat->colIndices + rowEnd);
+            diagIndex[row] = candidate - LUmat->colIndices;
+        }
+        events.resize(4);
+        queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
+        queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
+        queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
+    });
+
+    cl::WaitForEvents(events);
+    events.clear();
+    if (err != CL_SUCCESS) {
+        // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
+        OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
+    }
+
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "BILU0 copy to GPU: " << t_copyToGpu.stop() << " s";
+        OpmLog::info(out.str());
+    }
+
+    Timer t_decomposition;
+    std::ostringstream out;
+    cl::Event event;
+    for (int color = 0; color < numColors; ++color) {
+        const unsigned int firstRow = rowsPerColorPrefix[color];
+        const unsigned int lastRow = rowsPerColorPrefix[color+1];
+        if (verbosity >= 4) {
+            out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
+        }
+        OpenclKernels::ILU_decomp(firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, Nb, block_size);
+    }
+
+    if (verbosity >= 3) {
+        out << "BILU0 decomposition: " << t_decomposition.stop() << " s";
+        OpmLog::info(out.str());
+    }
+#endif // CHOW_PATEL
+
+    return true;
+} // end create_preconditioner()
+
+
 // kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
 // however, if individual kernel calls are timed, waiting for events is needed
 // behavior on other GPUs is untested
diff --git a/opm/simulators/linalg/bda/opencl/BILU0.hpp b/opm/simulators/linalg/bda/opencl/BILU0.hpp
index b3b9ce125..e2a6b1a52 100644
--- a/opm/simulators/linalg/bda/opencl/BILU0.hpp
+++ b/opm/simulators/linalg/bda/opencl/BILU0.hpp
@@ -53,8 +53,11 @@ class BILU0 : public Preconditioner<block_size>
     using Base::err;
 
 private:
+    int nnz_jm;     // number of nonzeroes of the matrix (scalar)
+    int nnzbs_jm;   // number of blocks of the matrix
     std::unique_ptr<BlockedMatrix> LUmat = nullptr;
     std::shared_ptr<BlockedMatrix> rmat = nullptr; // only used with PAR_SIM
+    std::shared_ptr<BlockedMatrix> rJacMat = nullptr; 
 #if CHOW_PATEL
     std::unique_ptr<BlockedMatrix> Lmat = nullptr, Umat = nullptr;
 #endif
@@ -92,9 +95,11 @@ public:
 
     // analysis, find reordering if specified
     bool analyze_matrix(BlockedMatrix *mat) override;
+    bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat);
 
     // ilu_decomposition
     bool create_preconditioner(BlockedMatrix *mat) override;
+    bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat);
 
     // apply preconditioner, x = prec(y)
     void apply(const cl::Buffer& y, cl::Buffer& x) override;
diff --git a/opm/simulators/linalg/bda/opencl/Preconditioner.cpp b/opm/simulators/linalg/bda/opencl/Preconditioner.cpp
index f7256039b..8bf670b8c 100644
--- a/opm/simulators/linalg/bda/opencl/Preconditioner.cpp
+++ b/opm/simulators/linalg/bda/opencl/Preconditioner.cpp
@@ -52,10 +52,21 @@ std::unique_ptr<Preconditioner<block_size> > Preconditioner<block_size>::create(
     }
 }
 
+template <unsigned int block_size>
+bool Preconditioner<block_size>::analyze_matrix(BlockedMatrix *mat, [[maybe_unused]] BlockedMatrix *jacMat) {
+    return analyze_matrix(mat);
+}
+
+template <unsigned int block_size>
+bool Preconditioner<block_size>::create_preconditioner(BlockedMatrix *mat, [[maybe_unused]] BlockedMatrix *jacMat) {
+    return create_preconditioner(mat);
+}
 
 #define INSTANTIATE_BDA_FUNCTIONS(n)  \
-template std::unique_ptr<Preconditioner<n> > Preconditioner<n>::create(PreconditionerType, int, ILUReorder);  \
-template void Preconditioner<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&);
+template std::unique_ptr<Preconditioner<n> > Preconditioner<n>::create(PreconditionerType, int, ILUReorder);   \
+template void Preconditioner<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&); \
+template bool Preconditioner<n>::analyze_matrix(BlockedMatrix *, BlockedMatrix *);                             \
+template bool Preconditioner<n>::create_preconditioner(BlockedMatrix *, BlockedMatrix *);
 
 INSTANTIATE_BDA_FUNCTIONS(1);
 INSTANTIATE_BDA_FUNCTIONS(2);
diff --git a/opm/simulators/linalg/bda/opencl/Preconditioner.hpp b/opm/simulators/linalg/bda/opencl/Preconditioner.hpp
index a288f952d..f0af75d25 100644
--- a/opm/simulators/linalg/bda/opencl/Preconditioner.hpp
+++ b/opm/simulators/linalg/bda/opencl/Preconditioner.hpp
@@ -68,10 +68,14 @@ public:
 
     // analyze matrix, e.g. the sparsity pattern
     // probably only called once
+    // the version with two params can be overloaded, if not, it will default to using the one param version
     virtual bool analyze_matrix(BlockedMatrix *mat) = 0;
+    virtual bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat);
 
     // create/update preconditioner, probably used every linear solve
+    // the version with two params can be overloaded, if not, it will default to using the one param version
     virtual bool create_preconditioner(BlockedMatrix *mat) = 0;
+    virtual bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat);
 
     // get reordering mappings
     virtual int* getToOrder() = 0;
diff --git a/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp b/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp
index 855631887..20ecdde97 100644
--- a/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp
@@ -441,6 +441,69 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
     initialized = true;
 } // end initialize()
 
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::initialize2(int N_, int nnz_, int dim, double *vals, int *rows, int *cols,
+                                                  int nnz2, double *vals2, int *rows2, int *cols2) {
+    this->N = N_;
+    this->nnz = nnz_;
+    this->nnzb = nnz_ / block_size / block_size;
+
+    this->jac_nnz = nnz2;
+    this->jac_nnzb = nnz2 / block_size / block_size;
+    
+    Nb = (N + dim - 1) / dim;
+    std::ostringstream out;
+    out << "Initializing GPU, matrix size: " << N << " blocks, nnzb: " << nnzb << "\n";
+    out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
+    out << "PlatformID: " << platformID << ", deviceID: " << deviceID << "\n";
+    OpmLog::info(out.str());
+    out.str("");
+    out.clear();
+
+    try {
+        prec->setOpencl(context, queue);
+
+#if COPY_ROW_BY_ROW
+        vals_contiguous = new double[N];
+#endif
+        mat.reset(new BlockedMatrix(Nb, nnzb, block_size, vals, cols, rows));
+        jacMat.reset(new BlockedMatrix(Nb, jac_nnzb, block_size, vals2, cols2, rows2));
+
+        d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_rb = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_r = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_rw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_p = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_pw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_s = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_t = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_v = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_tmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+
+        d_Avals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * nnz);
+        d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
+        d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
+
+        bool reorder = (opencl_ilu_reorder != ILUReorder::NONE);
+        if (reorder) {
+            rb = new double[N];
+            d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
+        }
+
+    } catch (const cl::Error& error) {
+        std::ostringstream oss;
+        oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
+        oss << getErrorString(error.err());
+        // rethrow exception
+        OPM_THROW(std::logic_error, oss.str());
+    } catch (const std::logic_error& error) {
+        // rethrow exception by OPM_THROW in the try{}, without this, a segfault occurs
+        throw error;
+    }
+
+    initialized = true;
+} // end initialize()
 
 template <unsigned int block_size>
 void openclSolverBackend<block_size>::finalize() {
@@ -527,8 +590,7 @@ template <unsigned int block_size>
 bool openclSolverBackend<block_size>::analyze_matrix() {
     Timer t;
 
-    // bool success = bilu0->init(mat.get());
-    bool success = prec->analyze_matrix(mat.get());
+    bool success = prec->analyze_matrix(mat.get(), jacMat.get());
 
     if (opencl_ilu_reorder == ILUReorder::NONE) {
         rmat = mat.get();
@@ -579,7 +641,7 @@ template <unsigned int block_size>
 bool openclSolverBackend<block_size>::create_preconditioner() {
     Timer t;
 
-    bool result = prec->create_preconditioner(mat.get());
+    bool result = prec->create_preconditioner(mat.get(), jacMat.get());
 
     if (verbosity > 2) {
         std::ostringstream out;
@@ -662,7 +724,33 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
     solve_system(wellContribs, res);
     return SolverStatus::BDA_SOLVER_SUCCESS;
 }
-
+template <unsigned int block_size>
+SolverStatus openclSolverBackend<block_size>::solve_system2(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b,
+                           int nnz2, double *vals2, int *rows2, int *cols2,
+                           WellContributions& wellContribs, BdaResult &res)
+{
+    if (initialized == false) {
+        initialize2(N_, nnz_,  dim, vals, rows, cols, nnz2, vals2, rows2, cols2);
+        if (analysis_done == false) {
+            if (!analyze_matrix()) {
+                return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
+            }
+        }
+        update_system(vals, b, wellContribs);
+        if (!create_preconditioner()) {
+            return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
+        }
+        copy_system_to_gpu();
+    } else {
+        update_system(vals, b, wellContribs);
+        if (!create_preconditioner()) {
+            return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
+        }
+        update_system_on_gpu();
+    }
+    solve_system(wellContribs, res);
+    return SolverStatus::BDA_SOLVER_SUCCESS;
+}
 
 #define INSTANTIATE_BDA_FUNCTIONS(n)                                        \
 template openclSolverBackend<n>::openclSolverBackend(                       \
diff --git a/opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp b/opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp
index d356520e6..55cc42d2e 100644
--- a/opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp
@@ -63,12 +63,15 @@ private:
 
     std::vector<cl::Device> devices;
 
+    int jac_nnz;
+    int jac_nnzb;
     std::unique_ptr<Preconditioner<block_size> > prec;
                                                                   // can perform blocked ILU0 and AMG on pressure component
     bool is_root;                                                 // allow for nested solvers, the root solver is called by BdaBridge
     int *toOrder = nullptr, *fromOrder = nullptr;                 // BILU0 reorders rows of the matrix via these mappings
     bool analysis_done = false;
     std::unique_ptr<BlockedMatrix> mat = nullptr;                 // original matrix
+    std::unique_ptr<BlockedMatrix> jacMat = nullptr;              // matrix for preconditioner
     BlockedMatrix *rmat = nullptr;                                // reordered matrix (or original if no reordering), used for spmv
     ILUReorder opencl_ilu_reorder;                                // reordering strategy
     std::vector<cl::Event> events;
@@ -138,6 +141,9 @@ private:
     /// \param[in] cols           array of columnIndices, contains nnz values
     void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
 
+    void initialize2(int N, int nnz, int dim, double *vals, int *rows, int *cols,
+		     int nnz2, double *vals2, int *rows2, int *cols2);
+
     /// Clean memory
     void finalize();
 
@@ -205,6 +211,10 @@ public:
     /// \return                   status code
     SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
 
+    SolverStatus solve_system2(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b,
+                   int nnz2, double *vals2, int *rows2, int *cols2,
+                   WellContributions& wellContribs, BdaResult &res) override;
+
     /// Solve scalar linear system, for example a coarse system of an AMG preconditioner
     /// Data is already on the GPU
     // SolverStatus solve_system(BdaResult &res);
@@ -218,7 +228,7 @@ public:
     /// \param[in] context   the opencl context to be used
     /// \param[in] queue     the opencl queue to be used
     void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue);
-
+    
 }; // end class openclSolverBackend
 
 } // namespace Accelerator
diff --git a/opm/simulators/linalg/findOverlapRowsAndColumns.hpp b/opm/simulators/linalg/findOverlapRowsAndColumns.hpp
index e8d37237f..21d512172 100644
--- a/opm/simulators/linalg/findOverlapRowsAndColumns.hpp
+++ b/opm/simulators/linalg/findOverlapRowsAndColumns.hpp
@@ -39,9 +39,9 @@ namespace detail
     /// \param useWellConn Boolean that is true when UseWellContribusion is true
     /// \param wellGraph Cell IDs of well cells stored in a graph.
     template<class Grid, class W>
-    void setWellConnections(const Grid& grid, const W& wells, bool useWellConn, std::vector<std::set<int>>& wellGraph)
+    void setWellConnections(const Grid& grid, const W& wells, bool useWellConn, std::vector<std::set<int>>& wellGraph, int numJacobiBlocks)
     {
-        if ( grid.comm().size() > 1)
+        if ( grid.comm().size() > 1 || numJacobiBlocks > 1)
         {
             Dune::CartesianIndexMapper< Grid > cartMapper( grid );
             const int numCells = cartMapper.compressedSize(); // grid.numCells()
diff --git a/tests/test_openclSolver.cpp b/tests/test_openclSolver.cpp
index 94709ce3c..edfa6b3ae 100644
--- a/tests/test_openclSolver.cpp
+++ b/tests/test_openclSolver.cpp
@@ -118,7 +118,7 @@ testOpenclSolver(const boost::property_tree::ptree& prm, Matrix<bz>& matrix, Vec
     }
     auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
                         // matrix created by readMatrixMarket() did not have contiguous memory
-    bridge->solve_system(&mat2, rhs, *wellContribs, result);
+    bridge->solve_system(&mat2, &mat2, 0, rhs, *wellContribs, result);
     bridge->get_result(x);
 
     return x;