From 581cbc6a3e4eebf3f12820845b7eb1d58c6ea9bc Mon Sep 17 00:00:00 2001 From: "T.D. (Tongdong) Qiu" Date: Fri, 13 Mar 2020 14:21:59 +0100 Subject: [PATCH] cusparseSolver can now apply wellcontributions separately, so --matrix-add-wellcontributions=true is not required anymore --- CMakeLists_files.cmake | 1 + opm/simulators/linalg/ISTLSolverEbos.hpp | 14 +- opm/simulators/linalg/bda/BdaBridge.cpp | 9 +- opm/simulators/linalg/bda/BdaBridge.hpp | 11 +- .../linalg/bda/WellContributions.cu | 420 ++++++++++++++++++ .../linalg/bda/WellContributions.hpp | 136 ++++++ .../linalg/bda/cusparseSolverBackend.cu | 116 +++-- .../linalg/bda/cusparseSolverBackend.hpp | 45 +- opm/simulators/wells/BlackoilWellModel.hpp | 3 + .../wells/BlackoilWellModel_impl.hpp | 20 +- opm/simulators/wells/StandardWell.hpp | 6 + opm/simulators/wells/StandardWell_impl.hpp | 59 +++ 12 files changed, 763 insertions(+), 77 deletions(-) create mode 100644 opm/simulators/linalg/bda/WellContributions.cu create mode 100644 opm/simulators/linalg/bda/WellContributions.hpp diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake index ac42c705a..389c72fef 100644 --- a/CMakeLists_files.cmake +++ b/CMakeLists_files.cmake @@ -45,6 +45,7 @@ list (APPEND MAIN_SOURCE_FILES if(CUDA_FOUND) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cusparseSolverBackend.cu) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cu) endif() # originally generated with the command: diff --git a/opm/simulators/linalg/ISTLSolverEbos.hpp b/opm/simulators/linalg/ISTLSolverEbos.hpp index b3ce39320..08d1b022b 100644 --- a/opm/simulators/linalg/ISTLSolverEbos.hpp +++ b/opm/simulators/linalg/ISTLSolverEbos.hpp @@ -240,11 +240,7 @@ protected: const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu); const int maxit = EWOMS_GET_PARAM(TypeTag, int, LinearSolverMaxIter); const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction); - const bool matrix_add_well_contributions = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions); const int linear_solver_verbosity = parameters_.linear_solver_verbosity_; - if (use_gpu && !matrix_add_well_contributions) { - OPM_THROW(std::logic_error,"Error cannot use GPU solver if command line parameter --matrix-add-well-contributions is false, because the GPU solver performs a standard bicgstab"); - } bdaBridge.reset(new BdaBridge(use_gpu, linear_solver_verbosity, maxit, tolerance)); #else const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu); @@ -442,14 +438,20 @@ protected: // tries to solve linear system // solve_system() does nothing if Dune is selected #if HAVE_CUDA - bdaBridge->solve_system(matrix_.get(), istlb, result); + WellContributions wellContribs; + const bool addWellContribs = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions); + const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu); + if(!addWellContribs && use_gpu) + { + simulator_.problem().wellModel().getWellContributions(wellContribs); + } + bdaBridge->solve_system(matrix_.get(), istlb, wellContribs, result); if (result.converged) { // get result vector x from non-Dune backend, iff solve was successful bdaBridge->get_result(x); } else { // CPU fallback, or default case for Dune - const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu); if (use_gpu) { OpmLog::warning("cusparseSolver did not converge, now trying Dune to solve current linear system..."); } diff --git a/opm/simulators/linalg/bda/BdaBridge.cpp b/opm/simulators/linalg/bda/BdaBridge.cpp index a78eb3084..3cae84b08 100644 --- 
a/opm/simulators/linalg/bda/BdaBridge.cpp +++ b/opm/simulators/linalg/bda/BdaBridge.cpp @@ -41,6 +41,7 @@ BdaBridge::BdaBridge(bool use_gpu_, int linear_solver_verbosity, int maxit, doub { if (use_gpu) { backend.reset(new cusparseSolverBackend(linear_solver_verbosity, maxit, tolerance)); + WellContributions::setMode(use_gpu); } } #else @@ -121,7 +122,7 @@ void getSparsityPattern(BridgeMatrix& mat, std::vector &h_rows, std::vector #endif template -void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_UNUSED, InverseOperatorResult &res OPM_UNUSED) +void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_UNUSED, WellContributions& wellContribs OPM_UNUSED, InverseOperatorResult &res OPM_UNUSED) { #if HAVE_CUDA @@ -169,7 +170,7 @@ void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_U typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus; // assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, cusparseSolver is expected to perform undefined behaviour - cusparseSolverStatus status = backend->solve_system(N, nnz, dim, static_cast(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast(&(b[0][0])), result); + cusparseSolverStatus status = backend->solve_system(N, nnz, dim, static_cast(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast(&(b[0][0])), wellContribs, result); switch(status) { case cusparseSolverStatus::CUSPARSE_SOLVER_SUCCESS: //OpmLog::info("cusparseSolver converged"); @@ -210,6 +211,7 @@ Dune::BCRSMatrix, std::allocator, std::allocator > > > \ (Dune::BCRSMatrix, std::allocator > > *mat, \ Dune::BlockVector, std::allocator > > &b, \ + WellContributions& wellContribs, \ InverseOperatorResult &res); template void BdaBridge::solve_system< \ @@ -217,6 +219,7 @@ Dune::BCRSMatrix, std::allocator, std::allocator > > > \ (Dune::BCRSMatrix, std::allocator > > *mat, \ Dune::BlockVector, std::allocator > > &b, \ + WellContributions& wellContribs, \ InverseOperatorResult &res); template void BdaBridge::solve_system< \ @@ -224,6 +227,7 @@ Dune::BCRSMatrix, std::allocator, std::allocator > > > \ (Dune::BCRSMatrix, std::allocator > > *mat, \ Dune::BlockVector, std::allocator > > &b, \ + WellContributions& wellContribs, \ InverseOperatorResult &res); template void BdaBridge::solve_system< \ @@ -231,6 +235,7 @@ Dune::BCRSMatrix, std::allocator, std::allocator > > > \ (Dune::BCRSMatrix, std::allocator > > *mat, \ Dune::BlockVector, std::allocator > > &b, \ + WellContributions& wellContribs, \ InverseOperatorResult &res); template void BdaBridge::get_result< \ diff --git a/opm/simulators/linalg/bda/BdaBridge.hpp b/opm/simulators/linalg/bda/BdaBridge.hpp index d1ae954e8..a0f318e6e 100644 --- a/opm/simulators/linalg/bda/BdaBridge.hpp +++ b/opm/simulators/linalg/bda/BdaBridge.hpp @@ -26,6 +26,8 @@ #include "dune/istl/bcrsmatrix.hh" #include +#include + #if HAVE_CUDA #include #endif @@ -55,11 +57,12 @@ public: /// Solve linear system, A*x = b - /// \param[in] mat matrix A, should be of type Dune::BCRSMatrix - /// \param[in] b vector b, should be of type Dune::BlockVector - /// \param[in] result summary of solver result + /// \param[in] mat matrix A, should be of type Dune::BCRSMatrix + /// \param[in] b vector b, should be of type Dune::BlockVector + /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A + /// \param[inout] result summary of solver result template - 
void solve_system(BridgeMatrix *mat, BridgeVector &b, InverseOperatorResult &result); + void solve_system(BridgeMatrix *mat, BridgeVector &b, WellContributions& wellContribs, InverseOperatorResult &result); /// Get the resulting x vector /// \param[inout] x vector x, should be of type Dune::BlockVector diff --git a/opm/simulators/linalg/bda/WellContributions.cu b/opm/simulators/linalg/bda/WellContributions.cu new file mode 100644 index 000000000..355860952 --- /dev/null +++ b/opm/simulators/linalg/bda/WellContributions.cu @@ -0,0 +1,420 @@ +/* + Copyright 2019 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + + +#include +#include +#include // CMake + +#ifdef __NVCC__ +#include "opm/simulators/linalg/bda/cuda_header.hpp" +#include +#endif + +#include "opm/simulators/linalg/bda/WellContributions.hpp" + +namespace Opm +{ + + // apply WellContributions using y -= C^T * (D^-1 * (B * x)) +#if HAVE_CUDA + __global__ void apply_well_contributions( \ + const double * __restrict__ Cnnzs, \ + const double * __restrict__ Dnnzs, \ + const double * __restrict__ Bnnzs, \ + const int * __restrict__ Ccols, \ + const int * __restrict__ Bcols, \ + const double * __restrict__ x, \ + double * __restrict__ y, \ + const int dim, \ + const int dim_wells, \ + const unsigned int * __restrict__ val_pointers \ + ) + { + const int idx_b = blockIdx.x; + const int idx_t = threadIdx.x; + int idx = idx_b * blockDim.x + idx_t; + const unsigned int val_size = val_pointers[idx_b+1] - val_pointers[idx_b]; + + const int vals_per_block = dim * dim_wells; // 12 + const int num_active_threads = (32/vals_per_block)*vals_per_block; // 24 + const int num_blocks_per_warp = 32/vals_per_block; // 2 + const int lane = idx_t % 32; + const int c = lane % dim; // col in block + const int r = (lane / dim) % dim_wells; // row in block + const int NUM_THREADS = gridDim.x * blockDim.x; + + extern __shared__ double smem[]; + double * __restrict__ z1 = smem; + double * __restrict__ z2 = z1 + dim_wells; + + if(idx_t < dim_wells){ + z1[idx_t] = 0.0; + } + + __syncthreads(); + + // z1 = B * x + if(idx_t < num_active_threads){ + // multiply all blocks with x + double temp = 0.0; + int b = idx_t / vals_per_block + val_pointers[idx_b]; // block id, val_size indicates number of blocks + while(b < val_size + val_pointers[idx_b]){ + int colIdx = Bcols[b]; + temp += Bnnzs[b * dim * dim_wells + r * dim + c] * x[colIdx * dim + c]; + b += num_blocks_per_warp; + } + + // merge all blocks into 1 dim*dim_wells block + // since NORNE has only 2 parallel blocks, do not use a loop + temp += __shfl_down_sync(0x00ffffff, temp, dim * dim_wells); + + b = idx_t / vals_per_block + val_pointers[idx_b]; + + // merge all (dim) columns of 1 block, results in a single 1*dim_wells vector, which is used to multiply with invD + if(idx_t < vals_per_block){ + // should be a loop as well, now only works for dim == 3 + if(c == 0 || c == 2){temp += __shfl_down_sync(0x00000B6D, temp, 
2);} // add col 2 to col 0 + if(c == 0 || c == 1){temp += __shfl_down_sync(0x000006DB, temp, 1);} // add col 1 to col 0 + } + + // write 1*dim_wells vector to gmem, could be replaced with shfl broadcast to remove z1 altogether + if(c == 0 && idx_t < vals_per_block){ + z1[r] = temp; + } + } + + __syncthreads(); + + // z2 = D^-1 * B * x = D^-1 * z1 + if(idx_t < dim_wells){ + double temp = 0.0; + for(int c = 0; c < dim_wells; ++c){ + temp += Dnnzs[idx_b * dim_wells * dim_wells + idx_t * dim_wells + c] * z1[c]; + } + z2[idx_t] = temp; + } + + __syncthreads(); + + // y -= C^T * D^-1 * B * x + // use dim * val_size threads, each block is assigned 'dim' threads + if(idx_t < dim * val_size){ + double temp = 0.0; + int b = idx_t / dim + val_pointers[idx_b]; + int cc = idx_t % dim; + int colIdx = Ccols[b]; + for(unsigned int c = 0; c < dim_wells; ++c){ + temp += Cnnzs[b * dim * dim_wells + c * dim + cc] * z2[c]; + } + y[colIdx * dim + cc] -= temp; + } + + } +#endif + + void WellContributions::alloc_all(){ +#if HAVE_CUDA + if(gpu_mode){ + alloc_gpu(); + }else{ + alloc_cpu(); + } +#else + alloc_cpu(); +#endif + allocated = true; + } + + void WellContributions::alloc_cpu(){ + Cnnzs = new double[num_blocks * dim * dim_wells]; + Dnnzs = new double[num_wells * dim_wells * dim_wells]; + Bnnzs = new double[num_blocks * dim * dim_wells]; + Ccols = new int[num_blocks]; + Bcols = new int[num_blocks]; + val_pointers = new unsigned int[num_wells + 1]; + z1 = new double[dim_wells]; // B * x + z2 = new double[dim_wells]; // D^-1 * B * x + } + +#if HAVE_CUDA + void WellContributions::alloc_gpu(){ + cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells); + cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_wells * dim_wells * dim_wells); + cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells); + cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks); + cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks); + val_pointers = new unsigned int[num_wells + 1]; + cudaMalloc((void**)&d_val_pointers, sizeof(int) * (num_wells + 1)); + cudaCheckLastError("apply_gpu malloc failed"); + } +#endif + + WellContributions::~WellContributions() + { +#if HAVE_CUDA + if(gpu_mode){ + free_gpu(); + }else{ + free_cpu(); + } +#else + free_cpu(); +#endif + } + + void WellContributions::free_cpu(){ + delete[] Cnnzs; + delete[] Dnnzs; + delete[] Bnnzs; + delete[] Ccols; + delete[] Bcols; + delete[] val_pointers; + delete[] z1; + delete[] z2; + //delete[] Mnnzs; + } + +#if HAVE_CUDA + void WellContributions::free_gpu(){ + cudaFree(d_Cnnzs); + cudaFree(d_Dnnzs); + cudaFree(d_Bnnzs); + cudaFree(d_Ccols); + cudaFree(d_Bcols); + delete[] val_pointers; + cudaFree(d_val_pointers); + // cudaFree(d_z1); + // cudaFree(d_z2); + } +#endif + + + void WellContributions::apply(double *x, double *y){ +#if HAVE_CUDA + if (gpu_mode){ + apply_gpu(x, y); + }else{ + apply_cpu(x, y); + } +#else + apply_cpu(x, y); +#endif + } + + // Apply the WellContributions, similar to StandardWell::apply() + // y -= (C^T *(D^-1*( B*x))) + void WellContributions::apply_cpu(double *x, double *y) + { +#if 0 + // Mnnzs contains a sparse matrix with a symmetric pattern + // Mrows would contain 'i*val_size' for every entry i, since every row has the same number of blocks + // Mcols are the same as Ccols, normally, there is an entry for every block, but since all rows have the same sparsity pattern, we only have to store 1 row + bool dbg = false; + for(int i = 0; i < dim*dim*val_size*val_size; ++i){ + if(dbg)printf("Mnnzs[%d]: %.5e\n", i, 
Mnnzs[i]); + } + if(dbg)printf("row_size: %u, val_size: %u\n", row_size, val_size); + for(int r = 0; r < val_size; ++r){ + for(int c = 0; c < val_size; ++c){ + int colIdx = Ccols[c]; + if(dbg)printf("colIdx: %d\n", colIdx); + for(int i = 0; i < dim; ++i){ + double sum = 0.0; + for(int j = 0; j < dim; ++j){ + sum += Mnnzs[r * dim * dim * val_size + c * dim * dim + i * dim + j] * x[colIdx * dim + j]; + } + if(dbg)printf("sum: %f\n", sum); + y[colIdx * dim + i] -= sum; + } + } + } + if(dbg)exit(0); +#else + for(int wellID = 0; wellID < num_wells; ++wellID){ + unsigned int val_size = val_pointers[wellID+1] - val_pointers[wellID]; + + // B * x + for (unsigned int i = 0; i < dim_wells; ++i) { + z1[i] = 0.0; + } + for (unsigned int i = 0; i < val_size; ++i) { + unsigned int blockID = i + val_pointers[wellID]; + int colIdx = Bcols[blockID]; + for (unsigned int j = 0; j < dim_wells; ++j) { + double temp = 0.0; + for (unsigned int k = 0; k < dim; ++k) { + temp += Bnnzs[blockID * dim * dim_wells + j * dim + k] * x[colIdx * dim + k]; + } + z1[j] += temp; + } + } + + // D^-1 * B * x + for (unsigned int i = 0; i < dim_wells; ++i) { + z2[i] = 0.0; + } + for (unsigned int j = 0; j < dim_wells; ++j) { + double temp = 0.0; + for (unsigned int k = 0; k < dim_wells; ++k) { + temp += Dnnzs[wellID * dim_wells * dim_wells + j * dim_wells + k] * z1[k]; + } + z2[j] += temp; + } + + // C^T * D^-1 * B * x + for (unsigned int i = 0; i < val_size; ++i) { + unsigned int blockID = i + val_pointers[wellID]; + int colIdx = Ccols[blockID]; + for (unsigned int j = 0; j < dim; ++j) { + double temp = 0.0; + for (unsigned int k = 0; k < dim_wells; ++k) { + temp += Cnnzs[blockID * dim * dim_wells + j + k * dim] * z2[k]; + } + y[colIdx * dim + j] -= temp; + } + } + } +#endif + } + + + // Apply the WellContributions, similar to StandardWell::apply() + // y -= (C^T *(D^-1*( B*x))) +#if HAVE_CUDA + void WellContributions::apply_gpu(double *d_x, double *d_y) + { + int smem_size = 2 * sizeof(double) * dim_wells; + apply_well_contributions<<>>(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_x, d_y, dim, dim_wells, d_val_pointers); + } +#endif + + void WellContributions::addMatrix(int idx, int *colIndices, double *values, unsigned int val_size) + { +#if HAVE_CUDA + if(gpu_mode){ + addMatrix_gpu(idx, colIndices, values, val_size); + }else{ + addMatrix_cpu(idx, colIndices, values, val_size); + } +#else + addMatrix_cpu(idx, colIndices, values, val_size); +#endif + if(idx == 2){ + num_blocks_so_far += val_size; + } + if(idx == 2){ + num_wells_so_far++; + } + } + + + void WellContributions::addMatrix_cpu(int idx, int *colIndices, double *values, unsigned int val_size) + { + switch (idx) { + case 0: + memcpy(Cnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells); + memcpy(Ccols + num_blocks_so_far, colIndices, sizeof(int) * val_size); + break; + case 1: + memcpy(Dnnzs + num_wells_so_far * dim_wells * dim_wells, values, sizeof(double) * dim_wells * dim_wells); + break; + case 2: + memcpy(Bnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells); + memcpy(Bcols + num_blocks_so_far, colIndices, sizeof(int) * val_size); + val_pointers[num_wells_so_far] = num_blocks_so_far; + if(num_wells_so_far == num_wells - 1){ + val_pointers[num_wells] = num_blocks; + } + break; + case 3: + // store (C*D*B) + printf("ERROR unsupported matrix ID for WellContributions::addMatrix()\n"); + exit(1); + // memcpy(Mnnzs, values, sizeof(double) * dim * dim * val_size * val_size); + // 
memcpy(Ccols, colIndices, sizeof(int) * val_size); + break; + default: + printf("ERROR unknown matrix ID for WellContributions::addMatrix()\n"); + exit(1); + } + } + + +#if HAVE_CUDA + void WellContributions::addMatrix_gpu(int idx, int *colIndices, double *values, unsigned int val_size) + { + switch (idx) { + case 0: + cudaMemcpy(d_Cnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice); + cudaMemcpy(d_Ccols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice); + break; + case 1: + cudaMemcpy(d_Dnnzs + num_wells_so_far * dim_wells * dim_wells, values, sizeof(double) * dim_wells * dim_wells, cudaMemcpyHostToDevice); + break; + case 2: + cudaMemcpy(d_Bnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice); + cudaMemcpy(d_Bcols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice); + val_pointers[num_wells_so_far] = num_blocks_so_far; + if(num_wells_so_far == num_wells - 1){ + val_pointers[num_wells] = num_blocks; + } + cudaMemcpy(d_val_pointers, val_pointers, sizeof(int) * (num_wells+1), cudaMemcpyHostToDevice); + break; + case 3: + // store (C*D*B) + printf("ERROR unsupported matrix ID for WellContributions::addMatrix()\n"); + exit(1); + break; + default: + printf("ERROR unknown matrix ID for WellContributions::addMatrix()\n"); + exit(1); + } + cudaCheckLastError("WellContributions::addMatrix() failed"); + } + + void WellContributions::setCudaStream(cudaStream_t stream_) + { + this->stream = stream_; + } + +#endif + + void WellContributions::addSizes(unsigned int nnz, unsigned int numEq, unsigned int numWellEq) + { + if(allocated){ + std::cerr << "Error cannot add more sizes after allocated in WellContributions" << std::endl; + exit(1); + } + num_blocks += nnz; + dim = numEq; + dim_wells = numWellEq; + num_wells++; + } + + // Default value + bool WellContributions::gpu_mode = false; + + // If HAVE_CUDA is false, use_gpu must be false too + void WellContributions::setMode(bool use_gpu){ + gpu_mode = use_gpu; + } + +} //namespace Opm + diff --git a/opm/simulators/linalg/bda/WellContributions.hpp b/opm/simulators/linalg/bda/WellContributions.hpp new file mode 100644 index 000000000..07773971b --- /dev/null +++ b/opm/simulators/linalg/bda/WellContributions.hpp @@ -0,0 +1,136 @@ +/* + Copyright 2019 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . 
+*/ + +#ifndef WellContributions_H +#define WellContributions_H + +#include + +#if HAVE_CUDA +#include +#endif + +#include + +namespace Opm +{ + + /// This class serves to eliminate the need to include the WellContributions into the matrix (with --matrix-add-well-contributions=true) for the cusparseSolver + /// If the --matrix-add-well-contributions commandline parameter is true, this class should not be used + class WellContributions + { + + private: + static bool gpu_mode; // gpu_mode should be initialized in the ISTLSolverEbos constructor, and its value must not change afterwards + unsigned int num_blocks = 0; // total number of blocks in all wells + unsigned int dim; + unsigned int dim_wells; + unsigned int num_wells = 0; + unsigned int num_blocks_so_far = 0; + unsigned int num_wells_so_far = 0; + unsigned int *val_pointers = nullptr; // val_pointers[wellID] == index of first block for this well in Ccols and Bcols + bool allocated = false; + +#if HAVE_CUDA + double *d_Cnnzs = nullptr; + double *d_Dnnzs = nullptr; + double *d_Bnnzs = nullptr; + int *d_Ccols = nullptr; + int *d_Bcols = nullptr; + double *d_z1 = nullptr; + double *d_z2 = nullptr; + unsigned int *d_val_pointers = nullptr; + cudaStream_t stream; +#endif + + double *Cnnzs = nullptr; + double *Dnnzs = nullptr; + double *Bnnzs = nullptr; + int *Ccols = nullptr; + int *Dcols = nullptr; + int *Bcols = nullptr; + double *z1 = nullptr; // z1 = B * x + double *z2 = nullptr; // z2 = D^-1 * B * x + + /// Apply the WellContributions on CPU + void apply_cpu(double *x, double *y); + + /// Allocate memory on the CPU + void alloc_cpu(); + + /// Free memory on the CPU + void free_cpu(); + + /// Same as addMatrix(), stores matrix on CPU + void addMatrix_cpu(int idx, int *colIndices, double *values, unsigned int val_size); + +#if HAVE_CUDA + /// Apply all wellcontributions on GPU, performs y -= C^T * (D^-1 * (B * x)) + /// Kernel is asynchronous + void apply_gpu(double *d_x, double *d_y); + + /// Allocate memory on the GPU + void alloc_gpu(); + + /// Free memory on the GPU + void free_gpu(); + + /// Same as addMatrix(), stores matrix on GPU + void addMatrix_gpu(int idx, int *colIndices, double *values, unsigned int val_size); +#endif + + public: +#if HAVE_CUDA + /// Set a cudaStream to be used + /// \param[in] stream the cudaStream that is used to launch the kernel in + void setCudaStream(cudaStream_t stream); +#endif + + /// Create a new WellContributions, implementation is empty + WellContributions(){}; + + /// Destroy a WellContributions, and free memory + ~WellContributions(); + + /// Apply all wellcontributions in this object + void apply(double *x, double *y); + + /// Allocate memory for the wellcontributions + void alloc_all(); + + /// Indicate how large the next wellcontributions are, this function cannot be called after alloc_all() is called + void addSizes(unsigned int nnz, unsigned int numEq, unsigned int numWellEq); + + /// Store a matrix in this object, in blocked csr format + void addMatrix(int idx, int *colIndices, double *values, unsigned int val_size); + + /// Return the number of wells added to this object + unsigned int get_num_wells(){ + return num_wells; + } + + /// WellContributions can be applied on CPU or GPU + /// This function sets the static variable, so each WellContributions is applied on the correct hardware + static void setMode(bool use_gpu); + + }; + +} //namespace Opm + +#endif diff --git a/opm/simulators/linalg/bda/cusparseSolverBackend.cu b/opm/simulators/linalg/bda/cusparseSolverBackend.cu index 
547840784..9425d1136 100644 --- a/opm/simulators/linalg/bda/cusparseSolverBackend.cu +++ b/opm/simulators/linalg/bda/cusparseSolverBackend.cu @@ -38,6 +38,10 @@ #include "cusparse_v2.h" // For more information about cusparse, check https://docs.nvidia.com/cuda/cusparse/index.html +// iff true, the nonzeroes of the matrix are copied row-by-row into a contiguous, pinned memory array, then a single GPU memcpy is done +// otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough +#define COPY_ROW_BY_ROW 0 + namespace Opm { @@ -58,7 +62,7 @@ namespace Opm finalize(); } - void cusparseSolverBackend::gpu_pbicgstab(BdaResult& res) { + void cusparseSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) { double t_total1, t_total2; int n = N; double rho = 1.0, rhop; @@ -72,7 +76,11 @@ namespace Opm t_total1 = second(); - cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_x, &zero, d_r); + if(wellContribs.get_num_wells() > 0){ + wellContribs.setCudaStream(stream); + } + + cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r); cublasDscal(cublasHandle, n, &mone, d_r, 1); cublasDaxpy(cublasHandle, n, &one, d_b, 1, d_r, 1); @@ -101,15 +109,20 @@ namespace Opm // apply ilu0 cusparseDbsrsv2_solve(cusparseHandle, order, \ operation, Nb, nnzb, &one, \ - descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_p, d_t, policy, d_buffer); + descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_p, d_t, policy, d_buffer); cusparseDbsrsv2_solve(cusparseHandle, order, \ operation, Nb, nnzb, &one, \ - descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_pw, policy, d_buffer); + descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_pw, policy, d_buffer); // spmv cusparseDbsrmv(cusparseHandle, order, \ operation, Nb, Nb, nnzb, \ - &one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_pw, &zero, d_v); + &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_pw, &zero, d_v); + + // apply wellContributions + if(wellContribs.get_num_wells() > 0){ + wellContribs.apply(d_pw, d_v); + } cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1); alpha = rho / tmp1; @@ -127,15 +140,20 @@ namespace Opm // apply ilu0 cusparseDbsrsv2_solve(cusparseHandle, order, \ operation, Nb, nnzb, &one, \ - descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_r, d_t, policy, d_buffer); + descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_r, d_t, policy, d_buffer); cusparseDbsrsv2_solve(cusparseHandle, order, \ operation, Nb, nnzb, &one, \ - descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_s, policy, d_buffer); + descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_s, policy, d_buffer); // spmv cusparseDbsrmv(cusparseHandle, order, \ operation, Nb, Nb, nnzb, &one, descr_M, \ - d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_s, &zero, d_t); + d_bVals, d_bRows, d_bCols, block_size, d_s, &zero, d_t); + + // apply wellContributions + if(wellContribs.get_num_wells() > 0){ + wellContribs.apply(d_s, d_t); + } cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1); cublasDdot(cublasHandle, n, d_t, 1, d_t, 1, &tmp2); @@ -178,8 +196,8 @@ namespace Opm void cusparseSolverBackend::initialize(int N, int nnz, int dim) { this->N = N; this->nnz = nnz; - this->BLOCK_SIZE = dim; - this->nnzb = nnz/BLOCK_SIZE/BLOCK_SIZE; + this->block_size = dim; + this->nnzb = nnz/block_size/block_size; Nb = 
(N + dim - 1) / dim; std::ostringstream out; out << "Initializing GPU, matrix size: " << N << " blocks, nnz: " << nnzb << " blocks"; @@ -229,8 +247,10 @@ namespace Opm cusparseSetStream(cusparseHandle, stream); cudaCheckLastError("Could not set stream to cusparse"); - cudaMallocHost((void**)&x, sizeof(double) * N); - cudaCheckLastError("Could not allocate pinned host memory"); +#if COPY_ROW_BY_ROW + cudaMallocHost((void**)&vals_contiguous, sizeof(double) * nnz); + cudaCheckLastError("Could not allocate pinned memory"); +#endif initialized = true; } // end initialize() @@ -259,11 +279,10 @@ namespace Opm cusparseDestroyMatDescr(descr_U); cusparseDestroy(cusparseHandle); cublasDestroy(cublasHandle); - cudaHostUnregister(vals); - cudaHostUnregister(cols); - cudaHostUnregister(rows); +#if COPY_ROW_BY_ROW + cudaFreeHost(vals_contiguous); +#endif cudaStreamDestroy(stream); - cudaFreeHost(x); } // end finalize() @@ -274,22 +293,23 @@ namespace Opm t1 = second(); } - // information cudaHostRegister: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1ge8d5c17670f16ac4fc8fcb4181cb490c - // possible flags for cudaHostRegister: cudaHostRegisterDefault, cudaHostRegisterPortable, cudaHostRegisterMapped, cudaHostRegisterIoMemory - cudaHostRegister(vals, nnz * sizeof(double), cudaHostRegisterDefault); - cudaHostRegister(cols, nnz * sizeof(int), cudaHostRegisterDefault); - cudaHostRegister(rows, (Nb+1) * sizeof(int), cudaHostRegisterDefault); - cudaHostRegister(b, N * sizeof(double), cudaHostRegisterDefault); +#if COPY_ROW_BY_ROW + int sum = 0; + for(int i = 0; i < Nb; ++i){ + int size_row = rows[i+1] - rows[i]; + memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size); + sum += size_row * block_size * block_size; + } + cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream); +#else cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream); +#endif + cudaMemcpyAsync(d_bCols, cols, nnz * sizeof(int), cudaMemcpyHostToDevice, stream); cudaMemcpyAsync(d_bRows, rows, (Nb+1) * sizeof(int), cudaMemcpyHostToDevice, stream); cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream); cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream); - this->vals = vals; - this->cols = cols; - this->rows = rows; - if (verbosity > 2) { cudaStreamSynchronize(stream); t2 = second(); @@ -301,14 +321,25 @@ namespace Opm // don't copy rowpointers and colindices, they stay the same - void cusparseSolverBackend::update_system_on_gpu(double *vals, double *b) { + void cusparseSolverBackend::update_system_on_gpu(double *vals, int *rows, double *b) { double t1, t2; if (verbosity > 2) { t1 = second(); } +#if COPY_ROW_BY_ROW + int sum = 0; + for(int i = 0; i < Nb; ++i){ + int size_row = rows[i+1] - rows[i]; + memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size); + sum += size_row * block_size * block_size; + } + cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream); +#else cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream); +#endif + cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream); cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream); @@ -364,11 +395,11 @@ namespace Opm cudaCheckLastError("Could not create analysis info"); cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzb, - descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, 
info_M, &d_bufferSize_M); + descr_M, d_bVals, d_bRows, d_bCols, block_size, info_M, &d_bufferSize_M); cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb, - descr_L, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_L, &d_bufferSize_L); + descr_L, d_bVals, d_bRows, d_bCols, block_size, info_L, &d_bufferSize_L); cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb, - descr_U, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_U, &d_bufferSize_U); + descr_U, d_bVals, d_bRows, d_bCols, block_size, info_U, &d_bufferSize_U); cudaCheckLastError(); d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U)); @@ -377,7 +408,7 @@ namespace Opm // analysis of ilu LU decomposition cusparseDbsrilu02_analysis(cusparseHandle, order, \ Nb, nnzb, descr_B, d_bVals, d_bRows, d_bCols, \ - BLOCK_SIZE, info_M, policy, d_buffer); + block_size, info_M, policy, d_buffer); int structural_zero; cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero); @@ -388,11 +419,11 @@ namespace Opm // analysis of ilu apply cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \ Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, \ - BLOCK_SIZE, info_L, policy, d_buffer); + block_size, info_L, policy, d_buffer); cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \ Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, \ - BLOCK_SIZE, info_U, policy, d_buffer); + block_size, info_U, policy, d_buffer); cudaCheckLastError("Could not analyse level information"); if (verbosity > 2) { @@ -403,6 +434,8 @@ namespace Opm OpmLog::info(out.str()); } + analysis_done = true; + return true; } // end analyse_matrix() @@ -417,7 +450,8 @@ namespace Opm d_mRows = d_bRows; cusparseDbsrilu02(cusparseHandle, order, \ Nb, nnzb, descr_M, d_mVals, d_mRows, d_mCols, \ - BLOCK_SIZE, info_M, policy, d_buffer); + block_size, info_M, policy, d_buffer); + cudaCheckLastError("Could not perform ilu decomposition"); int structural_zero; // cusparseXbsrilu02_zeroPivot() calls cudaDeviceSynchronize() @@ -437,9 +471,9 @@ namespace Opm } // end create_preconditioner() - void cusparseSolverBackend::solve_system(BdaResult &res) { + void cusparseSolverBackend::solve_system(WellContributions& wellContribs, BdaResult &res) { // actually solve - gpu_pbicgstab(res); + gpu_pbicgstab(wellContribs, res); cudaStreamSynchronize(stream); cudaCheckLastError("Something went wrong during the GPU solve"); } // end solve_system() @@ -449,10 +483,6 @@ namespace Opm // caller must be sure that x is a valid array void cusparseSolverBackend::post_process(double *x) { - if (!initialized) { - cudaHostRegister(x, N * sizeof(double), cudaHostRegisterDefault); - } - double t1, t2; if (verbosity > 2) { t1 = second(); @@ -472,12 +502,12 @@ namespace Opm typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus; - cusparseSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, BdaResult &res) { + cusparseSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) { if (initialized == false) { initialize(N, nnz, dim); copy_system_to_gpu(vals, rows, cols, b); }else{ - update_system_on_gpu(vals, b); + update_system_on_gpu(vals, rows, b); } if (analysis_done == false) { if (!analyse_matrix()) { @@ -486,7 +516,7 @@ namespace Opm } reset_prec_on_gpu(); if (create_preconditioner()) { - solve_system(res); + solve_system(wellContribs, 
res); }else{ return cusparseSolverStatus::CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED; } diff --git a/opm/simulators/linalg/bda/cusparseSolverBackend.hpp b/opm/simulators/linalg/bda/cusparseSolverBackend.hpp index e8d88ca29..b8648ef15 100644 --- a/opm/simulators/linalg/bda/cusparseSolverBackend.hpp +++ b/opm/simulators/linalg/bda/cusparseSolverBackend.hpp @@ -25,6 +25,7 @@ #include "cusparse_v2.h" #include "opm/simulators/linalg/bda/BdaResult.hpp" +#include namespace Opm { @@ -50,13 +51,11 @@ private: int *d_bRows, *d_mRows; double *d_x, *d_b, *d_r, *d_rw, *d_p; double *d_pw, *d_s, *d_t, *d_v; - double *vals; - int *cols, *rows; - double *x, *b; void *d_buffer; int N, Nb, nnz, nnzb; + double *vals_contiguous; - int BLOCK_SIZE; + int block_size; bool initialized = false; bool analysis_done = false; @@ -70,8 +69,9 @@ private: int verbosity = 0; /// Solve linear system using ilu0-bicgstab - /// \param[inout] res summary of solver result - void gpu_pbicgstab(BdaResult& res); + /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A + /// \param[inout] res summary of solver result + void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res); /// Initialize GPU and allocate memory /// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks @@ -83,16 +83,17 @@ private: void finalize(); /// Copy linear system to GPU - /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values + /// \param[in] vals array of nonzeroes, each block is stored row-wise, contains nnz values /// \param[in] rows array of rowPointers, contains N/dim+1 values /// \param[in] cols array of columnIndices, contains nnz values /// \param[in] b input vector, contains N values void copy_system_to_gpu(double *vals, int *rows, int *cols, double *b); // Update linear system on GPU, don't copy rowpointers and colindices, they stay the same - /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values + /// \param[in] vals array of nonzeroes, each block is stored row-wise, contains nnz values + /// \param[in] rows array of rowPointers, contains N/dim+1 values, only used if COPY_ROW_BY_ROW is true /// \param[in] b input vector, contains N values - void update_system_on_gpu(double *vals, double *b); + void update_system_on_gpu(double *vals, int *rows, double *b); /// Reset preconditioner on GPU, ilu0-decomposition is done inplace by cusparse void reset_prec_on_gpu(); @@ -106,8 +107,9 @@ private: bool create_preconditioner(); /// Solve linear system - /// \param[inout] res summary of solver result - void solve_system(BdaResult &res); + /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A + /// \param[inout] res summary of solver result + void solve_system(WellContributions& wellContribs, BdaResult &res); public: @@ -128,16 +130,17 @@ public: ~cusparseSolverBackend(); /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format - /// \param[in] N number of rows, divide by dim to get number of blockrows - /// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks - /// \param[in] dim size of block - /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values - /// \param[in] rows array of rowPointers, contains N/dim+1 values - /// \param[in] cols array of columnIndices, contains nnz values - /// \param[in] b 
input vector, contains N values
-    /// \param[inout] res         summary of solver result
-    /// \return                   status code
-    cusparseSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, BdaResult &res);
+    /// \param[in] N              number of rows, divide by dim to get number of blockrows
+    /// \param[in] nnz            number of nonzeroes, divide by dim*dim to get number of blocks
+    /// \param[in] dim            size of block
+    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
+    /// \param[in] rows           array of rowPointers, contains N/dim+1 values
+    /// \param[in] cols           array of columnIndices, contains nnz values
+    /// \param[in] b              input vector, contains N values
+    /// \param[in] wellContribs   contains all WellContributions, to apply them separately, instead of adding them to matrix A
+    /// \param[inout] res         summary of solver result
+    /// \return                   status code
+    cusparseSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res);
 
     /// Post processing after linear solve, now only copies resulting x vector back
     /// \param[inout] x          resulting x vector, caller must guarantee that x points to a valid array
diff --git a/opm/simulators/wells/BlackoilWellModel.hpp b/opm/simulators/wells/BlackoilWellModel.hpp
index e559aba5d..a5c9e817d 100644
--- a/opm/simulators/wells/BlackoilWellModel.hpp
+++ b/opm/simulators/wells/BlackoilWellModel.hpp
@@ -192,6 +192,9 @@ namespace Opm {
             // subtract B*inv(D)*C * x from A*x
             void apply(const BVector& x, BVector& Ax) const;
 
+            // accumulate the contributions of all Wells in the WellContributions object
+            void getWellContributions(WellContributions& wellContribs) const;
+
             // apply well model with scaling of alpha
             void applyScaleAdd(const Scalar alpha, const BVector& x, BVector& Ax) const;
 
diff --git a/opm/simulators/wells/BlackoilWellModel_impl.hpp b/opm/simulators/wells/BlackoilWellModel_impl.hpp
index ad47b484f..8e7ccdacc 100644
--- a/opm/simulators/wells/BlackoilWellModel_impl.hpp
+++ b/opm/simulators/wells/BlackoilWellModel_impl.hpp
@@ -919,7 +919,25 @@ namespace Opm {
     }
 
-
+    template<typename TypeTag>
+    void
+    BlackoilWellModel<TypeTag>::
+    getWellContributions(WellContributions& wellContribs) const
+    {
+        for(unsigned int i = 0; i < well_container_.size(); i++){
+            auto& well = well_container_[i];
+            std::shared_ptr<StandardWell<TypeTag> > derived = std::dynamic_pointer_cast<StandardWell<TypeTag> >(well);
+            unsigned int nnz, numEq, numWellEq;
+            derived->getWellSizes(nnz, numEq, numWellEq);
+            wellContribs.addSizes(nnz, numEq, numWellEq);
+        }
+        wellContribs.alloc_all();
+        for(unsigned int i = 0; i < well_container_.size(); i++){
+            auto& well = well_container_[i];
+            std::shared_ptr<StandardWell<TypeTag> > derived = std::dynamic_pointer_cast<StandardWell<TypeTag> >(well);
+            derived->addWellContribution(wellContribs);
+        }
+    }
 
     // Ax = Ax - alpha * C D^-1 B x
diff --git a/opm/simulators/wells/StandardWell.hpp b/opm/simulators/wells/StandardWell.hpp
index 69a8a9629..5789fd7cd 100644
--- a/opm/simulators/wells/StandardWell.hpp
+++ b/opm/simulators/wells/StandardWell.hpp
@@ -184,6 +184,12 @@ namespace Opm
         /// r = r - C D^-1 Rw
         virtual void apply(BVector& r) const override;
 
+        /// add the contribution (C, D^-1, B matrices) of this Well to the WellContributions object
+        void addWellContribution(WellContributions& wellContribs) const;
+
+        /// get the sizes of the C, D^-1 and B matrices, used to allocate memory in a WellContributions object
+        void getWellSizes(unsigned int& _nnzs, unsigned int& _numEq, unsigned int& _numWellEq) const;
+
         /// using the solution x to recover the solution xw for wells and applying
         /// xw to update Well State
         virtual void recoverWellSolutionAndUpdateWellState(const BVector& x,
diff --git a/opm/simulators/wells/StandardWell_impl.hpp b/opm/simulators/wells/StandardWell_impl.hpp
index c9bfabd34..7144d6429 100644
--- a/opm/simulators/wells/StandardWell_impl.hpp
+++ b/opm/simulators/wells/StandardWell_impl.hpp
@@ -2736,6 +2736,65 @@ namespace Opm
     }
 
 
+    template<typename TypeTag>
+    void
+    StandardWell<TypeTag>::
+    addWellContribution(WellContributions& wellContribs) const
+    {
+        std::vector<int> colIndices;
+        std::vector<double> nnzValues;
+        colIndices.reserve(duneB_.nonzeroes());
+        nnzValues.reserve(duneB_.nonzeroes() * numStaticWellEq * numEq);
+
+        // duneC
+        for ( auto colC = duneC_[0].begin(), endC = duneC_[0].end(); colC != endC; ++colC )
+        {
+            colIndices.emplace_back(colC.index());
+            for (int i = 0; i < numStaticWellEq; ++i) {
+                for (int j = 0; j < numEq; ++j) {
+                    nnzValues.emplace_back((*colC)[i][j]);
+                }
+            }
+        }
+        wellContribs.addMatrix(0, colIndices.data(), nnzValues.data(), duneC_.nonzeroes());
+
+        // invDuneD
+        colIndices.clear();
+        nnzValues.clear();
+        colIndices.emplace_back(0);
+        for (int i = 0; i < numStaticWellEq; ++i)
+        {
+            for (int j = 0; j < numStaticWellEq; ++j) {
+                nnzValues.emplace_back(invDuneD_[0][0][i][j]);
+            }
+        }
+        wellContribs.addMatrix(1, colIndices.data(), nnzValues.data(), 1);
+
+        // duneB
+        colIndices.clear();
+        nnzValues.clear();
+        for ( auto colB = duneB_[0].begin(), endB = duneB_[0].end(); colB != endB; ++colB )
+        {
+            colIndices.emplace_back(colB.index());
+            for (int i = 0; i < numStaticWellEq; ++i) {
+                for (int j = 0; j < numEq; ++j) {
+                    nnzValues.emplace_back((*colB)[i][j]);
+                }
+            }
+        }
+        wellContribs.addMatrix(2, colIndices.data(), nnzValues.data(), duneB_.nonzeroes());
+    }
+
+
+    template<typename TypeTag>
+    void
+    StandardWell<TypeTag>::
+    getWellSizes(unsigned int& _nnzs, unsigned int& _numEq, unsigned int& _numWellEq) const
+    {
+        _nnzs = duneB_.nonzeroes();
+        _numEq = numEq;
+        _numWellEq = numStaticWellEq;
+    }
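
Notes on the y -= C^T * (D^-1 * (B * x)) operation implemented by this patch. This is the standard matrix-free application of the well Schur complement (the sign conventions below are inferred from StandardWell::apply() and the kernel comment, not stated explicitly in the patch): eliminating the well unknowns x_w from the coupled reservoir/well system gives

\[
\begin{pmatrix} A & C^{\mathsf T} \\ B & D \end{pmatrix}
\begin{pmatrix} x \\ x_w \end{pmatrix}
=
\begin{pmatrix} b \\ b_w \end{pmatrix}
\quad\Longrightarrow\quad
\bigl(A - C^{\mathsf T} D^{-1} B\bigr)\, x \;=\; b - C^{\mathsf T} D^{-1} b_w .
\]

Each BiCGSTAB iteration therefore computes y = A*x with cusparseDbsrmv and then corrects y -= C^T * (D^-1 * (B * x)) via WellContributions::apply(), instead of assembling A - C^T * D^-1 * B explicitly, which is what --matrix-add-well-contributions=true used to require.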
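
On the CUDA kernel's magic constants (this decoding is inferred from the code, not documented in the patch): the masks are tied to the NORNE block shape dim = 3, dim_wells = 4, so vals_per_block = 12, two blocks fit in a 32-lane warp, and 24 lanes are active. 0x00ffffff selects lanes 0-23 for the shfl_down by 12 that folds the second block onto the first. Within the remaining 12 lanes, 0x00000B6D (binary 101101101101) is exactly the set of lanes with column index c in {0, 2}, and 0x000006DB (binary 011011011011) the lanes with c in {0, 1}, so the two shuffles accumulate column 2 and then column 1 into column 0 of each row. Other dim values would need different masks, which is why the in-code comment flags the dim == 3 restriction.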
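
The fill-and-apply protocol of WellContributions is spread over BlackoilWellModel::getWellContributions() and StandardWell::addWellContribution() above. A minimal stand-alone sketch of that protocol on the CPU path (a hypothetical driver, not part of the patch; one well, numEq = 3, numWellEq = 2, a single perforation, and D passed already inverted, since the class stores D^-1):

    #include <cstdio>
    #include <opm/simulators/linalg/bda/WellContributions.hpp>

    int main()
    {
        Opm::WellContributions::setMode(false);   // select the CPU path before any object is used

        Opm::WellContributions wc;
        wc.addSizes(1, 3, 2);                     // one call per well: #blocks, numEq, numWellEq
        wc.alloc_all();                           // no addSizes() calls allowed after this

        // Dense blocks, row-major: B and C are numWellEq x numEq, D^-1 is numWellEq x numWellEq.
        double C[6]    = {1, 0, 0,  0, 1, 0};
        double Dinv[4] = {2, 0,  0, 2};
        double B[6]    = {1, 1, 0,  0, 0, 1};
        int col = 0;                              // cell (column) index of the perforation

        wc.addMatrix(0, &col, C, 1);              // idx 0: C blocks
        wc.addMatrix(1, &col, Dinv, 1);           // idx 1: D^-1, one block per well
        wc.addMatrix(2, &col, B, 1);              // idx 2: B blocks, must come last (advances counters)

        double x[3] = {1, 2, 3};
        double y[3] = {0, 0, 0};
        wc.apply(x, y);                           // y -= C^T * (D^-1 * (B * x))

        // B*x = (3, 3); D^-1*(B*x) = (6, 6); C^T*(...) = (6, 6, 0); hence y = (-6, -6, 0).
        std::printf("%g %g %g\n", y[0], y[1], y[2]);
        return 0;
    }

Note the ordering constraint baked into addMatrix(): only idx == 2 advances num_blocks_so_far and num_wells_so_far, so C and D^-1 must be stored before B for each well, exactly the order addWellContribution() uses.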
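
On the GPU path the same object launches apply_well_contributions instead. A hypothetical fragment (assumes a CUDA build with HAVE_CUDA, setMode(true) called before alloc_all(), an object wc filled as above but with the block shape numEq = 3, numWellEq = 4, the only shape the kernel's shuffle masks currently support, and host vectors x, y of length N):

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    double *d_x = nullptr, *d_y = nullptr;        // device copies of x and y
    cudaMalloc((void**)&d_x, sizeof(double) * N);
    cudaMalloc((void**)&d_y, sizeof(double) * N);
    cudaMemcpy(d_x, x, sizeof(double) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, sizeof(double) * N, cudaMemcpyHostToDevice);

    wc.setCudaStream(stream);                     // same stream the solver kernels run on
    wc.apply(d_x, d_y);                           // asynchronous kernel launch
    cudaStreamSynchronize(stream);                // wait before reading d_y back

    cudaMemcpy(y, d_y, sizeof(double) * N, cudaMemcpyDeviceToHost);

This mirrors what cusparseSolverBackend::gpu_pbicgstab() does around its cusparseDbsrmv calls: set the stream once, then apply the contributions after each sparse matrix-vector product.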