Merge pull request #2475 from Tongdongq/separate_wellcontributions_for_gpu

Separate wellcontributions for gpu
Markus Blatt, 2020-03-20 12:14:06 +01:00 (committed by GitHub)
commit c0a82ff335
13 changed files with 649 additions and 161 deletions

View File

@@ -46,7 +46,8 @@ if(NOT CMAKE_DISABLE_FIND_PACKAGE_CUDA AND
 # While the documentation says that it is deprecated, FindCUDA seems the
 # only easy way to determine the cublas and cusparse libraries.
 # Hence we call it unconditionally
-find_package(CUDA)
+# The WellContributions kernel uses __shfl_down_sync, which was introduced in CUDA 9.0
+find_package(CUDA 9.0)
 endif()
 if(CUDA_FOUND)
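
For context: __shfl_down_sync is the warp-level shuffle primitive the new kernel relies on; the _sync variants, which take an explicit lane-participation mask, only exist since CUDA 9.0, hence the version bump. A minimal standalone sketch of a warp sum reduction with it (illustration only, not code from this PR):

#include <cuda_runtime.h>

// Warp-level sum reduction with __shfl_down_sync (requires CUDA >= 9.0).
// Each of the 32 lanes contributes one value; lane 0 ends up with the total.
__global__ void warp_sum(const double *in, double *out)
{
    double v = in[threadIdx.x];
    for (int offset = 16; offset > 0; offset /= 2) {
        // 0xffffffff: all 32 lanes of the warp participate
        v += __shfl_down_sync(0xffffffff, v, offset);
    }
    if (threadIdx.x == 0) {
        *out = v;
    }
}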

View File

@@ -29,7 +29,6 @@ list (APPEND MAIN_SOURCE_FILES
 opm/simulators/flow/MissingFeatures.cpp
 opm/simulators/linalg/ExtractParallelGridInformationToISTL.cpp
 opm/simulators/linalg/setupPropertyTree.cpp
-opm/simulators/linalg/bda/BdaBridge.cpp
 opm/simulators/timestepping/TimeStepControl.cpp
 opm/simulators/timestepping/AdaptiveSimulatorTimer.cpp
 opm/simulators/timestepping/SimulatorTimer.cpp
@@ -44,6 +43,8 @@ list (APPEND MAIN_SOURCE_FILES
 if(CUDA_FOUND)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cusparseSolverBackend.cu)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cu)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
 endif()
 if(MPI_FOUND)
   list(APPEND MAIN_SOURCE_FILES opm/simulators/utils/ParallelEclipseState.cpp
@@ -142,6 +143,7 @@ list (APPEND PUBLIC_HEADER_FILES
 opm/simulators/linalg/bda/BdaResult.hpp
 opm/simulators/linalg/bda/cuda_header.hpp
 opm/simulators/linalg/bda/cusparseSolverBackend.hpp
+opm/simulators/linalg/bda/WellContributions.hpp
 opm/simulators/linalg/BlackoilAmg.hpp
 opm/simulators/linalg/BlackoilAmgCpr.hpp
 opm/simulators/linalg/amgcpr.hh

View File

@@ -45,7 +45,9 @@
 #include <opm/common/utility/platform_dependent/reenable_warnings.h>

+#if HAVE_CUDA
 #include <opm/simulators/linalg/bda/BdaBridge.hpp>
+#endif

 BEGIN_PROPERTIES
@@ -338,15 +340,16 @@ protected:
         converged_(false)
     {
         parameters_.template init<TypeTag>();
+        const auto& gridForConn = simulator_.vanguard().grid();
 #if HAVE_CUDA
-        const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
+        bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
+        if (gridForConn.comm().size() > 1 && use_gpu) {
+            OpmLog::warning("Warning cannot use GPU with MPI, GPU is disabled");
+            use_gpu = false;
+        }
         const int maxit = EWOMS_GET_PARAM(TypeTag, int, LinearSolverMaxIter);
         const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction);
-        const bool matrix_add_well_contributions = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
         const int linear_solver_verbosity = parameters_.linear_solver_verbosity_;
-        if (use_gpu && !matrix_add_well_contributions) {
-            OPM_THROW(std::logic_error,"Error cannot use GPU solver if command line parameter --matrix-add-well-contributions is false, because the GPU solver performs a standard bicgstab");
-        }
         bdaBridge.reset(new BdaBridge(use_gpu, linear_solver_verbosity, maxit, tolerance));
 #else
         const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
@@ -355,7 +358,6 @@ protected:
         }
 #endif
         extractParallelGridInformationToISTL(simulator_.vanguard().grid(), parallelInformation_);
-        const auto& gridForConn = simulator_.vanguard().grid();
         const auto wellsForConn = simulator_.vanguard().schedule().getWellsatEnd();
         const bool useWellConn = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
@@ -569,22 +571,33 @@ protected:
 #endif
         {
             // tries to solve linear system
-            // solve_system() does nothing if Dune is selected
 #if HAVE_CUDA
-            bdaBridge->solve_system(matrix_.get(), istlb, result);
-            if (result.converged) {
-                // get result vector x from non-Dune backend, iff solve was successful
-                bdaBridge->get_result(x);
-            } else {
-                // CPU fallback, or default case for Dune
-                const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
-                if (use_gpu) {
-                    OpmLog::warning("cusparseSolver did not converge, now trying Dune to solve current linear system...");
+            bool use_gpu = bdaBridge->getUseGpu();
+            if (use_gpu) {
+                WellContributions wellContribs;
+                const bool addWellContribs = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
+                if (!addWellContribs) {
+                    simulator_.problem().wellModel().getWellContributions(wellContribs);
                 }
+                bdaBridge->solve_system(matrix_.get(), istlb, wellContribs, result);
+                if (result.converged) {
+                    // get result vector x from non-Dune backend, iff solve was successful
+                    bdaBridge->get_result(x);
+                } else {
+                    // CPU fallback
+                    use_gpu = bdaBridge->getUseGpu(); // update value, BdaBridge might have disabled cusparseSolver
+                    if (use_gpu) {
+                        OpmLog::warning("cusparseSolver did not converge, now trying Dune to solve current linear system...");
+                    }
+                    // call Dune
+                    auto precond = constructPrecond(linearOperator, parallelInformation_arg);
+                    solve(linearOperator, x, istlb, *sp, *precond, result);
+                }
+            } else { // gpu is not selected or disabled
                 auto precond = constructPrecond(linearOperator, parallelInformation_arg);
                 solve(linearOperator, x, istlb, *sp, *precond, result);
-            } // end Dune call
+            }
 #else
             // Construct preconditioner.
             auto precond = constructPrecond(linearOperator, parallelInformation_arg);

View File

@@ -35,7 +35,6 @@ typedef Dune::InverseOperatorResult InverseOperatorResult;
 namespace Opm
 {

-#if HAVE_CUDA
 BdaBridge::BdaBridge(bool use_gpu_, int linear_solver_verbosity, int maxit, double tolerance)
 : use_gpu(use_gpu_)
 {
@@ -43,15 +42,9 @@ BdaBridge::BdaBridge(bool use_gpu_, int linear_solver_verbosity, int maxit, doub
         backend.reset(new cusparseSolverBackend(linear_solver_verbosity, maxit, tolerance));
     }
 }
-#else
-BdaBridge::BdaBridge(bool use_gpu_ OPM_UNUSED, int linear_solver_verbosity OPM_UNUSED, int maxit OPM_UNUSED, double tolerance OPM_UNUSED)
-{
-}
-#endif

-#if HAVE_CUDA
 template <class BridgeMatrix>
 int checkZeroDiagonal(BridgeMatrix& mat) {
     static std::vector<typename BridgeMatrix::size_type> diag_indices; // contains offsets of the diagonal nnzs
@@ -118,13 +111,11 @@ void getSparsityPattern(BridgeMatrix& mat, std::vector<int> &h_rows, std::vector
     }
 } // end getSparsityPattern()
-#endif

 template <class BridgeMatrix, class BridgeVector>
-void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_UNUSED, InverseOperatorResult &res OPM_UNUSED)
+void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_UNUSED, WellContributions& wellContribs OPM_UNUSED, InverseOperatorResult &res OPM_UNUSED)
 {
-#if HAVE_CUDA
     if (use_gpu) {
         BdaResult result;
         result.converged = false;
@@ -137,6 +128,7 @@ void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_U
         if (dim != 3) {
             OpmLog::warning("cusparseSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program");
             use_gpu = false;
+            return;
         }

         if (h_rows.capacity() == 0) {
@@ -169,7 +161,7 @@ void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_U
         typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus;
         // assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, cusparseSolver is expected to perform undefined behaviour
-        cusparseSolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), result);
+        cusparseSolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), wellContribs, result);
         switch(status) {
         case cusparseSolverStatus::CUSPARSE_SOLVER_SUCCESS:
             //OpmLog::info("cusparseSolver converged");
@@ -192,61 +184,62 @@ void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_U
     }else{
         res.converged = false;
     }
-#endif // HAVE_CUDA
 }

 template <class BridgeVector>
 void BdaBridge::get_result(BridgeVector &x OPM_UNUSED) {
-#if HAVE_CUDA
     if (use_gpu) {
         backend->post_process(static_cast<double*>(&(x[0][0])));
     }
-#endif
 }

 template void BdaBridge::solve_system<
     Dune::BCRSMatrix<Opm::MatrixBlock<double, 1, 1>, std::allocator<Opm::MatrixBlock<double, 1, 1> > > ,
     Dune::BlockVector<Dune::FieldVector<double, 1>, std::allocator<Dune::FieldVector<double, 1> > > >
     (Dune::BCRSMatrix<Opm::MatrixBlock<double, 1, 1>, std::allocator<Opm::MatrixBlock<double, 1, 1> > > *mat,
     Dune::BlockVector<Dune::FieldVector<double, 1>, std::allocator<Dune::FieldVector<double, 1> > > &b,
+    WellContributions& wellContribs,
     InverseOperatorResult &res);

 template void BdaBridge::solve_system<
     Dune::BCRSMatrix<Opm::MatrixBlock<double, 2, 2>, std::allocator<Opm::MatrixBlock<double, 2, 2> > > ,
     Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > >
     (Dune::BCRSMatrix<Opm::MatrixBlock<double, 2, 2>, std::allocator<Opm::MatrixBlock<double, 2, 2> > > *mat,
     Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > &b,
+    WellContributions& wellContribs,
     InverseOperatorResult &res);

 template void BdaBridge::solve_system<
     Dune::BCRSMatrix<Opm::MatrixBlock<double, 3, 3>, std::allocator<Opm::MatrixBlock<double, 3, 3> > > ,
     Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > >
     (Dune::BCRSMatrix<Opm::MatrixBlock<double, 3, 3>, std::allocator<Opm::MatrixBlock<double, 3, 3> > > *mat,
     Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > &b,
+    WellContributions& wellContribs,
     InverseOperatorResult &res);

 template void BdaBridge::solve_system<
     Dune::BCRSMatrix<Opm::MatrixBlock<double, 4, 4>, std::allocator<Opm::MatrixBlock<double, 4, 4> > > ,
     Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > >
     (Dune::BCRSMatrix<Opm::MatrixBlock<double, 4, 4>, std::allocator<Opm::MatrixBlock<double, 4, 4> > > *mat,
     Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > &b,
+    WellContributions& wellContribs,
     InverseOperatorResult &res);

 template void BdaBridge::get_result<
     Dune::BlockVector<Dune::FieldVector<double, 1>, std::allocator<Dune::FieldVector<double, 1> > > >
     (Dune::BlockVector<Dune::FieldVector<double, 1>, std::allocator<Dune::FieldVector<double, 1> > > &x);

 template void BdaBridge::get_result<
     Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > >
     (Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > &x);

 template void BdaBridge::get_result<
     Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > >
     (Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > &x);

 template void BdaBridge::get_result<
     Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > >
     (Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > &x);

View File

@@ -21,14 +21,19 @@
 #define BDABRIDGE_HEADER_INCLUDED

 #include <config.h>
+
+#if ! HAVE_CUDA
+#error "This file should only be included if CUDA is found"
+#endif
+
 #include "dune/istl/solver.hh" // for struct InverseOperatorResult
 #include "dune/istl/bcrsmatrix.hh"
 #include <opm/simulators/linalg/matrixblock.hh>
-#if HAVE_CUDA
+#include <opm/simulators/linalg/bda/WellContributions.hpp>
 #include <opm/simulators/linalg/bda/cusparseSolverBackend.hpp>
-#endif

 namespace Opm
 {
@@ -40,10 +45,8 @@ typedef Dune::InverseOperatorResult InverseOperatorResult;
 class BdaBridge
 {
 private:
-#if HAVE_CUDA
     std::unique_ptr<cusparseSolverBackend> backend;
     bool use_gpu;
-#endif

 public:
     /// Construct a BdaBridge
@@ -55,17 +58,24 @@ public:
     /// Solve linear system, A*x = b
     /// \param[in] mat matrix A, should be of type Dune::BCRSMatrix
     /// \param[in] b vector b, should be of type Dune::BlockVector
-    /// \param[in] result summary of solver result
+    /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
+    /// \param[inout] result summary of solver result
     template <class BridgeMatrix, class BridgeVector>
-    void solve_system(BridgeMatrix *mat, BridgeVector &b, InverseOperatorResult &result);
+    void solve_system(BridgeMatrix *mat, BridgeVector &b, WellContributions& wellContribs, InverseOperatorResult &result);

     /// Get the resulting x vector
     /// \param[inout] x vector x, should be of type Dune::BlockVector
     template <class BridgeVector>
     void get_result(BridgeVector &x);

+    /// Return whether the BdaBridge will use the GPU or not
+    /// \return whether the BdaBridge will use the GPU or not
+    bool getUseGpu(){
+        return use_gpu;
+    }

 }; // end class BdaBridge

 }
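
Taken together with the ISTLSolverEbos changes above, the bridge is driven roughly like this (a condensed illustrative sketch, not verbatim code from the PR; matrix, istlb, x, result and wellModel stand in for the actual solver members):

    BdaBridge bridge(/*use_gpu=*/true, /*linear_solver_verbosity=*/0, /*maxit=*/200, /*tolerance=*/1e-2);
    if (bridge.getUseGpu()) {
        WellContributions wellContribs;
        wellModel.getWellContributions(wellContribs); // gather C, D^-1 and B of every StandardWell
        bridge.solve_system(&matrix, istlb, wellContribs, result);
        if (result.converged) {
            bridge.get_result(x); // copy the solution vector back from the GPU
        } else {
            // fall back to the Dune solver on the CPU
        }
    }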

View File

@ -0,0 +1,218 @@
/*
Copyright 2020 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <cstdlib>
#include <cstring>
#include <config.h> // CMake
#ifdef __NVCC__
#include "opm/simulators/linalg/bda/cuda_header.hpp"
#include <cuda_runtime.h>
#endif
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include "opm/simulators/linalg/bda/WellContributions.hpp"
namespace Opm
{
// apply WellContributions using y -= C^T * (D^-1 * (B * x))
__global__ void apply_well_contributions(
    const double * __restrict__ Cnnzs,
    const double * __restrict__ Dnnzs,
    const double * __restrict__ Bnnzs,
    const int * __restrict__ Ccols,
    const int * __restrict__ Bcols,
    const double * __restrict__ x,
    double * __restrict__ y,
    const int dim,
    const int dim_wells,
    const unsigned int * __restrict__ val_pointers
    )
{
    const int idx_b = blockIdx.x;
    const int idx_t = threadIdx.x;
    const unsigned int val_size = val_pointers[idx_b+1] - val_pointers[idx_b];

    const int vals_per_block = dim * dim_wells;                        // 12
    const int num_active_threads = (32/vals_per_block)*vals_per_block; // 24
    const int num_blocks_per_warp = 32/vals_per_block;                 // 2
    const int lane = idx_t % 32;
    const int c = lane % dim;                // col in block
    const int r = (lane / dim) % dim_wells;  // row in block

    extern __shared__ double smem[];
    double * __restrict__ z1 = smem;
    double * __restrict__ z2 = z1 + dim_wells;

    if(idx_t < dim_wells){
        z1[idx_t] = 0.0;
    }

    __syncthreads();

    // z1 = B * x
    if(idx_t < num_active_threads){
        // multiply all blocks with x
        double temp = 0.0;
        int b = idx_t / vals_per_block + val_pointers[idx_b]; // block id, val_size indicates number of blocks
        while(b < val_size + val_pointers[idx_b]){
            int colIdx = Bcols[b];
            temp += Bnnzs[b * dim * dim_wells + r * dim + c] * x[colIdx * dim + c];
            b += num_blocks_per_warp;
        }

        // merge all blocks into 1 dim*dim_wells block
        // since NORNE has only 2 parallel blocks, do not use a loop
        temp += __shfl_down_sync(0x00ffffff, temp, dim * dim_wells);

        b = idx_t / vals_per_block + val_pointers[idx_b];

        // merge all (dim) columns of 1 block, results in a single 1*dim_wells vector, which is used to multiply with invD
        if(idx_t < vals_per_block){
            // should be a loop as well, now only works for dim == 3
            if(c == 0 || c == 2){temp += __shfl_down_sync(0x00000B6D, temp, 2);} // add col 2 to col 0
            if(c == 0 || c == 1){temp += __shfl_down_sync(0x000006DB, temp, 1);} // add col 1 to col 0
        }

        // write 1*dim_wells vector to gmem, could be replaced with shfl broadcast to remove z1 altogether
        if(c == 0 && idx_t < vals_per_block){
            z1[r] = temp;
        }
    }

    __syncthreads();

    // z2 = D^-1 * B * x = D^-1 * z1
    if(idx_t < dim_wells){
        double temp = 0.0;
        for(int c = 0; c < dim_wells; ++c){
            temp += Dnnzs[idx_b * dim_wells * dim_wells + idx_t * dim_wells + c] * z1[c];
        }
        z2[idx_t] = temp;
    }

    __syncthreads();

    // y -= C^T * D^-1 * B * x
    // use dim * val_size threads, each block is assigned 'dim' threads
    if(idx_t < dim * val_size){
        double temp = 0.0;
        int b = idx_t / dim + val_pointers[idx_b];
        int cc = idx_t % dim;
        int colIdx = Ccols[b];
        for(unsigned int c = 0; c < dim_wells; ++c){
            temp += Cnnzs[b * dim * dim_wells + c * dim + cc] * z2[c];
        }
        y[colIdx * dim + cc] -= temp;
    }
}

void WellContributions::alloc()
{
    cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
    cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_wells * dim_wells * dim_wells);
    cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
    cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
    cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
    val_pointers = new unsigned int[num_wells + 1];
    cudaMalloc((void**)&d_val_pointers, sizeof(int) * (num_wells + 1));
    cudaCheckLastError("apply_gpu malloc failed");
    allocated = true;
}

WellContributions::~WellContributions()
{
    cudaFree(d_Cnnzs);
    cudaFree(d_Dnnzs);
    cudaFree(d_Bnnzs);
    cudaFree(d_Ccols);
    cudaFree(d_Bcols);
    delete[] val_pointers;
    cudaFree(d_val_pointers);
}

// Apply the WellContributions, similar to StandardWell::apply()
// y -= (C^T *(D^-1*( B*x)))
void WellContributions::apply(double *d_x, double *d_y)
{
    int smem_size = 2 * sizeof(double) * dim_wells;
    apply_well_contributions<<<num_wells, 32, smem_size, stream>>>(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_x, d_y, dim, dim_wells, d_val_pointers);
}

void WellContributions::addMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size)
{
    if (!allocated) {
        OPM_THROW(std::logic_error,"Error cannot add wellcontribution before allocating memory in WellContributions");
    }

    switch (type) {
    case MatrixType::C:
        cudaMemcpy(d_Cnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
        cudaMemcpy(d_Ccols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
        break;
    case MatrixType::D:
        cudaMemcpy(d_Dnnzs + num_wells_so_far * dim_wells * dim_wells, values, sizeof(double) * dim_wells * dim_wells, cudaMemcpyHostToDevice);
        break;
    case MatrixType::B:
        cudaMemcpy(d_Bnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
        cudaMemcpy(d_Bcols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
        val_pointers[num_wells_so_far] = num_blocks_so_far;
        if(num_wells_so_far == num_wells - 1){
            val_pointers[num_wells] = num_blocks;
        }
        cudaMemcpy(d_val_pointers, val_pointers, sizeof(int) * (num_wells+1), cudaMemcpyHostToDevice);
        break;
    default:
        OPM_THROW(std::logic_error,"Error unsupported matrix ID for WellContributions::addMatrix()");
    }
    cudaCheckLastError("WellContributions::addMatrix() failed");

    if (MatrixType::B == type) {
        num_blocks_so_far += val_size;
        num_wells_so_far++;
    }
}

void WellContributions::setCudaStream(cudaStream_t stream_)
{
    this->stream = stream_;
}

void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
{
    dim = dim_;
    dim_wells = dim_wells_;
}

void WellContributions::addNumBlocks(unsigned int nnz)
{
    if (allocated) {
        OPM_THROW(std::logic_error,"Error cannot add more sizes after allocated in WellContributions");
    }
    num_blocks += nnz;
    num_wells++;
}

} //namespace Opm
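
The kernel's operation, written as plain sequential loops for one well, may clarify the data layout it assumes (an illustrative reference, not part of this commit; it takes one well's blocks directly instead of indexing through val_pointers):

    #include <vector>

    // Sequential reference for one well: y -= C^T * (D^-1 * (B * x)).
    // B and C hold val_size blocks of dim_wells x dim values (row-major),
    // Dinv holds one dim_wells x dim_wells block, already inverted.
    void apply_well_reference(const double *Cnnzs, const double *Dinv, const double *Bnnzs,
                              const int *Ccols, const int *Bcols,
                              const double *x, double *y,
                              int dim, int dim_wells, int val_size)
    {
        std::vector<double> z1(dim_wells, 0.0), z2(dim_wells, 0.0);
        // z1 = B * x, summed over all blocks of this well
        for (int b = 0; b < val_size; ++b)
            for (int r = 0; r < dim_wells; ++r)
                for (int c = 0; c < dim; ++c)
                    z1[r] += Bnnzs[b * dim * dim_wells + r * dim + c] * x[Bcols[b] * dim + c];
        // z2 = D^-1 * z1
        for (int r = 0; r < dim_wells; ++r)
            for (int c = 0; c < dim_wells; ++c)
                z2[r] += Dinv[r * dim_wells + c] * z1[c];
        // y -= C^T * z2
        for (int b = 0; b < val_size; ++b)
            for (int c = 0; c < dim; ++c)
                for (int r = 0; r < dim_wells; ++r)
                    y[Ccols[b] * dim + c] -= Cnnzs[b * dim * dim_wells + r * dim + c] * z2[r];
    }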

View File

@ -0,0 +1,123 @@
/*
Copyright 2020 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef WELLCONTRIBUTIONS_HEADER_INCLUDED
#define WELLCONTRIBUTIONS_HEADER_INCLUDED
#include <config.h>
#if ! HAVE_CUDA
#error "This file should only be included if CUDA is found"
#endif
#include <vector>
#include <cuda_runtime.h>
namespace Opm
{
/// This class serves to eliminate the need to include the WellContributions into the matrix (with --matrix-add-well-contributions=true) for the cusparseSolver
/// If the --matrix-add-well-contributions commandline parameter is true, this class should not be used
/// A StandardWell uses C, D and B and performs y -= (C^T * (D^-1 * (B*x)))
/// B and C are vectors, disguised as matrices and contain blocks of StandardWell::numEq by StandardWell::numStaticWellEq
/// D is a block, disguised as matrix, the square block has size StandardWell::numStaticWellEq. D is actually stored as D^-1
/// B*x and D*B*x are a vector with numStaticWellEq doubles
/// C*D*B*x is a blocked matrix with a symmetric sparsity pattern, contains square blocks with size numEq. For every columnindex i, j in StandardWell::duneB_, there is a block on (i, j) in C*D*B*x.
///
/// This class is used in 3 phases:
/// - get total size of all wellcontributions that must be stored here
/// - allocate memory
/// - copy data of wellcontributions
class WellContributions
{
private:
    unsigned int num_blocks = 0;          // total number of blocks in all wells
    unsigned int dim;                     // number of columns of blocks in B and C, equal to StandardWell::numEq
    unsigned int dim_wells;               // number of rows of blocks in B and C, equal to StandardWell::numStaticWellEq
    unsigned int num_wells = 0;           // number of wellcontributions in this object
    unsigned int num_blocks_so_far = 0;   // keep track of where next data is written
    unsigned int num_wells_so_far = 0;    // keep track of where next data is written
    unsigned int *val_pointers = nullptr; // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
    bool allocated = false;

    double *d_Cnnzs = nullptr;
    double *d_Dnnzs = nullptr;
    double *d_Bnnzs = nullptr;
    int *d_Ccols = nullptr;
    int *d_Bcols = nullptr;
    double *d_z1 = nullptr;
    double *d_z2 = nullptr;
    unsigned int *d_val_pointers = nullptr;
    cudaStream_t stream;

public:
    /// StandardWell has C, D and B matrices that need to be copied
    enum class MatrixType {
        C,
        D,
        B
    };

    /// Set a cudaStream to be used
    /// \param[in] stream the cudaStream that is used to launch the kernel in
    void setCudaStream(cudaStream_t stream);

    /// Create a new WellContributions, implementation is empty
    WellContributions(){};

    /// Destroy a WellContributions, and free memory
    ~WellContributions();

    /// Apply all wellcontributions in this object
    /// performs y -= (C^T * (D^-1 * (B*x))) for StandardWell
    /// \param[in] x vector x
    /// \param[inout] y vector y
    void apply(double *x, double *y);

    /// Allocate memory for the wellcontributions
    void alloc();

    /// Indicate how large the blocks of the wellcontributions (C and B) are
    /// \param[in] dim number of columns
    /// \param[in] dim_wells number of rows
    void setBlockSize(unsigned int dim, unsigned int dim_wells);

    /// Indicate how large the next wellcontribution is, this function cannot be called after alloc() is called
    /// \param[in] numBlocks number of blocks in C and B of next wellcontribution
    void addNumBlocks(unsigned int numBlocks);

    /// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
    /// \param[in] type indicate if C, D or B is sent
    /// \param[in] colIndices columnindices of blocks in C or B, ignored for D
    /// \param[in] values array of nonzeroes
    /// \param[in] val_size number of blocks in C or B, ignored for D
    void addMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size);

    /// Return the number of wells added to this object
    /// \return the number of wells added to this object
    unsigned int getNumWells(){
        return num_wells;
    }
};

} //namespace Opm
#endif
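
The three-phase protocol the class documents (count sizes, allocate once, copy data) looks like this in use; this mirrors BlackoilWellModel::getWellContributions further down, but with hypothetical per-well accessors (wells, numBlocks(), Ccols(), Cvals(), ...) standing in for the real StandardWell plumbing:

    WellContributions wellContribs;
    wellContribs.setBlockSize(3, 4); // StandardWell::numEq, StandardWell::numStaticWellEq (example values)

    // phase 1: announce the number of C/B blocks of every well
    for (const auto& well : wells)
        wellContribs.addNumBlocks(well.numBlocks());

    // phase 2: allocate GPU memory once, for all wells together
    wellContribs.alloc();

    // phase 3: copy each well's data; B must be added last, it advances the internal write offsets
    for (const auto& well : wells) {
        wellContribs.addMatrix(WellContributions::MatrixType::C, well.Ccols(), well.Cvals(), well.numBlocks());
        wellContribs.addMatrix(WellContributions::MatrixType::D, nullptr, well.Dvals(), 1); // colIndices ignored for D
        wellContribs.addMatrix(WellContributions::MatrixType::B, well.Bcols(), well.Bvals(), well.numBlocks());
    }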

View File

@@ -38,6 +38,10 @@
 #include "cusparse_v2.h"
 // For more information about cusparse, check https://docs.nvidia.com/cuda/cusparse/index.html

+// iff true, the nonzeroes of the matrix are copied row-by-row into a contiguous, pinned memory array, then a single GPU memcpy is done
+// otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
+#define COPY_ROW_BY_ROW 0
+
 namespace Opm
 {
@@ -58,7 +62,7 @@ namespace Opm
         finalize();
     }

-    void cusparseSolverBackend::gpu_pbicgstab(BdaResult& res) {
+    void cusparseSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
         double t_total1, t_total2;
         int n = N;
         double rho = 1.0, rhop;
@@ -72,7 +76,11 @@ namespace Opm
         t_total1 = second();

-        cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_x, &zero, d_r);
+        if(wellContribs.getNumWells() > 0){
+            wellContribs.setCudaStream(stream);
+        }
+
+        cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
         cublasDscal(cublasHandle, n, &mone, d_r, 1);
         cublasDaxpy(cublasHandle, n, &one, d_b, 1, d_r, 1);
@@ -101,15 +109,20 @@ namespace Opm
             // apply ilu0
             cusparseDbsrsv2_solve(cusparseHandle, order, \
                 operation, Nb, nnzb, &one, \
-                descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_p, d_t, policy, d_buffer);
+                descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_p, d_t, policy, d_buffer);
             cusparseDbsrsv2_solve(cusparseHandle, order, \
                 operation, Nb, nnzb, &one, \
-                descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_pw, policy, d_buffer);
+                descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_pw, policy, d_buffer);

             // spmv
             cusparseDbsrmv(cusparseHandle, order, \
                 operation, Nb, Nb, nnzb, \
-                &one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_pw, &zero, d_v);
+                &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_pw, &zero, d_v);
+
+            // apply wellContributions
+            if(wellContribs.getNumWells() > 0){
+                wellContribs.apply(d_pw, d_v);
+            }

             cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
             alpha = rho / tmp1;
@@ -127,15 +140,20 @@ namespace Opm
             // apply ilu0
             cusparseDbsrsv2_solve(cusparseHandle, order, \
                 operation, Nb, nnzb, &one, \
-                descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_r, d_t, policy, d_buffer);
+                descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_r, d_t, policy, d_buffer);
             cusparseDbsrsv2_solve(cusparseHandle, order, \
                 operation, Nb, nnzb, &one, \
-                descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_s, policy, d_buffer);
+                descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_s, policy, d_buffer);

             // spmv
             cusparseDbsrmv(cusparseHandle, order, \
                 operation, Nb, Nb, nnzb, &one, descr_M, \
-                d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_s, &zero, d_t);
+                d_bVals, d_bRows, d_bCols, block_size, d_s, &zero, d_t);
+
+            // apply wellContributions
+            if(wellContribs.getNumWells() > 0){
+                wellContribs.apply(d_s, d_t);
+            }

             cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
             cublasDdot(cublasHandle, n, d_t, 1, d_t, 1, &tmp2);
@@ -178,8 +196,8 @@ namespace Opm
     void cusparseSolverBackend::initialize(int N, int nnz, int dim) {
         this->N = N;
         this->nnz = nnz;
-        this->BLOCK_SIZE = dim;
-        this->nnzb = nnz/BLOCK_SIZE/BLOCK_SIZE;
+        this->block_size = dim;
+        this->nnzb = nnz/block_size/block_size;
         Nb = (N + dim - 1) / dim;
         std::ostringstream out;
         out << "Initializing GPU, matrix size: " << N << " blocks, nnz: " << nnzb << " blocks";
@@ -229,41 +247,44 @@ namespace Opm
         cusparseSetStream(cusparseHandle, stream);
         cudaCheckLastError("Could not set stream to cusparse");

-        cudaMallocHost((void**)&x, sizeof(double) * N);
-        cudaCheckLastError("Could not allocate pinned host memory");
+#if COPY_ROW_BY_ROW
+        cudaMallocHost((void**)&vals_contiguous, sizeof(double) * nnz);
+        cudaCheckLastError("Could not allocate pinned memory");
+#endif

         initialized = true;
     } // end initialize()

     void cusparseSolverBackend::finalize() {
-        cudaFree(d_x);
-        cudaFree(d_b);
-        cudaFree(d_r);
-        cudaFree(d_rw);
-        cudaFree(d_p);
-        cudaFree(d_pw);
-        cudaFree(d_s);
-        cudaFree(d_t);
-        cudaFree(d_v);
-        cudaFree(d_mVals);
-        cudaFree(d_bVals);
-        cudaFree(d_bCols);
-        cudaFree(d_bRows);
-        cudaFree(d_buffer);
-        cusparseDestroyBsrilu02Info(info_M);
-        cusparseDestroyBsrsv2Info(info_L);
-        cusparseDestroyBsrsv2Info(info_U);
-        cusparseDestroyMatDescr(descr_B);
-        cusparseDestroyMatDescr(descr_M);
-        cusparseDestroyMatDescr(descr_L);
-        cusparseDestroyMatDescr(descr_U);
-        cusparseDestroy(cusparseHandle);
-        cublasDestroy(cublasHandle);
-        cudaHostUnregister(vals);
-        cudaHostUnregister(cols);
-        cudaHostUnregister(rows);
-        cudaStreamDestroy(stream);
-        cudaFreeHost(x);
+        if (initialized) {
+            cudaFree(d_x);
+            cudaFree(d_b);
+            cudaFree(d_r);
+            cudaFree(d_rw);
+            cudaFree(d_p);
+            cudaFree(d_pw);
+            cudaFree(d_s);
+            cudaFree(d_t);
+            cudaFree(d_v);
+            cudaFree(d_mVals);
+            cudaFree(d_bVals);
+            cudaFree(d_bCols);
+            cudaFree(d_bRows);
+            cudaFree(d_buffer);
+            cusparseDestroyBsrilu02Info(info_M);
+            cusparseDestroyBsrsv2Info(info_L);
+            cusparseDestroyBsrsv2Info(info_U);
+            cusparseDestroyMatDescr(descr_B);
+            cusparseDestroyMatDescr(descr_M);
+            cusparseDestroyMatDescr(descr_L);
+            cusparseDestroyMatDescr(descr_U);
+            cusparseDestroy(cusparseHandle);
+            cublasDestroy(cublasHandle);
+#if COPY_ROW_BY_ROW
+            cudaFreeHost(vals_contiguous);
+#endif
+            cudaStreamDestroy(stream);
+        }
     } // end finalize()
@@ -274,22 +295,23 @@ namespace Opm
             t1 = second();
         }

-        // information cudaHostRegister: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1ge8d5c17670f16ac4fc8fcb4181cb490c
-        // possible flags for cudaHostRegister: cudaHostRegisterDefault, cudaHostRegisterPortable, cudaHostRegisterMapped, cudaHostRegisterIoMemory
-        cudaHostRegister(vals, nnz * sizeof(double), cudaHostRegisterDefault);
-        cudaHostRegister(cols, nnz * sizeof(int), cudaHostRegisterDefault);
-        cudaHostRegister(rows, (Nb+1) * sizeof(int), cudaHostRegisterDefault);
-        cudaHostRegister(b, N * sizeof(double), cudaHostRegisterDefault);
+#if COPY_ROW_BY_ROW
+        int sum = 0;
+        for(int i = 0; i < Nb; ++i){
+            int size_row = rows[i+1] - rows[i];
+            memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
+            sum += size_row * block_size * block_size;
+        }
+        cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+#else
         cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+#endif
         cudaMemcpyAsync(d_bCols, cols, nnz * sizeof(int), cudaMemcpyHostToDevice, stream);
         cudaMemcpyAsync(d_bRows, rows, (Nb+1) * sizeof(int), cudaMemcpyHostToDevice, stream);
         cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
         cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
-        this->vals = vals;
-        this->cols = cols;
-        this->rows = rows;

         if (verbosity > 2) {
             cudaStreamSynchronize(stream);
             t2 = second();
@@ -301,14 +323,25 @@ namespace Opm
     // don't copy rowpointers and colindices, they stay the same
-    void cusparseSolverBackend::update_system_on_gpu(double *vals, double *b) {
+    void cusparseSolverBackend::update_system_on_gpu(double *vals, int *rows, double *b) {
         double t1, t2;
         if (verbosity > 2) {
             t1 = second();
         }

+#if COPY_ROW_BY_ROW
+        int sum = 0;
+        for(int i = 0; i < Nb; ++i){
+            int size_row = rows[i+1] - rows[i];
+            memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
+            sum += size_row * block_size * block_size;
+        }
+        cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+#else
         cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+#endif
         cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
         cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
@@ -364,11 +397,11 @@ namespace Opm
         cudaCheckLastError("Could not create analysis info");

         cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzb,
-            descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_M, &d_bufferSize_M);
+            descr_M, d_bVals, d_bRows, d_bCols, block_size, info_M, &d_bufferSize_M);
         cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
-            descr_L, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_L, &d_bufferSize_L);
+            descr_L, d_bVals, d_bRows, d_bCols, block_size, info_L, &d_bufferSize_L);
         cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
-            descr_U, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_U, &d_bufferSize_U);
+            descr_U, d_bVals, d_bRows, d_bCols, block_size, info_U, &d_bufferSize_U);
         cudaCheckLastError();

         d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
@@ -377,7 +410,7 @@ namespace Opm
         // analysis of ilu LU decomposition
         cusparseDbsrilu02_analysis(cusparseHandle, order, \
             Nb, nnzb, descr_B, d_bVals, d_bRows, d_bCols, \
-            BLOCK_SIZE, info_M, policy, d_buffer);
+            block_size, info_M, policy, d_buffer);

         int structural_zero;
         cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
@@ -388,11 +421,11 @@ namespace Opm
         // analysis of ilu apply
         cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
             Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, \
-            BLOCK_SIZE, info_L, policy, d_buffer);
+            block_size, info_L, policy, d_buffer);
         cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
             Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, \
-            BLOCK_SIZE, info_U, policy, d_buffer);
+            block_size, info_U, policy, d_buffer);
         cudaCheckLastError("Could not analyse level information");

         if (verbosity > 2) {
@@ -403,6 +436,8 @@ namespace Opm
             OpmLog::info(out.str());
         }

+        analysis_done = true;
+
         return true;
     } // end analyse_matrix()
@@ -417,7 +452,8 @@ namespace Opm
         d_mRows = d_bRows;
         cusparseDbsrilu02(cusparseHandle, order, \
             Nb, nnzb, descr_M, d_mVals, d_mRows, d_mCols, \
-            BLOCK_SIZE, info_M, policy, d_buffer);
+            block_size, info_M, policy, d_buffer);
+        cudaCheckLastError("Could not perform ilu decomposition");

         int structural_zero;
         // cusparseXbsrilu02_zeroPivot() calls cudaDeviceSynchronize()
@@ -437,9 +473,9 @@ namespace Opm
     } // end create_preconditioner()

-    void cusparseSolverBackend::solve_system(BdaResult &res) {
+    void cusparseSolverBackend::solve_system(WellContributions& wellContribs, BdaResult &res) {
         // actually solve
-        gpu_pbicgstab(res);
+        gpu_pbicgstab(wellContribs, res);
         cudaStreamSynchronize(stream);
         cudaCheckLastError("Something went wrong during the GPU solve");
     } // end solve_system()
@@ -449,10 +485,6 @@ namespace Opm
     // caller must be sure that x is a valid array
     void cusparseSolverBackend::post_process(double *x) {
-        if (!initialized) {
-            cudaHostRegister(x, N * sizeof(double), cudaHostRegisterDefault);
-        }
-
         double t1, t2;
         if (verbosity > 2) {
             t1 = second();
@@ -472,12 +504,12 @@ namespace Opm
     typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus;

-    cusparseSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, BdaResult &res) {
+    cusparseSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
         if (initialized == false) {
             initialize(N, nnz, dim);
             copy_system_to_gpu(vals, rows, cols, b);
         }else{
-            update_system_on_gpu(vals, b);
+            update_system_on_gpu(vals, rows, b);
         }
         if (analysis_done == false) {
             if (!analyse_matrix()) {
@@ -486,7 +518,7 @@ namespace Opm
         }
         reset_prec_on_gpu();
         if (create_preconditioner()) {
-            solve_system(res);
+            solve_system(wellContribs, res);
         }else{
             return cusparseSolverStatus::CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED;
         }
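
The COPY_ROW_BY_ROW path stages the nonzeros in pinned (page-locked) host memory because cudaMemcpyAsync only overlaps with other work on the stream when the host buffer is pinned; the extra CPU-side copy buys that overlap. A minimal standalone sketch of the pattern (illustration, not PR code):

    #include <cstring>
    #include <cuda_runtime.h>

    // Stage host data in a pinned buffer, then issue one async H2D transfer.
    // The transfer is only truly asynchronous because 'pinned' is page-locked,
    // e.g. allocated once with cudaMallocHost((void**)&pinned, n * sizeof(double)).
    void stage_and_upload(const double *vals, double *d_vals, size_t n,
                          double *pinned, cudaStream_t stream)
    {
        std::memcpy(pinned, vals, n * sizeof(double)); // CPU-side staging copy
        cudaMemcpyAsync(d_vals, pinned, n * sizeof(double), cudaMemcpyHostToDevice, stream);
    }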

View File

@@ -25,6 +25,7 @@
 #include "cusparse_v2.h"

 #include "opm/simulators/linalg/bda/BdaResult.hpp"
+#include <opm/simulators/linalg/bda/WellContributions.hpp>

 namespace Opm
 {
@@ -50,13 +51,11 @@ private:
     int *d_bRows, *d_mRows;
     double *d_x, *d_b, *d_r, *d_rw, *d_p;
     double *d_pw, *d_s, *d_t, *d_v;
-    double *vals;
-    int *cols, *rows;
-    double *x, *b;
     void *d_buffer;
     int N, Nb, nnz, nnzb;
+    double *vals_contiguous;

-    int BLOCK_SIZE;
+    int block_size;

     bool initialized = false;
     bool analysis_done = false;
@@ -70,8 +69,9 @@ private:
     int verbosity = 0;

     /// Solve linear system using ilu0-bicgstab
-    /// \param[inout] res summary of solver result
-    void gpu_pbicgstab(BdaResult& res);
+    /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
+    /// \param[inout] res summary of solver result
+    void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);

     /// Initialize GPU and allocate memory
     /// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks
@@ -83,16 +83,17 @@ private:
     void finalize();

     /// Copy linear system to GPU
-    /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
+    /// \param[in] vals array of nonzeroes, each block is stored row-wise, contains nnz values
     /// \param[in] rows array of rowPointers, contains N/dim+1 values
     /// \param[in] cols array of columnIndices, contains nnz values
     /// \param[in] b input vector, contains N values
     void copy_system_to_gpu(double *vals, int *rows, int *cols, double *b);

     // Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
-    /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
+    /// \param[in] vals array of nonzeroes, each block is stored row-wise, contains nnz values
+    /// \param[in] rows array of rowPointers, contains N/dim+1 values, only used if COPY_ROW_BY_ROW is true
     /// \param[in] b input vector, contains N values
-    void update_system_on_gpu(double *vals, double *b);
+    void update_system_on_gpu(double *vals, int *rows, double *b);

     /// Reset preconditioner on GPU, ilu0-decomposition is done inplace by cusparse
     void reset_prec_on_gpu();
@@ -106,8 +107,9 @@ private:
     bool create_preconditioner();

     /// Solve linear system
-    /// \param[inout] res summary of solver result
-    void solve_system(BdaResult &res);
+    /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
+    /// \param[inout] res summary of solver result
+    void solve_system(WellContributions& wellContribs, BdaResult &res);

 public:
@@ -128,16 +130,17 @@ public:
     ~cusparseSolverBackend();

     /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
     /// \param[in] N number of rows, divide by dim to get number of blockrows
     /// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
     /// \param[in] dim size of block
     /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
     /// \param[in] rows array of rowPointers, contains N/dim+1 values
     /// \param[in] cols array of columnIndices, contains nnz values
     /// \param[in] b input vector, contains N values
+    /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res summary of solver result
     /// \return status code
-    cusparseSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, BdaResult &res);
+    cusparseSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res);

     /// Post processing after linear solve, now only copies resulting x vector back
     /// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array

View File

@@ -221,6 +221,11 @@ namespace Opm {
         // subtract B*inv(D)*C * x from A*x
         void apply(const BVector& x, BVector& Ax) const;

+#if HAVE_CUDA
+        // accumulate the contributions of all Wells in the WellContributions object
+        void getWellContributions(WellContributions& x) const;
+#endif

         // apply well model with scaling of alpha
         void applyScaleAdd(const Scalar alpha, const BVector& x, BVector& Ax) const;

View File

@@ -918,9 +918,32 @@ namespace Opm {
         }
     }

+#if HAVE_CUDA
+    template<typename TypeTag>
+    void
+    BlackoilWellModel<TypeTag>::
+    getWellContributions(WellContributions& wellContribs) const
+    {
+        wellContribs.setBlockSize(StandardWell<TypeTag>::numEq, StandardWell<TypeTag>::numStaticWellEq);
+        for(unsigned int i = 0; i < well_container_.size(); i++){
+            auto& well = well_container_[i];
+            std::shared_ptr<StandardWell<TypeTag> > derived = std::dynamic_pointer_cast<StandardWell<TypeTag> >(well);
+            unsigned int numBlocks;
+            derived->getNumBlocks(numBlocks);
+            wellContribs.addNumBlocks(numBlocks);
+        }
+        wellContribs.alloc();
+        for(unsigned int i = 0; i < well_container_.size(); i++){
+            auto& well = well_container_[i];
+            std::shared_ptr<StandardWell<TypeTag> > derived = std::dynamic_pointer_cast<StandardWell<TypeTag> >(well);
+            if (derived) {
+                derived->addWellContribution(wellContribs);
+            } else {
+                OpmLog::warning("Warning only StandardWell is supported by WellContributions for GPU");
+            }
+        }
+    }
+#endif

     // Ax = Ax - alpha * C D^-1 B x
     template<typename TypeTag>

View File

@@ -184,6 +184,14 @@ namespace Opm
     /// r = r - C D^-1 Rw
     virtual void apply(BVector& r) const override;

+#if HAVE_CUDA
+    /// add the contribution (C, D^-1, B matrices) of this Well to the WellContributions object
+    void addWellContribution(WellContributions& wellContribs) const;
+
+    /// get the number of blocks of the C and B matrices, used to allocate memory in a WellContributions object
+    void getNumBlocks(unsigned int& _nnzs) const;
+#endif

     /// using the solution x to recover the solution xw for wells and applying
     /// xw to update Well State
     virtual void recoverWellSolutionAndUpdateWellState(const BVector& x,

View File

@@ -2735,8 +2735,65 @@ namespace Opm
         duneC_.mmtv(invDrw_, r);
     }

+#if HAVE_CUDA
+    template<typename TypeTag>
+    void
+    StandardWell<TypeTag>::
+    addWellContribution(WellContributions& wellContribs) const
+    {
+        std::vector<int> colIndices;
+        std::vector<double> nnzValues;
+        colIndices.reserve(duneB_.nonzeroes());
+        nnzValues.reserve(duneB_.nonzeroes()*numStaticWellEq * numEq);
+
+        // duneC
+        for ( auto colC = duneC_[0].begin(), endC = duneC_[0].end(); colC != endC; ++colC )
+        {
+            colIndices.emplace_back(colC.index());
+            for (int i = 0; i < numStaticWellEq; ++i) {
+                for (int j = 0; j < numEq; ++j) {
+                    nnzValues.emplace_back((*colC)[i][j]);
+                }
+            }
+        }
+        wellContribs.addMatrix(WellContributions::MatrixType::C, colIndices.data(), nnzValues.data(), duneC_.nonzeroes());
+
+        // invDuneD
+        colIndices.clear();
+        nnzValues.clear();
+        colIndices.emplace_back(0);
+        for (int i = 0; i < numStaticWellEq; ++i)
+        {
+            for (int j = 0; j < numStaticWellEq; ++j) {
+                nnzValues.emplace_back(invDuneD_[0][0][i][j]);
+            }
+        }
+        wellContribs.addMatrix(WellContributions::MatrixType::D, colIndices.data(), nnzValues.data(), 1);
+
+        // duneB
+        colIndices.clear();
+        nnzValues.clear();
+        for ( auto colB = duneB_[0].begin(), endB = duneB_[0].end(); colB != endB; ++colB )
+        {
+            colIndices.emplace_back(colB.index());
+            for (int i = 0; i < numStaticWellEq; ++i) {
+                for (int j = 0; j < numEq; ++j) {
+                    nnzValues.emplace_back((*colB)[i][j]);
+                }
+            }
+        }
+        wellContribs.addMatrix(WellContributions::MatrixType::B, colIndices.data(), nnzValues.data(), duneB_.nonzeroes());
+    }
+
+    template<typename TypeTag>
+    void
+    StandardWell<TypeTag>::
+    getNumBlocks(unsigned int& numBlocks) const
+    {
+        numBlocks = duneB_.nonzeroes();
+    }
+#endif

     template<typename TypeTag>