From 04ee2be3488ace54ab0c33cd4d26a1ce48b03d21 Mon Sep 17 00:00:00 2001
From: "T.D. (Tongdong) Qiu" <tqiu@ce-cuda01.ewi.tudelft.nl>
Date: Fri, 15 May 2020 16:00:09 +0200
Subject: [PATCH] cusparseSolver can now handle MultisegmentWells, they are
 actually applied on CPU via SuiteSparse/UMFPACK

---
 CMakeLists_files.cmake                        |   2 +
 .../bda/MultisegmentWellContribution.cpp      | 138 ++++++++++++++++++
 .../bda/MultisegmentWellContribution.hpp      | 114 +++++++++++++++
 .../linalg/bda/WellContributions.cu           |  86 +++++++----
 .../linalg/bda/WellContributions.hpp          |  55 +++++--
 opm/simulators/linalg/bda/cuda_header.hpp     |   1 +
 .../linalg/bda/cusparseSolverBackend.hpp      |   6 +-
 .../wells/BlackoilWellModel_impl.hpp          |  26 +++-
 opm/simulators/wells/MultisegmentWell.hpp     |   5 +
 .../wells/MultisegmentWell_impl.hpp           |  60 ++++++++
 10 files changed, 445 insertions(+), 48 deletions(-)
 create mode 100644 opm/simulators/linalg/bda/MultisegmentWellContribution.cpp
 create mode 100644 opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake
index fc3fb8f2a..95993509a 100644
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@@ -44,6 +44,7 @@ list (APPEND MAIN_SOURCE_FILES
 if(CUDA_FOUND)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cusparseSolverBackend.cu)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cu)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
 endif()
 if(MPI_FOUND)
@@ -144,6 +145,7 @@ list (APPEND PUBLIC_HEADER_FILES
   opm/simulators/linalg/bda/BdaResult.hpp
   opm/simulators/linalg/bda/cuda_header.hpp
   opm/simulators/linalg/bda/cusparseSolverBackend.hpp
+  opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
   opm/simulators/linalg/bda/WellContributions.hpp
   opm/simulators/linalg/BlackoilAmg.hpp
   opm/simulators/linalg/amgcpr.hh
diff --git a/opm/simulators/linalg/bda/MultisegmentWellContribution.cpp b/opm/simulators/linalg/bda/MultisegmentWellContribution.cpp
new file mode 100644
index 000000000..deacb033e
--- /dev/null
+++ b/opm/simulators/linalg/bda/MultisegmentWellContribution.cpp
@@ -0,0 +1,138 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#include <cstdlib>
+#include <cstring>
+#include <config.h> // CMake
+#include <fstream>
+
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+
+#if HAVE_UMFPACK
+#include <dune/istl/umfpack.hh>
+#include <opm/simulators/linalg/bda/cuda_header.hpp>
+#endif // HAVE_UMFPACK
+
+#include <opm/simulators/linalg/bda/MultisegmentWellContribution.hpp>
+
+namespace Opm
+{
+
+        MultisegmentWellContribution::MultisegmentWellContribution(unsigned int dim_, unsigned int dim_wells_,
+            unsigned int Nb_, unsigned int Mb_,
+            unsigned int BnumBlocks_, double *Bvalues, unsigned int *BcolIndices, unsigned int *BrowPointers,
+            unsigned int DnumBlocks_, double *Dvalues, int *DcolPointers, int *DrowIndices,
+            double *Cvalues)
+            : dim(dim_), dim_wells(dim_wells_), N(Nb_*dim), Nb(Nb_), M(Mb_*dim_wells), Mb(Mb_), DnumBlocks(DnumBlocks_), BnumBlocks(BnumBlocks_)
+        {
+            Cvals.insert(Cvals.end(), Cvalues, Cvalues + BnumBlocks*dim*dim_wells);
+            Dvals.insert(Dvals.end(), Dvalues, Dvalues + DnumBlocks*dim_wells*dim_wells);
+            Bvals.insert(Bvals.end(), Bvalues, Bvalues + BnumBlocks*dim*dim_wells);
+            Ccols.insert(Ccols.end(), BcolIndices, BcolIndices + BnumBlocks);
+            Bcols.insert(Bcols.end(), BcolIndices, BcolIndices + BnumBlocks);
+            Crows.insert(Crows.end(), BrowPointers, BrowPointers + M + 1);
+            Brows.insert(Brows.end(), BrowPointers, BrowPointers + M + 1);
+
+            Dcols.insert(Dcols.end(), DcolPointers, DcolPointers + M + 1);
+            Drows.insert(Drows.end(), DrowIndices, DrowIndices + DnumBlocks*dim_wells*dim_wells);
+
+            z1.resize(Mb * dim_wells);
+            z2.resize(Mb * dim_wells);
+
+            // allocate pinned memory on host
+            cudaMallocHost(&h_x, sizeof(double) * N);
+            cudaMallocHost(&h_y, sizeof(double) * N);
+
+            umfpack_di_symbolic(M, M, Dcols.data(), Drows.data(), Dvals.data(), &UMFPACK_Symbolic, nullptr, nullptr);
+            umfpack_di_numeric(Dcols.data(), Drows.data(), Dvals.data(), UMFPACK_Symbolic, &UMFPACK_Numeric, nullptr, nullptr);
+        }
+
+        MultisegmentWellContribution::~MultisegmentWellContribution()
+        {
+            cudaFreeHost(h_x);
+            cudaFreeHost(h_y);
+
+            umfpack_di_free_symbolic(&UMFPACK_Symbolic);
+            umfpack_di_free_numeric(&UMFPACK_Numeric);
+        }
+
+
+        // Apply the MultisegmentWellContribution, similar to MultisegmentWell::apply()
+        // y -= (C^T * (D^-1 * (B * x)))
+        void MultisegmentWellContribution::apply(double *d_x, double *d_y)
+        {
+            // copy vectors x and y from GPU to CPU
+            cudaMemcpyAsync(h_x, d_x, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
+            cudaMemcpyAsync(h_y, d_y, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
+            cudaStreamSynchronize(stream);
+
+            // reset z1 and z2
+            std::fill(z1.begin(), z1.end(), 0.0);
+            std::fill(z2.begin(), z2.end(), 0.0);
+
+            // z1 = B * x
+            for (unsigned int row = 0; row < Mb; ++row) {
+                // for every block in the row
+                for (unsigned int blockID = Brows[row]; blockID < Brows[row+1]; ++blockID) {
+                    unsigned int colIdx = Bcols[blockID];
+                    for (unsigned int j = 0; j < dim_wells; ++j) {
+                        double temp = 0.0;
+                        for (unsigned int k = 0; k < dim; ++k) {
+                            temp += Bvals[blockID * dim * dim_wells + j * dim + k] * h_x[colIdx * dim + k];
+                        }
+                        z1[row * dim_wells + j] += temp;
+                    }
+                }
+            }
+
+            // z2 = D^-1 * (B * x)
+            // umfpack
+            umfpack_di_solve(UMFPACK_A, Dcols.data(), Drows.data(), Dvals.data(), z2.data(), z1.data(), UMFPACK_Numeric, nullptr, nullptr);
+
+            // y -= (C^T * z2)
+            // y -= (C^T * (D^-1 * (B * x)))
+            for (unsigned int row = 0; row < Mb; ++row) {
+                // for every block in the row
+                for (unsigned int blockID = Brows[row]; blockID < Brows[row+1]; ++blockID) {
+                    unsigned int colIdx = Bcols[blockID];
+                    for (unsigned int j = 0; j < dim; ++j) {
+                        double temp = 0.0;
+                        for (unsigned int k = 0; k < dim_wells; ++k) {
+                            temp += Cvals[blockID * dim * dim_wells + j + k * dim] * z2[row * dim_wells + k];
+                        }
+                        h_y[colIdx * dim + j] -= temp;
+                    }
+                }
+            }
+
+            // copy vector y from CPU to GPU
+            cudaMemcpyAsync(d_y, h_y, sizeof(double) * N, cudaMemcpyHostToDevice, stream);
+            cudaStreamSynchronize(stream);
+        }
+
+        void MultisegmentWellContribution::setCudaStream(cudaStream_t stream_)
+        {
+            stream = stream_;
+        }
+
+
+} //namespace Opm
+
diff --git a/opm/simulators/linalg/bda/MultisegmentWellContribution.hpp b/opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
new file mode 100644
index 000000000..0a040d6aa
--- /dev/null
+++ b/opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
@@ -0,0 +1,114 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef MULTISEGMENTWELLCONTRIBUTION_HEADER_INCLUDED
+#define MULTISEGMENTWELLCONTRIBUTION_HEADER_INCLUDED
+
+#include <config.h>
+
+#include <vector>
+
+#include <cuda_runtime.h>
+
+namespace Opm
+{
+
+    /// This class serves to duplicate the functionality of the MultisegmentWell
+    /// A MultisegmentWell uses C, D and B and performs y -= (C^T * (D^-1 * (B*x)))
+    /// B and C are matrices, with M rows and N columns, where N is the size of the matrix. They contain blocks of MultisegmentWell::numEq by MultisegmentWell::numWellEq.
+    /// D is a MxM matrix, the square blocks have size MultisegmentWell::numWellEq. 
+    /// B*x and D*B*x are a vector with M*numWellEq doubles
+    /// C*D*B*x is a vector with N*numEq doubles.
+    ///
+    /// This class is used in 3 phases:
+    /// - get total size of all wellcontributions that must be stored here
+    /// - allocate memory
+    /// - copy data of wellcontributions
+    class MultisegmentWellContribution
+    {
+
+    private:
+        unsigned int dim;                        // number of columns of blocks in B and C, equal to MultisegmentWell::numEq
+        unsigned int dim_wells;                  // number of rows of blocks in B and C, equal to MultisegmentWell::numWellEq
+        unsigned int N;                          // number of rows in vectors x and y, N == dim*Nb
+        unsigned int Nb;                         // number of blockrows in x and y
+        unsigned int M;                          // number of rows, M == dim_wells*Mb
+        unsigned int Mb;                         // number of blockrows in C, D and B
+
+        cudaStream_t stream;
+        double *h_x = nullptr, *h_y = nullptr;  // CUDA pinned memory for GPU memcpy
+
+        // C and B are stored in BCRS format, D is stored in CSC format (Dune::UMFPack).
+        unsigned int DnumBlocks;
+        unsigned int BnumBlocks;
+        std::vector<double> Cvals;
+        std::vector<double> Dvals;
+        std::vector<double> Bvals;
+        std::vector<unsigned int> Ccols;
+        std::vector<int> Dcols;              // Columnpointers, contains M+1 entries
+        std::vector<unsigned int> Bcols;
+        std::vector<unsigned int> Crows;
+        std::vector<int> Drows;              // Rowindicies, contains DnumBlocks*dim*dim_wells entries
+        std::vector<unsigned int> Brows;
+        std::vector<double> z1;          // z1 = B * x
+        std::vector<double> z2;          // z2 = D^-1 * B * x
+        void *UMFPACK_Symbolic, *UMFPACK_Numeric;
+
+
+    public:
+
+        /// Set a cudaStream to be used
+        /// \param[in] stream           the cudaStream that is used
+        void setCudaStream(cudaStream_t stream);
+
+        /// Create a new MultisegmentWellContribution
+        /// Matrices C and B are passed in Blocked CSR, matrix D in CSC
+        /// \param[in] dim              size of blocks in vectors x and y, equal to MultisegmentWell::numEq
+        /// \param[in] dim_wells        size of blocks of C, B and D, equal to MultisegmentWell::numWellEq
+        /// \param[in] Nb               number of blocks in vectors x and y
+        /// \param[in] Mb               number of blockrows in C, B and D
+        /// \param[in] BnumBlocks       number of blocks in C and B
+        /// \param[in] Bvalues          nonzero values of matrix B
+        /// \param[in] BcolIndices      columnindices of blocks of matrix B
+        /// \param[in] BrowPointers     rowpointers of matrix B
+        /// \param[in] DnumBlocks       number of blocks in D
+        /// \param[in] Dvalues          nonzero values of matrix D
+        /// \param[in] DcolPointers     columnpointers of matrix D
+        /// \param[in] DrowIndices      rowindices of matrix D
+        /// \param[in] Cvalues          nonzero values of matrix C
+        MultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
+            unsigned int Nb, unsigned int Mb,
+            unsigned int BnumBlocks, double *Bvalues, unsigned int *BcolIndices, unsigned int *BrowPointers,
+            unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
+            double *Cvalues);
+
+        /// Destroy a MultisegmentWellContribution, and free memory
+        ~MultisegmentWellContribution();
+
+        /// Apply the MultisegmentWellContribution on GPU
+        /// performs y -= (C^T * (D^-1 * (B*x))) for MultisegmentWell
+        /// \param[in] d_x          vector x, must be on GPU
+        /// \param[inout] d_y       vector y, must be on GPU
+        void apply(double *d_x, double *d_y);
+
+    };
+
+} //namespace Opm
+
+#endif
diff --git a/opm/simulators/linalg/bda/WellContributions.cu b/opm/simulators/linalg/bda/WellContributions.cu
index ac79bb21c..d3ee09a05 100644
--- a/opm/simulators/linalg/bda/WellContributions.cu
+++ b/opm/simulators/linalg/bda/WellContributions.cu
@@ -128,27 +128,39 @@ namespace Opm
     }
 
 
-    void WellContributions::alloc(){
-        cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
-        cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_wells * dim_wells * dim_wells);
-        cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
-        cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
-        cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
-        val_pointers = new unsigned int[num_wells + 1];
-        cudaMalloc((void**)&d_val_pointers, sizeof(int) * (num_wells + 1));
-        cudaCheckLastError("apply_gpu malloc failed");
-        allocated = true;
+    void WellContributions::alloc()
+    {
+        if (num_std_wells > 0) {
+            cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
+            cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells);
+            cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
+            cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
+            cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
+            val_pointers = new unsigned int[num_std_wells + 1];
+            cudaMalloc((void**)&d_val_pointers, sizeof(int) * (num_std_wells + 1));
+            cudaCheckLastError("apply_gpu malloc failed");
+            allocated = true;
+        }
     }
 
     WellContributions::~WellContributions()
     {
-        cudaFree(d_Cnnzs);
-        cudaFree(d_Dnnzs);
-        cudaFree(d_Bnnzs);
-        cudaFree(d_Ccols);
-        cudaFree(d_Bcols);
-        delete[] val_pointers;
-        cudaFree(d_val_pointers);
+        // delete MultisegmentWellContributions
+        for (auto ms : multisegments) {
+            delete ms;
+        }
+        multisegments.clear();
+
+        // delete data for StandardWell
+        if (num_std_wells > 0) {
+            cudaFree(d_Cnnzs);
+            cudaFree(d_Dnnzs);
+            cudaFree(d_Bnnzs);
+            cudaFree(d_Ccols);
+            cudaFree(d_Bcols);
+            delete[] val_pointers;
+            cudaFree(d_val_pointers);
+        }
     }
 
 
@@ -156,8 +168,16 @@ namespace Opm
     // y -= (C^T *(D^-1*(   B*x)))
     void WellContributions::apply(double *d_x, double *d_y)
     {
-        int smem_size = 2 * sizeof(double) * dim_wells;
-        apply_well_contributions<<<num_wells, 32, smem_size, stream>>>(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_x, d_y, dim, dim_wells, d_val_pointers);
+        // apply MultisegmentWells
+        for(MultisegmentWellContribution *well : multisegments){
+            well->apply(d_x, d_y);
+        }
+
+        // apply StandardWells
+        if (num_std_wells > 0) {
+            int smem_size = 2 * sizeof(double) * dim_wells;
+            apply_well_contributions<<<num_std_wells, 32, smem_size, stream>>>(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_x, d_y, dim, dim_wells, d_val_pointers);
+        }
     }
 
 
@@ -173,16 +193,16 @@ namespace Opm
             cudaMemcpy(d_Ccols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
             break;
         case MatrixType::D:
-            cudaMemcpy(d_Dnnzs + num_wells_so_far * dim_wells * dim_wells, values, sizeof(double) * dim_wells * dim_wells, cudaMemcpyHostToDevice);
+            cudaMemcpy(d_Dnnzs + num_std_wells_so_far * dim_wells * dim_wells, values, sizeof(double) * dim_wells * dim_wells, cudaMemcpyHostToDevice);
             break;
         case MatrixType::B:
             cudaMemcpy(d_Bnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
             cudaMemcpy(d_Bcols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
-            val_pointers[num_wells_so_far] = num_blocks_so_far;
-            if(num_wells_so_far == num_wells - 1){
-                val_pointers[num_wells] = num_blocks;
+            val_pointers[num_std_wells_so_far] = num_blocks_so_far;
+            if(num_std_wells_so_far == num_std_wells - 1){
+                val_pointers[num_std_wells] = num_blocks;
             }
-            cudaMemcpy(d_val_pointers, val_pointers, sizeof(int) * (num_wells+1), cudaMemcpyHostToDevice);
+            cudaMemcpy(d_val_pointers, val_pointers, sizeof(int) * (num_std_wells+1), cudaMemcpyHostToDevice);
             break;
         default:
             OPM_THROW(std::logic_error,"Error unsupported matrix ID for WellContributions::addMatrix()");
@@ -190,13 +210,16 @@ namespace Opm
         cudaCheckLastError("WellContributions::addMatrix() failed");
         if (MatrixType::B == type) {
             num_blocks_so_far += val_size;
-            num_wells_so_far++;
+            num_std_wells_so_far++;
         }
     }
 
     void WellContributions::setCudaStream(cudaStream_t stream_)
     {
         this->stream = stream_;
+        for(MultisegmentWellContribution *well : multisegments){
+            well->setCudaStream(stream_);
+        }
     }
 
     void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
@@ -211,7 +234,18 @@ namespace Opm
             OPM_THROW(std::logic_error,"Error cannot add more sizes after allocated in WellContributions");
         }
         num_blocks += nnz;
-        num_wells++;
+        num_std_wells++;
+    }
+
+    void WellContributions::addMultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
+        unsigned int Nb, unsigned int Mb,
+        unsigned int BnumBlocks, double *Bvalues, unsigned int *BcolIndices, unsigned int *BrowPointers,
+        unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
+        double *Cvalues)
+    {
+        MultisegmentWellContribution *well = new MultisegmentWellContribution(dim, dim_wells, Nb, Mb, BnumBlocks, Bvalues, BcolIndices, BrowPointers, DnumBlocks, Dvalues, DcolPointers, DrowIndices, Cvalues);
+        multisegments.emplace_back(well);
+        ++num_ms_wells;
     }
 
 } //namespace Opm
diff --git a/opm/simulators/linalg/bda/WellContributions.hpp b/opm/simulators/linalg/bda/WellContributions.hpp
index 74f7efc09..1ab270ca8 100644
--- a/opm/simulators/linalg/bda/WellContributions.hpp
+++ b/opm/simulators/linalg/bda/WellContributions.hpp
@@ -28,6 +28,8 @@
 
 #include <vector>
 
+#include <opm/simulators/linalg/bda/MultisegmentWellContribution.hpp>
+
 #include <cuda_runtime.h>
 
 namespace Opm
@@ -35,6 +37,10 @@ namespace Opm
 
     /// This class serves to eliminate the need to include the WellContributions into the matrix (with --matrix-add-well-contributions=true) for the cusparseSolver
     /// If the --matrix-add-well-contributions commandline parameter is true, this class should not be used
+    /// So far, StandardWell and MultisegmentWell are supported
+    /// A single instance (or pointer) of this class is passed to the cusparseSolver.
+    /// For StandardWell, this class contains all the data and handles the computation. For MultisegmentWell, the vector 'multisegments' contains all the data. For more information, check the MultisegmentWellContribution class.
+
     /// A StandardWell uses C, D and B and performs y -= (C^T * (D^-1 * (B*x)))
     /// B and C are vectors, disguised as matrices and contain blocks of StandardWell::numEq by StandardWell::numStaticWellEq
     /// D is a block, disguised as matrix, the square block has size StandardWell::numStaticWellEq. D is actually stored as D^-1
@@ -50,14 +56,18 @@ namespace Opm
 
     private:
         unsigned int num_blocks = 0;             // total number of blocks in all wells
-        unsigned int dim;                        // number of columns of blocks in B and C, equal to StandardWell::numEq
-        unsigned int dim_wells;                  // number of rows of blocks in B and C, equal to StandardWell::numStaticWellEq
-        unsigned int num_wells = 0;              // number of wellcontributions in this object
+        unsigned int dim;                        // number of columns in blocks in B and C, equal to StandardWell::numEq
+        unsigned int dim_wells;                  // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq
+        unsigned int num_std_wells = 0;          // number of StandardWells in this object
+        unsigned int num_ms_wells = 0;           // number of MultisegmentWells in this object, must equal multisegments.size()
         unsigned int num_blocks_so_far = 0;      // keep track of where next data is written
-        unsigned int num_wells_so_far = 0;       // keep track of where next data is written
+        unsigned int num_std_wells_so_far = 0;   // keep track of where next data is written
         unsigned int *val_pointers = nullptr;    // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
         bool allocated = false;
+        std::vector<MultisegmentWellContribution*> multisegments;
+        cudaStream_t stream;
 
+        // data for StandardWells, could remain nullptrs if not used
         double *d_Cnnzs = nullptr;
         double *d_Dnnzs = nullptr;
         double *d_Bnnzs = nullptr;
@@ -66,7 +76,7 @@ namespace Opm
         double *d_z1 = nullptr;
         double *d_z2 = nullptr;
         unsigned int *d_val_pointers = nullptr;
-        cudaStream_t stream;
+
     public:
 
         /// StandardWell has C, D and B matrices that need to be copied
@@ -86,22 +96,22 @@ namespace Opm
         /// Destroy a WellContributions, and free memory
         ~WellContributions();
 
-        /// Apply all wellcontributions in this object
-        /// performs y -= (C^T * (D^-1 * (B*x))) for StandardWell
+        /// Apply all Wells in this object
+        /// performs y -= (C^T * (D^-1 * (B*x))) for all Wells
         /// \param[in] x          vector x
         /// \param[inout] y       vector y
         void apply(double *x, double *y);
 
-        /// Allocate memory for the wellcontributions
+        /// Allocate memory for the StandardWells
         void alloc();
 
-        /// Indicate how large the blocks of the wellcontributions (C and B) are
+        /// Indicate how large the blocks of the StandardWell (C and B) are
         /// \param[in] dim         number of columns
         /// \param[in] dim_wells   number of rows
         void setBlockSize(unsigned int dim, unsigned int dim_wells);
 
-        /// Indicate how large the next wellcontribution is, this function cannot be called after alloc() is called
-        /// \param[in] numBlocks   number of blocks in C and B of next wellcontribution
+        /// Indicate how large the next StandardWell is, this function cannot be called after alloc() is called
+        /// \param[in] numBlocks   number of blocks in C and B of next StandardWell
         void addNumBlocks(unsigned int numBlocks);
 
         /// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
@@ -111,10 +121,31 @@ namespace Opm
         /// \param[in] val_size    number of blocks in C or B, ignored for D
         void addMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size);
 
+        /// Add a MultisegmentWellContribution, actually creates an object on heap that is destroyed in the destructor
+        /// Matrices C and B are passed in Blocked CSR, matrix D in CSC
+        /// \param[in] dim              size of blocks in vectors x and y, equal to MultisegmentWell::numEq
+        /// \param[in] dim_wells        size of blocks of C, B and D, equal to MultisegmentWell::numWellEq
+        /// \param[in] Nb               number of blocks in vectors x and y
+        /// \param[in] Mb               number of blockrows in C, B and D
+        /// \param[in] BnumBlocks       number of blocks in C and B
+        /// \param[in] Bvalues          nonzero values of matrix B
+        /// \param[in] BcolIndices      columnindices of blocks of matrix B
+        /// \param[in] BrowPointers     rowpointers of matrix B
+        /// \param[in] DnumBlocks       number of blocks in D
+        /// \param[in] Dvalues          nonzero values of matrix D
+        /// \param[in] DcolPointers     columnpointers of matrix D
+        /// \param[in] DrowIndices      rowindices of matrix D
+        /// \param[in] Cvalues          nonzero values of matrix C
+        void addMultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
+            unsigned int Nb, unsigned int Mb,
+            unsigned int BnumBlocks, double *Bvalues, unsigned int *BcolIndices, unsigned int *BrowPointers,
+            unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
+            double *Cvalues);
+
         /// Return the number of wells added to this object
         /// \return the number of wells added to this object
         unsigned int getNumWells(){
-            return num_wells;
+            return num_std_wells + num_ms_wells;
         }
     };
 
diff --git a/opm/simulators/linalg/bda/cuda_header.hpp b/opm/simulators/linalg/bda/cuda_header.hpp
index 7d1f722d4..4ca6e3b77 100644
--- a/opm/simulators/linalg/bda/cuda_header.hpp
+++ b/opm/simulators/linalg/bda/cuda_header.hpp
@@ -20,6 +20,7 @@
 #ifndef CUDA_HEADER_HEADER_INCLUDED
 #define CUDA_HEADER_HEADER_INCLUDED
 
+#include <cuda_runtime.h>
 #include <iostream>
 
 /// Runtime error checking of CUDA functions
diff --git a/opm/simulators/linalg/bda/cusparseSolverBackend.hpp b/opm/simulators/linalg/bda/cusparseSolverBackend.hpp
index b8648ef15..601efa811 100644
--- a/opm/simulators/linalg/bda/cusparseSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/cusparseSolverBackend.hpp
@@ -74,9 +74,9 @@ private:
     void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
 
     /// Initialize GPU and allocate memory
-    /// \param[in] N         number of nonzeroes, divide by dim*dim to get number of blocks
-    /// \param[in] nnz         number of nonzeroes, divide by dim*dim to get number of blocks
-    /// \param[in] dim         size of block
+    /// \param[in] N                number of nonzeroes, divide by dim*dim to get number of blocks
+    /// \param[in] nnz              number of nonzeroes, divide by dim*dim to get number of blocks
+    /// \param[in] dim              size of block
     void initialize(int N, int nnz, int dim);
 
     /// Clean memory
diff --git a/opm/simulators/wells/BlackoilWellModel_impl.hpp b/opm/simulators/wells/BlackoilWellModel_impl.hpp
index 5dc7d4043..a78de1b52 100644
--- a/opm/simulators/wells/BlackoilWellModel_impl.hpp
+++ b/opm/simulators/wells/BlackoilWellModel_impl.hpp
@@ -907,22 +907,34 @@ namespace Opm {
     BlackoilWellModel<TypeTag>::
     getWellContributions(WellContributions& wellContribs) const
     {
+        // prepare for StandardWells
         wellContribs.setBlockSize(StandardWell<TypeTag>::numEq, StandardWell<TypeTag>::numStaticWellEq);
         for(unsigned int i = 0; i < well_container_.size(); i++){
             auto& well = well_container_[i];
             std::shared_ptr<StandardWell<TypeTag> > derived = std::dynamic_pointer_cast<StandardWell<TypeTag> >(well);
-            unsigned int numBlocks;
-            derived->getNumBlocks(numBlocks);
-            wellContribs.addNumBlocks(numBlocks);
+            if (derived) {
+                unsigned int numBlocks;
+                derived->getNumBlocks(numBlocks);
+                wellContribs.addNumBlocks(numBlocks);
+            }
         }
+
+        // allocate memory for data from StandardWells
         wellContribs.alloc();
+
         for(unsigned int i = 0; i < well_container_.size(); i++){
             auto& well = well_container_[i];
-            std::shared_ptr<StandardWell<TypeTag> > derived = std::dynamic_pointer_cast<StandardWell<TypeTag> >(well);
-            if (derived) {
-                derived->addWellContribution(wellContribs);
+            // maybe WellInterface could implement addWellContribution()
+            auto derived_std = std::dynamic_pointer_cast<StandardWell<TypeTag> >(well);
+            if (derived_std) {
+                derived_std->addWellContribution(wellContribs);
             } else {
-                OpmLog::warning("Warning only StandardWell is supported by WellContributions for GPU");
+                auto derived_ms = std::dynamic_pointer_cast<MultisegmentWell<TypeTag> >(well);
+                if (derived_ms) {
+                    derived_ms->addWellContribution(wellContribs);
+                } else {
+                    OpmLog::warning("Warning unknown type of well");
+                }
             }
         }
     }
diff --git a/opm/simulators/wells/MultisegmentWell.hpp b/opm/simulators/wells/MultisegmentWell.hpp
index 8644ae505..9cc9b12f9 100644
--- a/opm/simulators/wells/MultisegmentWell.hpp
+++ b/opm/simulators/wells/MultisegmentWell.hpp
@@ -135,6 +135,11 @@ namespace Opm
         /// r = r - C D^-1 Rw
         virtual void apply(BVector& r) const override;
 
+#if HAVE_CUDA
+        /// add the contribution (C, D, B matrices) of this Well to the WellContributions object
+        void addWellContribution(WellContributions& wellContribs) const;
+#endif
+
         /// using the solution x to recover the solution xw for wells and applying
         /// xw to update Well State
         virtual void recoverWellSolutionAndUpdateWellState(const BVector& x,
diff --git a/opm/simulators/wells/MultisegmentWell_impl.hpp b/opm/simulators/wells/MultisegmentWell_impl.hpp
index 5062641cd..b037e4cee 100644
--- a/opm/simulators/wells/MultisegmentWell_impl.hpp
+++ b/opm/simulators/wells/MultisegmentWell_impl.hpp
@@ -639,6 +639,66 @@ namespace Opm
 
 
 
+#if HAVE_CUDA
+    template<typename TypeTag>
+    void
+    MultisegmentWell<TypeTag>::
+    addWellContribution(WellContributions& wellContribs) const
+    {
+        unsigned int Nb = duneB_.M();       // number of blockrows in matrix A
+        unsigned int Mb = duneB_.N();       // number of blockrows in duneB_, duneC_ and duneD_
+        unsigned int BnumBlocks = duneB_.nonzeroes();
+        unsigned int DnumBlocks = duneD_.nonzeroes();
+
+        // duneC
+        std::vector<unsigned int> Ccols;
+        std::vector<double> Cvals;
+        Ccols.reserve(BnumBlocks);
+        Cvals.reserve(BnumBlocks * numEq * numWellEq);
+        for (auto rowC = duneC_.begin(); rowC != duneC_.end(); ++rowC) {
+            for (auto colC = rowC->begin(), endC = rowC->end(); colC != endC; ++colC) {
+                Ccols.emplace_back(colC.index());
+                for (int i = 0; i < numWellEq; ++i) {
+                    for (int j = 0; j < numEq; ++j) {
+                        Cvals.emplace_back((*colC)[i][j]);
+                    }
+                }
+            }
+        }
+
+        // duneD
+        Dune::UMFPack<DiagMatWell> umfpackMatrix(duneD_, 0);
+        double *Dvals = umfpackMatrix.getInternalMatrix().getValues();
+        int *Dcols = umfpackMatrix.getInternalMatrix().getColStart();
+        int *Drows = umfpackMatrix.getInternalMatrix().getRowIndex();
+
+        // duneB
+        std::vector<unsigned int> Bcols;
+        std::vector<unsigned int> Brows;
+        std::vector<double> Bvals;
+        Bcols.reserve(BnumBlocks);
+        Brows.reserve(Mb+1);
+        Bvals.reserve(BnumBlocks * numEq * numWellEq);
+        Brows.emplace_back(0);
+        unsigned int sumBlocks = 0;
+        for (auto rowB = duneB_.begin(); rowB != duneB_.end(); ++rowB) {
+            int sizeRow = 0;
+            for (auto colB = rowB->begin(), endB = rowB->end(); colB != endB; ++colB) {
+                Bcols.emplace_back(colB.index());
+                for (int i = 0; i < numWellEq; ++i) {
+                    for (int j = 0; j < numEq; ++j) {
+                        Bvals.emplace_back((*colB)[i][j]);
+                    }
+                }
+                sizeRow++;
+            }
+            sumBlocks += sizeRow;
+            Brows.emplace_back(sumBlocks);
+        }
+
+        wellContribs.addMultisegmentWellContribution(numEq, numWellEq, Nb, Mb, BnumBlocks, Bvals.data(), Bcols.data(), Brows.data(), DnumBlocks, Dvals, Dcols, Drows, Cvals.data());
+    }
+#endif
 
 
     template <typename TypeTag>