diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake
index 565d82ae0..100b4fc21 100644
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@@ -60,6 +60,7 @@ if(OPENCL_FOUND)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
 endif()
 if(MPI_FOUND)
diff --git a/opm/simulators/linalg/bda/WellContributions.cpp b/opm/simulators/linalg/bda/WellContributions.cpp
index b8ec7941e..342d06352 100644
--- a/opm/simulators/linalg/bda/WellContributions.cpp
+++ b/opm/simulators/linalg/bda/WellContributions.cpp
@@ -40,33 +40,6 @@ WellContributions::WellContributions(std::string gpu_mode){
     }
 }
 
-void WellContributions::alloc()
-{
-    if (num_std_wells > 0) {
-#if HAVE_CUDA
-        if(cuda_gpu){
-            allocStandardWells();
-        }
-#endif
-
-#if HAVE_OPENCL
-        if(opencl_gpu){
-            h_Cnnzs_ocl = new double[num_blocks * dim * dim_wells];
-            h_Dnnzs_ocl = new double[num_std_wells * dim_wells * dim_wells];
-            h_Bnnzs_ocl = new double[num_blocks * dim * dim_wells];
-            h_Ccols_ocl = new int[num_blocks];
-            h_Bcols_ocl = new int[num_blocks];
-            val_pointers = new unsigned int[num_std_wells + 1];
-
-            allocated = true;
-        }
-#endif
-
-#if !HAVE_CUDA && !HAVE_OPENCL
-        OPM_THROW(std::logic_error, "Error cannot allocate on GPU because neither CUDA nor OpenCL were found by cmake");
-#endif
-    }
-}
 
 WellContributions::~WellContributions()
 {
@@ -81,24 +54,6 @@ WellContributions::~WellContributions()
         freeCudaMemory(); // should come before 'delete[] h_x'
     }
 #endif
-
-#if HAVE_OPENCL
-    if (h_x_ocl) {
-        delete[] h_x_ocl;
-        delete[] h_y_ocl;
-    }
-
-    if(opencl_gpu){
-        if(num_std_wells > 0){
-            delete[] h_Cnnzs_ocl;
-            delete[] h_Dnnzs_ocl;
-            delete[] h_Bnnzs_ocl;
-            delete[] h_Ccols_ocl;
-            delete[] h_Bcols_ocl;
-            delete[] val_pointers;
-        }
-    }
-#endif
 }
 
 /*
@@ -106,23 +61,20 @@ WellContributions::~WellContributions()
 void WellContributions::applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y) {
     // apply MultisegmentWells
     if (num_ms_wells > 0) {
-        // allocate pinned memory on host if not yet done
-        if (h_x_ocl == nullptr) {
-            h_x_ocl = new double[N];
-            h_y_ocl = new double[N];
-        }
+        h_x_ocl.resize(N);
+        h_y_ocl.resize(N);
 
         // copy vectors x and y from GPU to CPU
-        queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, h_x_ocl);
-        queue->enqueueReadBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y_ocl);
+        queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, h_x_ocl.data());
+        queue->enqueueReadBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y_ocl.data());
 
         // actually apply MultisegmentWells
         for (MultisegmentWellContribution *well : multisegments) {
-            well->apply(h_x_ocl, h_y_ocl);
+            well->apply(h_x_ocl.data(), h_y_ocl.data());
         }
 
         // copy vector y from CPU to GPU
-        queue->enqueueWriteBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y_ocl);
+        queue->enqueueWriteBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y_ocl.data());
     }
 }
 #endif
 
@@ -130,12 +82,12 @@ void WellContributions::applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y) {
 void WellContributions::addMatrix([[maybe_unused]] MatrixType type, [[maybe_unused]] int *colIndices, [[maybe_unused]] double *values, [[maybe_unused]] unsigned int val_size)
 {
-    if (!allocated) {
-        OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
-    }
-
 #if HAVE_CUDA
     if(cuda_gpu){
+        if (!allocated) {
+            OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
+        }
         addMatrixGpu(type, colIndices, values, val_size);
     }
 #endif
 
@@ -143,32 +95,29 @@
 #if HAVE_OPENCL
     if(opencl_gpu){
         switch (type) {
-            case MatrixType::C:
-                std::copy(colIndices, colIndices + val_size, h_Ccols_ocl + num_blocks_so_far);
-                std::copy(values, values + val_size*dim*dim_wells, h_Cnnzs_ocl + num_blocks_so_far*dim*dim_wells);
-                break;
+        case MatrixType::C:
+            h_Ccols_ocl.insert(h_Ccols_ocl.end(), colIndices, colIndices + val_size);
+            h_Cnnzs_ocl.insert(h_Cnnzs_ocl.end(), values, values + val_size * dim * dim_wells);
+            break;
 
-            case MatrixType::D:
-                std::copy(values, values + dim_wells*dim_wells, h_Dnnzs_ocl + num_std_wells_so_far*dim_wells*dim_wells);
-                break;
+        case MatrixType::D:
+            h_Dnnzs_ocl.insert(h_Dnnzs_ocl.end(), values, values + dim_wells * dim_wells);
+            break;
 
-            case MatrixType::B:
-                std::copy(colIndices, colIndices + val_size, h_Bcols_ocl + num_blocks_so_far);
-                std::copy(values, values + val_size*dim*dim_wells, h_Bnnzs_ocl + num_blocks_so_far*dim*dim_wells);
-                val_pointers[num_std_wells_so_far] = num_blocks_so_far;
+        case MatrixType::B:
+            h_Bcols_ocl.insert(h_Bcols_ocl.end(), colIndices, colIndices + val_size);
+            h_Bnnzs_ocl.insert(h_Bnnzs_ocl.end(), values, values + val_size * dim * dim_wells);
 
-                if(num_std_wells_so_far == num_std_wells - 1){
-                    val_pointers[num_std_wells] = num_blocks;
-                }
-                break;
+            if(h_val_pointers_ocl.empty()){
+                h_val_pointers_ocl.push_back(0);
+            }
+            else{
+                h_val_pointers_ocl.push_back(h_val_pointers_ocl.back() + val_size);
+            }
+            break;
 
-            default:
-                OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributions::addMatrix()");
-        }
-
-        if (MatrixType::B == type) {
-            num_blocks_so_far += val_size;
-            num_std_wells_so_far++;
+        default:
+            OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributions::addMatrix()");
         }
     }
 
@@ -186,6 +135,7 @@ void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
     dim_wells = dim_wells_;
 }
 
+#if HAVE_CUDA
 void WellContributions::addNumBlocks(unsigned int numBlocks)
 {
     if (allocated) {
@@ -195,6 +145,15 @@ void WellContributions::addNumBlocks(unsigned int numBlocks)
     num_std_wells++;
 }
 
+void WellContributions::alloc()
+{
+    if (num_std_wells > 0) {
+        allocStandardWells();
+        allocated = true;
+    }
+}
+#endif
+
 void WellContributions::addMultisegmentWellContribution(unsigned int dim_, unsigned int dim_wells_,
     unsigned int Nb, unsigned int Mb,
     unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
diff --git a/opm/simulators/linalg/bda/WellContributions.cu b/opm/simulators/linalg/bda/WellContributions.cu
index bf9417a98..66e7abbc6 100644
--- a/opm/simulators/linalg/bda/WellContributions.cu
+++ b/opm/simulators/linalg/bda/WellContributions.cu
@@ -136,7 +136,6 @@ void WellContributions::allocStandardWells()
         val_pointers = new unsigned int[num_std_wells + 1];
         cudaMalloc((void**)&d_val_pointers, sizeof(int) * (num_std_wells + 1));
         cudaCheckLastError("apply_gpu malloc failed");
-        allocated = true;
     }
 }
 
diff --git a/opm/simulators/linalg/bda/WellContributions.hpp b/opm/simulators/linalg/bda/WellContributions.hpp
index 1467cbee7..6f8b856b9 100644
--- a/opm/simulators/linalg/bda/WellContributions.hpp
+++ b/opm/simulators/linalg/bda/WellContributions.hpp
@@ -55,7 +55,6 @@ namespace Opm
 /// - copy data of wellcontributions
 class WellContributions
 {
-
 public:
 
     /// StandardWell has C, D and B matrices that need to be copied
     enum class MatrixType {
@@ -64,28 +63,18 @@
         B
     };
 
-    unsigned int num_blocks = 0;             // total number of blocks in all wells
-    unsigned int dim;                        // number of columns in blocks in B and C, equal to StandardWell::numEq
-    unsigned int dim_wells;                  // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq
-    unsigned int num_std_wells = 0;          // number of StandardWells in this object
-    unsigned int num_ms_wells = 0;           // number of MultisegmentWells in this object, must equal multisegments.size()
-    unsigned int num_blocks_so_far = 0;      // keep track of where next data is written
-    unsigned int num_std_wells_so_far = 0;   // keep track of where next data is written
-    unsigned int *val_pointers = nullptr;    // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
-
 #if HAVE_OPENCL
-    double *h_Cnnzs_ocl = nullptr;
-    double *h_Dnnzs_ocl = nullptr;
-    double *h_Bnnzs_ocl = nullptr;
-    int *h_Ccols_ocl = nullptr;
-    int *h_Bcols_ocl = nullptr;
-    double *h_x_ocl = nullptr;
-    double *h_y_ocl = nullptr;
+    std::vector<double> h_Cnnzs_ocl, h_Dnnzs_ocl, h_Bnnzs_ocl;
+    std::vector<int> h_Ccols_ocl, h_Bcols_ocl;
+    std::vector<unsigned int> h_val_pointers_ocl;
+    std::vector<double> h_x_ocl, h_y_ocl;
 #endif
 
 private:
+    unsigned int dim;                        // number of columns in blocks in B and C, equal to StandardWell::numEq
+    unsigned int dim_wells;                  // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq
+    unsigned int num_ms_wells = 0;           // number of MultisegmentWells in this object, must equal multisegments.size()
     unsigned int N;                          // number of rows (not blockrows) in vectors x and y
-    bool allocated = false;
     std::vector<MultisegmentWellContribution*> multisegments;
     int *toOrder = nullptr;
@@ -95,6 +84,12 @@ private:
     bool cuda_gpu = false;
 
 #if HAVE_CUDA
+    bool allocated = false;
+    unsigned int num_blocks = 0;             // total number of blocks in all wells
+    unsigned int num_std_wells = 0;          // number of StandardWells in this object
+    unsigned int num_blocks_so_far = 0;      // keep track of where next data is written
+    unsigned int num_std_wells_so_far = 0;   // keep track of where next data is written
+    unsigned int *val_pointers = nullptr;    // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
     cudaStream_t stream;
 
     // data for StandardWells, could remain nullptrs if not used
@@ -142,6 +137,13 @@ public:
     unsigned int getNumWells(){
         return num_std_wells + num_ms_wells;
     }
+
+    /// Indicate how large the next StandardWell is, this function cannot be called after alloc() is called
+    /// \param[in] numBlocks   number of blocks in C and B of next StandardWell
+    void addNumBlocks(unsigned int numBlocks);
+
+    /// Allocate memory for the StandardWells
+    void alloc();
 #endif
 
     /// Create a new WellContributions
@@ -150,18 +152,11 @@ public:
 
     /// Destroy a WellContributions, and free memory
     ~WellContributions();
-
-    /// Allocate memory for the StandardWells
-    void alloc();
-
     /// Indicate how large the blocks of the StandardWell (C and B) are
     /// \param[in] dim         number of columns
     /// \param[in] dim_wells   number of rows
     void setBlockSize(unsigned int dim, unsigned int dim_wells);
 
-    /// Indicate how large the next StandardWell is, this function cannot be called after alloc() is called
-    /// \param[in] numBlocks   number of blocks in C and B of next StandardWell
-    void addNumBlocks(unsigned int numBlocks);
 
     /// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
     /// \param[in] type        indicate if C, D or B is sent
diff --git a/opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp b/opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp
new file mode 100644
index 000000000..716d05e57
--- /dev/null
+++ b/opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp
@@ -0,0 +1,102 @@
+/*
+  Copyright 2019 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include
+
+#include
+#include
+#include
+
+#include
+
+
+namespace bda
+{
+    using Opm::OpmLog;
+    using Dune::Timer;
+
+    void WellContributionsOCLContainer::initBuffers(WellContributions &wellContribs)
+    {
+        dim = wellContribs.dim;
+        dim_wells = wellContribs.dim_wells;
+        num_std_wells = wellContribs.h_val_pointers_ocl.size() - 1;
+        toOrder.insert(toOrder.end(), wellContribs.toOrder, wellContribs.toOrder + wellContribs.h_Ccols_ocl.size());
+
+        s.Cnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Cnnzs_ocl.size());
+        s.Dnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Dnnzs_ocl.size());
+        s.Bnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Bnnzs_ocl.size());
+        s.Ccols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * wellContribs.h_Ccols_ocl.size());
+        s.Bcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * wellContribs.h_Bcols_ocl.size());
+        s.val_pointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size());
+        s.toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * toOrder.size());
+    }
+
+    void WellContributionsOCLContainer::copy_to_gpu(WellContributions &wellContribs){
+        cl::Event event;
+        queue->enqueueWriteBuffer(s.Cnnzs, CL_TRUE, 0, sizeof(double) * wellContribs.h_Cnnzs_ocl.size(), wellContribs.h_Cnnzs_ocl.data());
+        queue->enqueueWriteBuffer(s.Dnnzs, CL_TRUE, 0, sizeof(double) * wellContribs.h_Dnnzs_ocl.size(), wellContribs.h_Dnnzs_ocl.data());
+        queue->enqueueWriteBuffer(s.Bnnzs, CL_TRUE, 0, sizeof(double) * wellContribs.h_Bnnzs_ocl.size(), wellContribs.h_Bnnzs_ocl.data());
+        queue->enqueueWriteBuffer(s.Ccols, CL_TRUE, 0, sizeof(int) * wellContribs.h_Ccols_ocl.size(), wellContribs.h_Ccols_ocl.data());
+        queue->enqueueWriteBuffer(s.Bcols, CL_TRUE, 0, sizeof(int) * wellContribs.h_Bcols_ocl.size(), wellContribs.h_Bcols_ocl.data());
+        queue->enqueueWriteBuffer(s.val_pointers, CL_TRUE, 0, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size(), wellContribs.h_val_pointers_ocl.data());
+        queue->enqueueWriteBuffer(s.toOrder, CL_TRUE, 0, sizeof(int) * toOrder.size(), toOrder.data(), nullptr, &event);
+        event.wait();
+    }
+
+    void WellContributionsOCLContainer::update_on_gpu(WellContributions &wellContribs){
+        cl::Event event;
+        queue->enqueueWriteBuffer(s.Cnnzs, CL_TRUE, 0, sizeof(double) * wellContribs.h_Cnnzs_ocl.size(), wellContribs.h_Cnnzs_ocl.data());
+        queue->enqueueWriteBuffer(s.Dnnzs, CL_TRUE, 0, sizeof(double) * wellContribs.h_Dnnzs_ocl.size(), wellContribs.h_Dnnzs_ocl.data());
+        queue->enqueueWriteBuffer(s.Bnnzs, CL_TRUE, 0, sizeof(double) * wellContribs.h_Bnnzs_ocl.size(), wellContribs.h_Bnnzs_ocl.data(), nullptr, &event);
+        event.wait();
+    }
+
+    void WellContributionsOCLContainer::setOpenCLContext(cl::Context *context_){
+        this->context = context_;
+    }
+
+    void WellContributionsOCLContainer::setOpenCLQueue(cl::CommandQueue *queue_){
+        this->queue = queue_;
+    }
+
+    void WellContributionsOCLContainer::setKernel(cl::make_kernel<...> *stdwell_apply_){
+        this->stdwell_apply = stdwell_apply_;
+    }
+
+    void WellContributionsOCLContainer::applyStdWells(cl::Buffer& x, cl::Buffer& y){
+        const unsigned int work_group_size = 32;
+        const unsigned int total_work_items = num_std_wells * work_group_size;
+        const unsigned int lmem1 = sizeof(double) * work_group_size;
+        const unsigned int lmem2 = sizeof(double) * dim_wells;
+
+        cl::Event event;
+        event = (*stdwell_apply)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
+                                 s.Cnnzs, s.Dnnzs, s.Bnnzs, s.Ccols, s.Bcols, x, y, s.toOrder, dim, dim_wells, s.val_pointers,
+                                 cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
+    }
+
+    void WellContributionsOCLContainer::apply(cl::Buffer& x, cl::Buffer& y){
+        if(num_std_wells > 0){
+            applyStdWells(x, y);
+        }
+    }
+} // end namespace bda
diff --git a/opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp b/opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp
new file mode 100644
index 000000000..70e85fb90
--- /dev/null
+++ b/opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp
@@ -0,0 +1,70 @@
+/*
+  Copyright 2019 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef WELLCONTRIBUTIONSOCLCONTAINER_HPP
+#define WELLCONTRIBUTIONSOCLCONTAINER_HPP
+
+#include
+#include
+
+namespace bda
+{
+    class WellContributionsOCLContainer
+    {
+    private:
+        unsigned int dim, dim_wells;
+        unsigned int num_std_wells = 0;
+        unsigned int num_ms_wells = 0; // number of MultisegmentWells in this object, must equal multisegments.size()
+        std::vector<int> toOrder;
+
+        typedef struct {
+            cl::Buffer Cnnzs, Dnnzs, Bnnzs;
+            cl::Buffer Ccols, Bcols;
+            cl::Buffer val_pointers, toOrder;
+        } GPU_storage;
+
+        GPU_storage s;
+        cl::Context *context;
+        cl::CommandQueue *queue;
+        cl::make_kernel<...> *stdwell_apply;
+
+        void applyStdWells(cl::Buffer& x, cl::Buffer& y);
+
+    public:
+        WellContributionsOCLContainer();
+        ~WellContributionsOCLContainer();
+
+        void apply(cl::Buffer& x, cl::Buffer& y);
+        void initBuffers(WellContributions &wellContribs);
+        void copy_to_gpu(WellContributions &wellContribs);
+        void update_on_gpu(WellContributions &wellContribs);
+        void setOpenCLContext(cl::Context *context);
+        void setOpenCLQueue(cl::CommandQueue *queue);
+        void setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group);
+        void setKernel(cl::make_kernel<...> *stdwell_apply_);
+    };
+} // end namespace bda
+
+#endif
diff --git a/opm/simulators/linalg/bda/openclKernels.hpp b/opm/simulators/linalg/bda/openclKernels.hpp
index 933634a9f..d7364c9ec 100644
--- a/opm/simulators/linalg/bda/openclKernels.hpp
+++ b/opm/simulators/linalg/bda/openclKernels.hpp
@@ -366,96 +366,78 @@ namespace bda
     }
     )";
 
-    inline const char* add_well_contributions_s = R"(
-    #pragma OPENCL EXTENSION cl_khr_fp64: enable
-    #pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable
-
-    void atomicAdd(volatile __global double *val, const double delta){
-        union{
-            double f;
-            ulong i;
-        } old;
-
-        union{
-            double f;
-            ulong i;
-        } new;
-
-        do{
-            old.f = *val;
-            new.f = old.f + delta;
-        } while(atom_cmpxchg((volatile __global ulong *)val, old.i, new.i) != old.i);
-    }
-
-    __kernel void add_well_contributions(__global const double *valsC,
-                                         __global const double *valsD,
-                                         __global const double *valsB,
-                                         __global const int *colsC,
-                                         __global const int *colsB,
-                                         __global const double *x,
-                                         __global double *y,
-                                         const unsigned int blnc,
-                                         const unsigned int blnr,
-                                         __global const unsigned int *rowptr,
-                                         __local double *localSum,
-                                         __local double *z1,
-                                         __local double *z2){
+    inline const char* stdwell_apply_s = R"(
+    __kernel void stdwell_apply(__global const double *Cnnzs,
+                                __global const double *Dnnzs,
+                                __global const double *Bnnzs,
+                                __global const int *Ccols,
+                                __global const int *Bcols,
+                                __global const double *x,
+                                __global double *y,
+                                __global const int *toOrder,
+                                const unsigned int dim,
+                                const unsigned int dim_wells,
+                                __global const unsigned int *val_pointers,
+                                __local double *localSum,
+                                __local double *z1,
+                                __local double *z2){
         int wgId = get_group_id(0);
         int wiId = get_local_id(0);
-        int valSize = rowptr[wgId + 1] - rowptr[wgId];
-        int valsPerBlock = blnc*blnr;
+        int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
+        int valsPerBlock = dim*dim_wells;
         int numActiveWorkItems = (32/valsPerBlock)*valsPerBlock;
         int numBlocksPerWarp = 32/valsPerBlock;
-        int c = wiId % blnc;
-        int r = (wiId/blnc) % blnr;
+        int c = wiId % dim;
+        int r = (wiId/dim) % dim_wells;
         double temp;
 
+        barrier(CLK_LOCAL_MEM_FENCE);
+
         localSum[wiId] = 0;
         if(wiId < numActiveWorkItems){
-            int b = wiId/valsPerBlock + rowptr[wgId];
-            while(b < valSize + rowptr[wgId]){
-                int colIdx = colsB[b];
-                localSum[wiId] += valsB[b*blnc*blnr + r*blnc + c]*x[colIdx*blnc + c];
+            int b = wiId/valsPerBlock + val_pointers[wgId];
+            while(b < valSize + val_pointers[wgId]){
+                int colIdx = toOrder[Bcols[b]];
+                localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
                 b += numBlocksPerWarp;
             }
-        }
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        int stride = valsPerBlock;
-        if(wiId < stride){
-            localSum[wiId] += localSum[wiId + stride];
-        }
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        if(c == 0 && wiId < valsPerBlock){
-            for(stride = 2; stride > 0; stride /= 2){
-                localSum[wiId] += localSum[wiId + stride];
+            if(wiId < valsPerBlock){
+                localSum[wiId] += localSum[wiId + valsPerBlock];
+            }
+
+            b = wiId/valsPerBlock + val_pointers[wgId];
+
+            if(wiId < valsPerBlock){
+                if(c == 0 || c == 2) {localSum[wiId] += localSum[wiId + 2];}
+                if(c == 0 || c == 1) {localSum[wiId] += localSum[wiId + 1];}
+            }
+
+            if(c == 0 && wiId < valsPerBlock){
+                z1[r] = localSum[wiId];
             }
-            z1[r] = localSum[wiId];
         }
 
         barrier(CLK_LOCAL_MEM_FENCE);
 
-        if(wiId < blnr){
+        if(wiId < dim_wells){
             temp = 0.0;
-            for(unsigned int i = 0; i < blnr; ++i){
-                temp += valsD[wgId*blnr*blnr + wiId*blnr + i]*z1[i];
+            for(unsigned int i = 0; i < dim_wells; ++i){
+                temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
             }
             z2[wiId] = temp;
         }
 
-        barrier(CLK_GLOBAL_MEM_FENCE);
+        barrier(CLK_LOCAL_MEM_FENCE);
 
-        if(wiId < blnc*valSize){
+        if(wiId < dim*valSize){
             temp = 0.0;
-            int bb = wiId/blnc + rowptr[wgId];
-            int colIdx = colsC[bb];
-            for (unsigned int j = 0; j < blnr; ++j){
-                temp += valsC[bb*blnc*blnr + j*blnc + c]*z2[j];
+            int bb = wiId/dim + val_pointers[wgId];
+            for (unsigned int j = 0; j < dim_wells; ++j){
+                temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
             }
-            atomicAdd(&y[colIdx*blnc + c], -temp);
+            int colIdx = toOrder[Ccols[bb]];
+            y[colIdx*dim + c] -= temp;
         }
     }
     )";
 
diff --git a/opm/simulators/linalg/bda/openclSolverBackend.cpp b/opm/simulators/linalg/bda/openclSolverBackend.cpp
index 749ddb905..50a549faf 100644
--- a/opm/simulators/linalg/bda/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp
@@ -51,6 +51,7 @@ using Dune::Timer;
 template <unsigned int block_size>
 openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_) {
     prec = new Preconditioner(LEVEL_SCHEDULING, GRAPH_COLORING, verbosity_);
+    wcontainer = new WContainer();
 }
 
 
@@ -180,20 +181,6 @@ void openclSolverBackend<block_size>::spmv_blocked_w(cl::Buffer vals, cl::Buffer
     }
 }
 
-template <unsigned int block_size>
-void openclSolverBackend<block_size>::stdwell_w(cl::Buffer Cnnzs, cl::Buffer Dnnzs, cl::Buffer Bnnzs, cl::Buffer Ccols, cl::Buffer Bcols, cl::Buffer x, cl::Buffer y, cl::Buffer val_pointers)
-{
-    const unsigned int work_group_size = 32;
-    const unsigned int total_work_items = num_std_wells * work_group_size;
-    const unsigned int lmem1 = sizeof(double) * work_group_size;
-    const unsigned int lmem2 = sizeof(double) * dim_wells;
-
-    cl::Event event;
-    event = (*add_well_contributions_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
-                                        Cnnzs, Dnnzs, Bnnzs, Ccols, Bcols, x, y, dim_weqs, dim_wells, val_pointers,
-                                        cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
-}
-
 template <unsigned int block_size>
 void openclSolverBackend<block_size>::gpu_pbicgstab(BdaResult& res) {
     float it;
@@ -249,7 +236,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(BdaResult& res) {
         // v = A * pw
         t_spmv.start();
         spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_pw, d_v);
         t_spmv.stop();
 
         // apply wellContributions
         t_well.start();
-        stdwell_w(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_pw, d_v, d_val_pointers);
+        wcontainer->apply(d_pw, d_v);
         t_well.stop();
 
@@ -283,7 +270,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(BdaResult& res) {
 
         // apply wellContributions
         t_well.start();
-        stdwell_w(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_s, d_t, d_val_pointers);
+        wcontainer->apply(d_s, d_t);
         t_well.stop();
 
         t_rest.start();
@@ -331,7 +318,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(BdaResult& res) {
 
 template <unsigned int block_size>
-void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) {
+void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, WellContributions &wellContribs) {
     this->N = N_;
     this->nnz = nnz_;
     this->nnzb = nnz_ / block_size / block_size;
@@ -473,7 +460,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
         source.emplace_back(std::make_pair(spmv_blocked_s, strlen(spmv_blocked_s)));
         source.emplace_back(std::make_pair(ILU_apply1_s, strlen(ILU_apply1_s)));
         source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
-        source.emplace_back(std::make_pair(add_well_contributions_s, strlen(add_well_contributions_s)));
+        source.emplace_back(std::make_pair(stdwell_apply_s, strlen(stdwell_apply_s)));
         program = cl::Program(*context, source);
         program.build(devices);
@@ -483,6 +470,8 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
         prec->setOpenCLContext(context.get());
         prec->setOpenCLQueue(queue.get());
+        wcontainer->setOpenCLContext(context.get());
+        wcontainer->setOpenCLQueue(queue.get());
 
         rb = new double[N];
         tmp = new double[N];
@@ -507,6 +496,8 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
         d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
         d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
 
+        wcontainer->initBuffers(wellContribs);
+
         // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
         // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
@@ -518,9 +509,13 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
         spmv_blocked_k.reset(new cl::make_kernel<...>(cl::Kernel(program, "spmv_blocked")));
         ILU_apply1_k.reset(new cl::make_kernel<...>(cl::Kernel(program, "ILU_apply1")));
         ILU_apply2_k.reset(new cl::make_kernel<...>(cl::Kernel(program, "ILU_apply2")));
-        add_well_contributions_k.reset(new cl::make_kernel<...>(cl::Kernel(program, "add_well_contributions")));
+        stdwell_apply_k.reset(new cl::make_kernel<...>(cl::Kernel(program, "stdwell_apply")));
 
         prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());
+        wcontainer->setKernel(stdwell_apply_k.get());
 
     } catch (const cl::Error& error) {
         std::ostringstream oss;
@@ -536,23 +531,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
     initialized = true;
 } // end initialize()
 
-template <unsigned int block_size>
-void openclSolverBackend<block_size>::initialize_wellContribs(WellContributions& wellContribs){
-    dim_weqs = wellContribs.dim;
-    dim_wells = wellContribs.dim_wells;
-    num_blocks = wellContribs.num_blocks;
-    num_std_wells = wellContribs.num_std_wells;
-
-    if(num_std_wells > 0){
-        d_Cnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim_weqs * dim_wells);
-        d_Dnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
-        d_Bnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim_weqs * dim_wells);
-        d_Ccols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
-        d_Bcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
-        d_val_pointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
-    }
-}
-
 template <unsigned int block_size>
 void openclSolverBackend<block_size>::finalize() {
     delete[] rb;
@@ -561,11 +539,12 @@ void openclSolverBackend<block_size>::finalize() {
     delete[] vals_contiguous;
 #endif
     delete prec;
+    delete wcontainer;
 } // end finalize()
 
 
 template <unsigned int block_size>
-void openclSolverBackend<block_size>::copy_system_to_gpu() {
+void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions &wellContribs) {
     Timer t;
     cl::Event event;
 
@@ -587,6 +566,8 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
     queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
     event.wait();
 
+    wcontainer->copy_to_gpu(wellContribs);
+
     if (verbosity > 2) {
         std::ostringstream out;
         out << "openclSolver::copy_system_to_gpu(): " << t.stop() << " s";
@@ -594,33 +575,9 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
     }
 } // end copy_system_to_gpu()
 
-template <unsigned int block_size>
-void openclSolverBackend<block_size>::copy_wells_to_gpu(WellContributions& wellContribs) {
-    wellContribs.setReordering(toOrder, true);
-
-    if(num_std_wells > 0){
-        Timer t;
-        cl::Event event;
-
-        queue->enqueueWriteBuffer(d_Cnnzs, CL_TRUE, 0, sizeof(double) * num_blocks * dim_weqs * dim_wells, wellContribs.h_Cnnzs_ocl);
-        queue->enqueueWriteBuffer(d_Dnnzs, CL_TRUE, 0, sizeof(double) * num_std_wells * dim_wells * dim_wells, wellContribs.h_Dnnzs_ocl);
-        queue->enqueueWriteBuffer(d_Bnnzs, CL_TRUE, 0, sizeof(double) * num_blocks * dim_weqs * dim_wells, wellContribs.h_Bnnzs_ocl);
-        queue->enqueueWriteBuffer(d_Ccols, CL_TRUE, 0, sizeof(int) * num_blocks, wellContribs.h_Ccols_ocl);
-        queue->enqueueWriteBuffer(d_Bcols, CL_TRUE, 0, sizeof(int) * num_blocks, wellContribs.h_Bcols_ocl);
-        queue->enqueueWriteBuffer(d_val_pointers, CL_TRUE, 0, sizeof(unsigned int) * (num_std_wells + 1), wellContribs.val_pointers, nullptr, &event);
-        event.wait();
-
-        if (verbosity > 2) {
-            std::ostringstream out;
-            out << "openclSolver::copy_wells_to_gpu(): " << t.stop() << " s";
-            OpmLog::info(out.str());
-        }
-    }
-}
-
 // don't copy rowpointers and colindices, they stay the same
 template <unsigned int block_size>
-void openclSolverBackend<block_size>::update_system_on_gpu() {
+void openclSolverBackend<block_size>::update_system_on_gpu(WellContributions &wellContribs) {
     Timer t;
     cl::Event event;
 
@@ -640,6 +597,8 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
     queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
     event.wait();
 
+    wcontainer->update_on_gpu(wellContribs);
+
    if (verbosity > 2) {
        std::ostringstream out;
        out << "openclSolver::update_system_on_gpu(): " << t.stop() << " s";
@@ -752,8 +711,7 @@ void openclSolverBackend<block_size>::get_result(double *x) {
 
 template <unsigned int block_size>
 SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
     if (initialized == false) {
-        initialize(N_, nnz_, dim, vals, rows, cols);
-        initialize_wellContribs(wellContribs);
+        initialize(N_, nnz_, dim, vals, rows, cols, wellContribs);
         if (analysis_done == false) {
             if (!analyse_matrix()) {
                 return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
@@ -763,14 +721,13 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
         if (!create_preconditioner()) {
             return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
         }
-        copy_system_to_gpu();
-        copy_wells_to_gpu(wellContribs);
+        copy_system_to_gpu(wellContribs);
     } else {
         update_system(vals, b);
         if (!create_preconditioner()) {
             return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
         }
-        update_system_on_gpu();
+        update_system_on_gpu(wellContribs);
     }
     solve_system(res);
     return SolverStatus::BDA_SOLVER_SUCCESS;
diff --git a/opm/simulators/linalg/bda/openclSolverBackend.hpp b/opm/simulators/linalg/bda/openclSolverBackend.hpp
index 5ed76d9a7..ec103f725 100644
--- a/opm/simulators/linalg/bda/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp
@@ -21,11 +21,10 @@
 #define OPM_OPENCLSOLVER_BACKEND_HEADER_INCLUDED
 
 #include
-
 #include
 #include
 #include
-
+#include <opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp>
 #include
 
 namespace bda
@@ -35,9 +34,9 @@ namespace bda
 template <unsigned int block_size>
 class openclSolverBackend : public BdaSolver<block_size>
 {
-
     typedef BdaSolver<block_size> Base;
     typedef BILU0<block_size> Preconditioner;
+    typedef WellContributionsOCLContainer WContainer;
 
     using Base::N;
     using Base::Nb;
@@ -51,7 +50,6 @@ class openclSolverBackend : public BdaSolver<block_size>
     using Base::initialized;
 
 private:
-
     double *rb = nullptr;               // reordered b vector, the matrix is reordered, so b must also be
     double *vals_contiguous = nullptr;  // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
 
@@ -64,13 +62,6 @@ private:
     cl::Buffer d_tmp;        // used as tmp GPU buffer for dot() and norm()
     double *tmp = nullptr;   // used as tmp CPU buffer for dot() and norm()
 
-    unsigned int num_blocks, dim_weqs, dim_wells, num_std_wells;
-    unsigned int *h_val_pointers;
-    int *h_Ccols, *h_Bcols;
-    double *h_Cnnzs, *h_Dnnzs, *h_Bnnzs;
-    cl::Buffer d_Cnnzs, d_Dnnzs, d_Bnnzs;
-    cl::Buffer d_Ccols, d_Bcols, d_val_pointers;
-
     // shared pointers are also passed to other objects
     cl::Program program;
     std::shared_ptr<cl::Context> context;
@@ -82,9 +73,13 @@ private:
     std::unique_ptr<cl::make_kernel<...> > spmv_blocked_k;
     std::shared_ptr<cl::make_kernel<...> > ILU_apply1_k;
     std::shared_ptr<cl::make_kernel<...> > ILU_apply2_k;
-    std::shared_ptr<cl::make_kernel<...> > add_well_contributions_k;
+    std::shared_ptr<cl::make_kernel<...> > stdwell_apply_k;
 
     Preconditioner *prec = nullptr;                    // only supported preconditioner is BILU0
+    WContainer *wcontainer = nullptr;
     int *toOrder = nullptr, *fromOrder = nullptr;      // BILU0 reorders rows of the matrix via these mappings
     std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr;   // original matrix
     BlockedMatrix<block_size> *rmat = nullptr;                   // reordered matrix, used for spmv
@@ -148,17 +143,13 @@ private:
     /// \param[in] vals    array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
     /// \param[in] rows    array of rowPointers, contains N/dim+1 values
     /// \param[in] cols    array of columnIndices, contains nnz values
-    void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
-
-    void initialize_wellContribs(WellContributions& wellContribs);
+    void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols, WellContributions &wellContribs);
 
     /// Clean memory
     void finalize();
 
     /// Copy linear system to GPU
-    void copy_system_to_gpu();
-
-    void copy_wells_to_gpu(WellContributions& wellContribs);
+    void copy_system_to_gpu(WellContributions &wellContribs);
 
     /// Reorder the linear system so it corresponds with the coloring
     /// \param[in] vals    array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
@@ -166,7 +157,7 @@ private:
     void update_system(double *vals, double *b);
 
     /// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
-    void update_system_on_gpu();
+    void update_system_on_gpu(WellContributions &wellContribs);
 
     /// Analyse sparsity pattern to extract parallelism
     /// \return true iff analysis was successful
@@ -179,7 +170,7 @@ private:
     /// Solve linear system
     /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res         summary of solver result
-    void solve_system(BdaResult &res);
+    void solve_system(WellContributions &wellContribs, BdaResult &res);
 
 public:
 
diff --git a/opm/simulators/wells/BlackoilWellModel_impl.hpp b/opm/simulators/wells/BlackoilWellModel_impl.hpp
index d8df47787..20471f014 100644
--- a/opm/simulators/wells/BlackoilWellModel_impl.hpp
+++ b/opm/simulators/wells/BlackoilWellModel_impl.hpp
@@ -916,6 +916,8 @@ namespace Opm {
     {
         // prepare for StandardWells
         wellContribs.setBlockSize(StandardWell<TypeTag>::numEq, StandardWell<TypeTag>::numStaticWellEq);
+
+#if HAVE_CUDA
         for(unsigned int i = 0; i < well_container_.size(); i++){
             auto& well = well_container_[i];
             std::shared_ptr<StandardWell<TypeTag> > derived = std::dynamic_pointer_cast<StandardWell<TypeTag> >(well);
@@ -928,6 +930,7 @@ namespace Opm {
 
         // allocate memory for data from StandardWells
        wellContribs.alloc();
+#endif
 
        for(unsigned int i = 0; i < well_container_.size(); i++){
            auto& well = well_container_[i];
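
Reviewer note: the sketch below is not part of the patch; it only illustrates, under the assumption that the OpenCL context, queue and compiled stdwell_apply kernel are set up by openclSolverBackend::initialize() as above, how the new bda::WellContributionsOCLContainer is driven. The function name applyWellsSketch and its parameters are hypothetical.

    // Hypothetical wiring sketch, mirroring the calls added in initialize(),
    // copy_system_to_gpu() and gpu_pbicgstab() in this patch.
    #include <opm/simulators/linalg/bda/WellContributions.hpp>
    #include <opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp>

    void applyWellsSketch(Opm::WellContributions& wellContribs,        // already filled via addMatrix()
                          bda::WellContributionsOCLContainer& wcontainer,
                          cl::Context* context, cl::CommandQueue* queue,
                          cl::Buffer& d_x, cl::Buffer& d_y)
    {
        wcontainer.setOpenCLContext(context);     // share the solver's OpenCL context
        wcontainer.setOpenCLQueue(queue);         // and its command queue
        // setKernel() would be called here with the compiled stdwell_apply functor.

        wcontainer.initBuffers(wellContribs);     // size device buffers from the host-side vectors
        wcontainer.copy_to_gpu(wellContribs);     // upload C, D, B blocks, column indices and val_pointers

        wcontainer.apply(d_x, d_y);               // subtract the StandardWell contributions from y
    }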