From a9c2a684c6c1af339a4d41dbb30af0037e670da1 Mon Sep 17 00:00:00 2001
From: Jose Eduardo Bueno <ducbueno@gmail.com>
Date: Wed, 11 Nov 2020 14:34:10 -0300
Subject: [PATCH 1/2] Removed WellContributionsOCLContainer class (rebase)

---
 CMakeLists_files.cmake                        |   1 -
 opm/simulators/linalg/ISTLSolverEbos.hpp      |   7 +-
 opm/simulators/linalg/bda/BdaBridge.cpp       |   2 +-
 opm/simulators/linalg/bda/BdaBridge.hpp       |   7 +-
 opm/simulators/linalg/bda/BdaSolver.hpp       |   1 -
 .../linalg/bda/WellContributions.cpp          | 173 ++++++++--
 .../linalg/bda/WellContributions.cu           |  32 +-
 .../linalg/bda/WellContributions.hpp          |  69 ++--
 .../bda/WellContributionsOCLContainer.cpp     | 176 ----------
 .../bda/WellContributionsOCLContainer.hpp     |  78 -----
 .../linalg/bda/openclSolverBackend.cpp        | 305 +++++++++---------
 .../linalg/bda/openclSolverBackend.hpp        |  24 +-
 .../wells/BlackoilWellModel_impl.hpp          |   2 -
 13 files changed, 382 insertions(+), 495 deletions(-)
 delete mode 100644 opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp
 delete mode 100644 opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp
diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake
index a5c7ed878..852bbe082 100644
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@@ -61,7 +61,6 @@ if(OPENCL_FOUND)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
 endif()
 if(MPI_FOUND)
diff --git a/opm/simulators/linalg/ISTLSolverEbos.hpp b/opm/simulators/linalg/ISTLSolverEbos.hpp
index 0cefb69eb..d86c613da 100644
--- a/opm/simulators/linalg/ISTLSolverEbos.hpp
+++ b/opm/simulators/linalg/ISTLSolverEbos.hpp
@@ -251,6 +251,12 @@ namespace Opm
             if (use_gpu) {
                 const std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
                 WellContributions wellContribs(gpu_mode);
+#if HAVE_OPENCL
+                if(gpu_mode.compare("opencl") == 0){
+                    const auto openclBackend = static_cast<const bda::openclSolverBackend<block_size>*>(&bdaBridge->getBackend());
+                    wellContribs.setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get());
+                }
+#endif
                 if (!useWellConn_) {
                     simulator_.problem().wellModel().getWellContributions(wellContribs);
                 }
@@ -301,7 +307,6 @@ namespace Opm
         const std::any& parallelInformation() const { return parallelInformation_; }
 
     protected:
-
         // 3x3 matrix block inversion was unstable at least 2.3 until and including
         // 2.5.0. There may still be some issue with the 4x4 matrix block inversion
         // we therefore still use the block inversion in OPM
diff --git a/opm/simulators/linalg/bda/BdaBridge.cpp b/opm/simulators/linalg/bda/BdaBridge.cpp
index 39051dcb0..71e797450 100644
--- a/opm/simulators/linalg/bda/BdaBridge.cpp
+++ b/opm/simulators/linalg/bda/BdaBridge.cpp
@@ -222,7 +222,7 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::get_result(BridgeVector
     }
 }
 
-#define INSTANTIATE_BDA_FUNCTIONS(n)                                                                                                \
+#define INSTANTIATE_BDA_FUNCTIONS(n)                                    \
 template BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >,              \
 Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >,                                    \
 n>::BdaBridge                                                                                                                       \
diff --git a/opm/simulators/linalg/bda/BdaBridge.hpp b/opm/simulators/linalg/bda/BdaBridge.hpp
index 18e623770..962acc9b4 100644
--- a/opm/simulators/linalg/bda/BdaBridge.hpp
+++ b/opm/simulators/linalg/bda/BdaBridge.hpp
@@ -47,8 +47,8 @@ template <class BridgeMatrix, class BridgeVector, int block_size>
 class BdaBridge
 {
 private:
-    std::unique_ptr<bda::BdaSolver<block_size> > backend;
     bool use_gpu = false;
+    std::unique_ptr<bda::BdaSolver<block_size> > backend;
 
 public:
     /// Construct a BdaBridge
@@ -61,7 +61,6 @@ public:
     /// \param[in] opencl_ilu_reorder         select either level_scheduling or graph_coloring, see BILU0.hpp for explanation
     BdaBridge(std::string gpu_mode, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder);
 
-
     /// Solve linear system, A*x = b
     /// \warning Values of A might get overwritten!
     /// \param[in] mat          matrix A, should be of type Dune::BCRSMatrix
@@ -80,6 +79,10 @@ public:
         return use_gpu;
     }
 
+    const bda::BdaSolver<block_size>& getBackend() const {
+        return *backend;
+    }
+
 }; // end class BdaBridge
 
 }
diff --git a/opm/simulators/linalg/bda/BdaSolver.hpp b/opm/simulators/linalg/bda/BdaSolver.hpp
index f9b0dcb7b..cef79b5e1 100644
--- a/opm/simulators/linalg/bda/BdaSolver.hpp
+++ b/opm/simulators/linalg/bda/BdaSolver.hpp
@@ -66,7 +66,6 @@ namespace bda
         bool initialized = false;
 
     public:
-
         /// Construct a BdaSolver, can be cusparseSolver or openclSolver
         /// \param[in] linear_solver_verbosity    verbosity of solver
         /// \param[in] maxit                      maximum number of iterations for solver
diff --git a/opm/simulators/linalg/bda/WellContributions.cpp b/opm/simulators/linalg/bda/WellContributions.cpp
index e9134354b..3eca058af 100644
--- a/opm/simulators/linalg/bda/WellContributions.cpp
+++ b/opm/simulators/linalg/bda/WellContributions.cpp
@@ -17,15 +17,13 @@
   along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
 
-
 #include <config.h> // CMake
 #include <cstdlib>
 #include <cstring>
-
 #include <opm/common/OpmLog/OpmLog.hpp>
 #include <opm/common/ErrorMacros.hpp>
 #include <opm/simulators/linalg/bda/openclKernels.hpp>
-#include "opm/simulators/linalg/bda/WellContributions.hpp"
+#include <opm/simulators/linalg/bda/WellContributions.hpp>
 
 namespace Opm
 {
@@ -34,75 +32,180 @@ WellContributions::WellContributions(std::string gpu_mode){
     if(gpu_mode.compare("cusparse") == 0){
         cuda_gpu = true;
     }
-
-    if(gpu_mode.compare("opencl") == 0){
+    else if(gpu_mode.compare("opencl") == 0){
         opencl_gpu = true;
     }
+    else{
+        OPM_THROW(std::logic_error, "Error: invalid GPU mode");
+    }
 }
 
-
 WellContributions::~WellContributions()
 {
-#if HAVE_CUDA
     // delete MultisegmentWellContributions
-    for (auto ms : multisegments) {
+    for (auto ms: multisegments) {
         delete ms;
     }
     multisegments.clear();
 
+#if HAVE_CUDA
     if(cuda_gpu){
         freeCudaMemory(); // should come before 'delete[] h_x'
     }
 #endif
+
+    if(num_std_wells > 0){
+        delete[] val_pointers;
+#if HAVE_OPENCL
+        if(opencl_gpu){
+            delete[] h_toOrder;
+        }
+#endif
+    }
+
+#if HAVE_OPENCL
+    if(opencl_gpu){
+        if(num_ms_wells > 0){
+            delete[] h_x;
+            delete[] h_y;
+        }
+    }
+#endif
 }
 
+#if HAVE_OPENCL
+void WellContributions::setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_){
+    this->context = context_;
+    this->queue = queue_;
+}
+
+void WellContributions::setKernel(kernel_type *kernel_){
+    this->kernel = kernel_;
+}
+
+void WellContributions::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
+    const unsigned int work_group_size = 32;
+    const unsigned int total_work_items = num_std_wells * work_group_size;
+    const unsigned int lmem1 = sizeof(double) * work_group_size;
+    const unsigned int lmem2 = sizeof(double) * dim_wells;
+
+    cl::Event event;
+    event = (*kernel)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
+                      *d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl,
+                      cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
+}
+
+void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
+    if(h_x == nullptr){
+        h_x = new double[N];
+        h_y = new double[N];
+    }
+
+    if(h_toOrder == nullptr){
+        h_toOrder = new int[Nb];
+    }
+
+    if(!read_toOrder){
+        events.resize(1);
+        queue->enqueueReadBuffer(d_toOrder, CL_FALSE, 0, sizeof(int) * Nb, h_toOrder, nullptr, &events[0]);
+        events[0].wait();
+        events.clear();
+        read_toOrder = true;
+    }
+
+    events.resize(2);
+    queue->enqueueReadBuffer(d_x, CL_FALSE, 0, sizeof(double) * N, h_x, nullptr, &events[0]);
+    queue->enqueueReadBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y, nullptr, &events[1]);
+    cl::WaitForEvents(events);
+    events.clear();
+
+    // actually apply MultisegmentWells
+    for(Opm::MultisegmentWellContribution *well: multisegments){
+        well->setReordering(h_toOrder, true);
+        well->apply(h_x, h_y);
+    }
+
+    // copy vector y from CPU to GPU
+    events.resize(1);
+    queue->enqueueWriteBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y, nullptr, &events[0]);
+    events[0].wait();
+    events.clear();
+}
+
+void WellContributions::apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
+    if(num_std_wells > 0){
+        apply_stdwells(d_x, d_y, d_toOrder);
+    }
+
+    if(num_ms_wells > 0){
+        apply_mswells(d_x, d_y, d_toOrder);
+    }
+}
+#endif
+
 void WellContributions::addMatrix([[maybe_unused]] MatrixType type, [[maybe_unused]] int *colIndices, [[maybe_unused]] double *values, [[maybe_unused]] unsigned int val_size)
 {
+    if (!allocated) {
+        OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
+    }
 
 #if HAVE_CUDA
     if(cuda_gpu){
-        if (!allocated) {
-            OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
-        }
         addMatrixGpu(type, colIndices, values, val_size);
     }
 #endif
 
 #if HAVE_OPENCL
     if(opencl_gpu){
-        if(h_val_pointers_ocl.empty()){
-            h_val_pointers_ocl.push_back(0);
-        }
-
         switch (type) {
         case MatrixType::C:
-            h_Ccols_ocl.insert(h_Ccols_ocl.end(), colIndices, colIndices + val_size);
-            h_Cnnzs_ocl.insert(h_Cnnzs_ocl.end(), values, values + val_size * dim * dim_wells);
+            events.resize(2);
+            queue->enqueueWriteBuffer(*d_Cnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
+            queue->enqueueWriteBuffer(*d_Ccols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
+            cl::WaitForEvents(events);
+            events.clear();
             break;
 
         case MatrixType::D:
-            h_Dnnzs_ocl.insert(h_Dnnzs_ocl.end(), values, values + dim_wells * dim_wells);
+            events.resize(1);
+            queue->enqueueWriteBuffer(*d_Dnnzs_ocl, CL_FALSE, sizeof(double) * num_std_wells_so_far * dim_wells * dim_wells, sizeof(double) * dim_wells * dim_wells, values, nullptr, &events[0]);
+            events[0].wait();
+            events.clear();
             break;
 
         case MatrixType::B:
-            h_Bcols_ocl.insert(h_Bcols_ocl.end(), colIndices, colIndices + val_size);
-            h_Bnnzs_ocl.insert(h_Bnnzs_ocl.end(), values, values + val_size * dim * dim_wells);
-            h_val_pointers_ocl.push_back(h_val_pointers_ocl.back() + val_size);
+            events.resize(2);
+            queue->enqueueWriteBuffer(*d_Bnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
+            queue->enqueueWriteBuffer(*d_Bcols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
+            cl::WaitForEvents(events);
+            events.clear();
+
+            val_pointers[num_std_wells_so_far] = num_blocks_so_far;
+            if (num_std_wells_so_far == num_std_wells - 1) {
+                val_pointers[num_std_wells] = num_blocks;
+                events.resize(1);
+                queue->enqueueWriteBuffer(*d_val_pointers_ocl, CL_FALSE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers, nullptr, &events[0]);
+                events[0].wait();
+                events.clear();
+            }
             break;
 
         default:
             OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributions::addMatrix()");
         }
     }
-
 #endif
 
+    if(MatrixType::B == type) {
+        num_blocks_so_far += val_size;
+        num_std_wells_so_far++;
+    }
+
 #if !HAVE_CUDA && !HAVE_OPENCL
     OPM_THROW(std::logic_error, "Error cannot add StandardWell matrix on GPU because neither CUDA nor OpenCL were found by cmake");
 #endif
 }
 
-
 void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
 {
     dim = dim_;
@@ -115,7 +218,6 @@ void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
     }
 }
 
-#if HAVE_CUDA
 void WellContributions::addNumBlocks(unsigned int numBlocks)
 {
     if (allocated) {
@@ -128,19 +230,36 @@ void WellContributions::addNumBlocks(unsigned int numBlocks)
 void WellContributions::alloc()
 {
     if (num_std_wells > 0) {
-        allocStandardWells();
+        val_pointers = new unsigned int[num_std_wells + 1];
+
+#if HAVE_CUDA
+        if(cuda_gpu){
+            allocStandardWells();
+        }
+#endif
+
+#if HAVE_OPENCL
+        if(opencl_gpu){
+            d_Cnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
+            d_Dnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
+            d_Bnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
+            d_Ccols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
+            d_Bcols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
+            d_val_pointers_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
+        }
+#endif
         allocated = true;
     }
 }
-#endif
 
 void WellContributions::addMultisegmentWellContribution(unsigned int dim_, unsigned int dim_wells_,
-        unsigned int Nb, unsigned int Mb,
+        unsigned int Nb_, unsigned int Mb,
         unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
         unsigned int DnumBlocks, double *Dvalues, UMFPackIndex *DcolPointers, UMFPackIndex *DrowIndices,
         std::vector<double> &Cvalues)
 {
     assert(dim==dim_);
+    this->Nb = Nb_;
     this->N = Nb * dim_;
     MultisegmentWellContribution *well = new MultisegmentWellContribution(dim_, dim_wells_, Nb, Mb, BnumBlocks, Bvalues, BcolIndices, BrowPointers, DnumBlocks, Dvalues, DcolPointers, DrowIndices, Cvalues);
     multisegments.emplace_back(well);
diff --git a/opm/simulators/linalg/bda/WellContributions.cu b/opm/simulators/linalg/bda/WellContributions.cu
index 66e7abbc6..a59ae3ac9 100644
--- a/opm/simulators/linalg/bda/WellContributions.cu
+++ b/opm/simulators/linalg/bda/WellContributions.cu
@@ -127,16 +127,13 @@ __global__ void apply_well_contributions(
 
 void WellContributions::allocStandardWells()
 {
-    if (num_std_wells > 0) {
-        cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
-        cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells);
-        cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
-        cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
-        cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
-        val_pointers = new unsigned int[num_std_wells + 1];
-        cudaMalloc((void**)&d_val_pointers, sizeof(int) * (num_std_wells + 1));
-        cudaCheckLastError("apply_gpu malloc failed");
-    }
+    cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
+    cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells);
+    cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
+    cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
+    cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
+    cudaMalloc((void**)&d_val_pointers, sizeof(unsigned int) * (num_std_wells + 1));
+    cudaCheckLastError("apply_gpu malloc failed");
 }
 
 void WellContributions::freeCudaMemory() {
@@ -147,14 +144,13 @@ void WellContributions::freeCudaMemory() {
         cudaFree(d_Bnnzs);
         cudaFree(d_Ccols);
         cudaFree(d_Bcols);
-        delete[] val_pointers;
         cudaFree(d_val_pointers);
     }
 
     if (num_ms_wells > 0 && h_x) {
         cudaFreeHost(h_x);
-	cudaFreeHost(h_y);
-	h_x = h_y = nullptr; // Mark as free for constructor
+        cudaFreeHost(h_y);
+        h_x = h_y = nullptr; // Mark as free for constructor
     }
 }
 
@@ -200,10 +196,6 @@ void WellContributions::apply(double *d_x, double *d_y)
 
 void WellContributions::addMatrixGpu(MatrixType type, int *colIndices, double *values, unsigned int val_size)
 {
-    if (!allocated) {
-        OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
-    }
-
     switch (type) {
     case MatrixType::C:
         cudaMemcpy(d_Cnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
@@ -218,17 +210,13 @@ void WellContributions::addMatrixGpu(MatrixType type, int *colIndices, double *v
         val_pointers[num_std_wells_so_far] = num_blocks_so_far;
         if (num_std_wells_so_far == num_std_wells - 1) {
             val_pointers[num_std_wells] = num_blocks;
+            cudaMemcpy(d_val_pointers, val_pointers, sizeof(unsigned int) * (num_std_wells + 1), cudaMemcpyHostToDevice);
         }
-        cudaMemcpy(d_val_pointers, val_pointers, sizeof(int) * (num_std_wells + 1), cudaMemcpyHostToDevice);
         break;
     default:
         OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributions::addMatrix()");
     }
     cudaCheckLastError("WellContributions::addMatrix() failed");
-    if (MatrixType::B == type) {
-        num_blocks_so_far += val_size;
-        num_std_wells_so_far++;
-    }
 }
 
 void WellContributions::setCudaStream(cudaStream_t stream_)
diff --git a/opm/simulators/linalg/bda/WellContributions.hpp b/opm/simulators/linalg/bda/WellContributions.hpp
index ac001e01b..7267fe1f0 100644
--- a/opm/simulators/linalg/bda/WellContributions.hpp
+++ b/opm/simulators/linalg/bda/WellContributions.hpp
@@ -71,30 +71,45 @@ public:
         B
     };
 
-    unsigned int dim;                        // number of columns in blocks in B and C, equal to StandardWell::numEq
-    unsigned int dim_wells;                  // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq
-    std::vector<MultisegmentWellContribution*> multisegments;
-
-#if HAVE_OPENCL
-    std::vector<double> h_Cnnzs_ocl, h_Dnnzs_ocl, h_Bnnzs_ocl;
-    std::vector<int> h_Ccols_ocl, h_Bcols_ocl;
-    std::vector<unsigned int> h_val_pointers_ocl;
-#endif
-
 private:
     bool opencl_gpu = false;
     bool cuda_gpu = false;
+    bool allocated = false;
 
     unsigned int N;                          // number of rows (not blockrows) in vectors x and y
-    unsigned int num_ms_wells = 0;           // number of MultisegmentWells in this object, must equal multisegments.size()
-
-#if HAVE_CUDA
-    bool allocated = false;
+    unsigned int Nb;                         // number of blockrows in vectors x and y
+    unsigned int dim;                        // number of columns in blocks in B and C, equal to StandardWell::numEq
+    unsigned int dim_wells;                  // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq
     unsigned int num_blocks = 0;             // total number of blocks in all wells
     unsigned int num_std_wells = 0;          // number of StandardWells in this object
+    unsigned int num_ms_wells = 0;           // number of MultisegmentWells in this object, must equal multisegments.size()
     unsigned int num_blocks_so_far = 0;      // keep track of where next data is written
     unsigned int num_std_wells_so_far = 0;   // keep track of where next data is written
     unsigned int *val_pointers = nullptr;    // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
+
+    double *h_x = nullptr;
+    double *h_y = nullptr;
+    std::vector<MultisegmentWellContribution*> multisegments;
+
+#if HAVE_OPENCL
+    typedef cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
+                            cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&,
+                            cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type;
+
+    cl::Context *context;
+    cl::CommandQueue *queue;
+    kernel_type *kernel;
+    std::vector<cl::Event> events;
+
+    std::unique_ptr<cl::Buffer> d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
+    std::unique_ptr<cl::Buffer> d_Ccols_ocl, d_Bcols_ocl;
+    std::unique_ptr<cl::Buffer> d_val_pointers_ocl;
+
+    bool read_toOrder = false;
+    int *h_toOrder = nullptr;
+#endif
+
+#if HAVE_CUDA
     cudaStream_t stream;
 
     // data for StandardWells, could remain nullptrs if not used
@@ -106,8 +121,12 @@ private:
     double *d_z1 = nullptr;
     double *d_z2 = nullptr;
     unsigned int *d_val_pointers = nullptr;
-    double *h_x = nullptr;
-    double *h_y = nullptr;
+
+    /// Allocate GPU memory for StandardWells
+    void allocStandardWells();
+
+    /// Free GPU memory allocated with cuda.
+    void freeCudaMemory();
 
     /// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
     /// \param[in] type        indicate if C, D or B is sent
@@ -115,12 +134,6 @@ private:
     /// \param[in] values      array of nonzeroes
     /// \param[in] val_size    number of blocks in C or B, ignored for D
     void addMatrixGpu(MatrixType type, int *colIndices, double *values, unsigned int val_size);
-
-    /// Allocate GPU memory for StandardWells
-    void allocStandardWells();
-
-    /// Free GPU memory allocated with cuda.
-    void freeCudaMemory();
 #endif
 
 public:
@@ -134,6 +147,15 @@ public:
     /// \param[in] d_x        vector x, must be on GPU
     /// \param[inout] d_y     vector y, must be on GPU
     void apply(double *d_x, double *d_y);
+#endif
+
+#if HAVE_OPENCL
+    void setKernel(kernel_type *kernel_);
+    void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
+    void apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
+    void apply_mswells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
+    void apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
+#endif
 
     unsigned int getNumWells(){
         return num_std_wells + num_ms_wells;
@@ -145,7 +167,6 @@ public:
 
     /// Allocate memory for the StandardWells
     void alloc();
-#endif
 
     /// Create a new WellContributions
     WellContributions(std::string gpu_mode);
@@ -158,7 +179,6 @@ public:
     /// \param[in] dim_wells   number of rows
     void setBlockSize(unsigned int dim, unsigned int dim_wells);
 
-
     /// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
     /// \param[in] type        indicate if C, D or B is sent
     /// \param[in] colIndices  columnindices of blocks in C or B, ignored for D
@@ -188,7 +208,6 @@ public:
                                          UMFPackIndex *DcolPointers, UMFPackIndex *DrowIndices,
                                          std::vector<double> &Cvalues);
 };
-
 } //namespace Opm
 
 #endif
diff --git a/opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp b/opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp
deleted file mode 100644
index da8386528..000000000
--- a/opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
-  Copyright 2019 Equinor ASA
-
-  This file is part of the Open Porous Media project (OPM).
-
-  OPM is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  OPM is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include <config.h>
-
-#include <opm/common/OpmLog/OpmLog.hpp>
-#include <opm/common/ErrorMacros.hpp>
-#include <dune/common/timer.hh>
-
-#include <opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp>
-
-namespace bda
-{
-    using Opm::OpmLog;
-    using Dune::Timer;
-
-    void WellContributionsOCLContainer::init(Opm::WellContributions &wellContribs, int N_, int Nb_){
-        N = N_;
-        Nb = Nb_;
-        dim = wellContribs.dim;
-        dim_wells = wellContribs.dim_wells;
-
-        if(!wellContribs.h_val_pointers_ocl.empty()){
-            num_blocks = wellContribs.h_Ccols_ocl.size();
-            num_std_wells = wellContribs.h_val_pointers_ocl.size() - 1;
-
-            s.Cnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Cnnzs_ocl.size());
-            s.Dnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Dnnzs_ocl.size());
-            s.Bnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Bnnzs_ocl.size());
-            s.Ccols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * wellContribs.h_Ccols_ocl.size());
-            s.Bcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * wellContribs.h_Bcols_ocl.size());
-            s.val_pointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size());
-            s.toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
-        }
-    }
-
-    void WellContributionsOCLContainer::reinit(Opm::WellContributions &wellContribs){
-        num_blocks = wellContribs.h_Ccols_ocl.size();
-        num_std_wells = wellContribs.h_val_pointers_ocl.size() - 1;
-
-        s.Cnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Cnnzs_ocl.size());
-        s.Dnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Dnnzs_ocl.size());
-        s.Bnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Bnnzs_ocl.size());
-        s.Ccols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * wellContribs.h_Ccols_ocl.size());
-        s.Bcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * wellContribs.h_Bcols_ocl.size());
-        s.val_pointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size());
-    }
-
-    void WellContributionsOCLContainer::copy_to_gpu(Opm::WellContributions &wellContribs, int *toOrder_){
-        toOrder.insert(toOrder.end(), toOrder_, toOrder_ + Nb);
-
-        if(num_std_wells > 0){
-            cl::Event event;
-            std::vector<cl::Event> events(7);
-            queue->enqueueWriteBuffer(s.Cnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Cnnzs_ocl.size(), wellContribs.h_Cnnzs_ocl.data(), nullptr, &events[0]);
-            queue->enqueueWriteBuffer(s.Dnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Dnnzs_ocl.size(), wellContribs.h_Dnnzs_ocl.data(), nullptr, &events[1]);
-            queue->enqueueWriteBuffer(s.Bnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Bnnzs_ocl.size(), wellContribs.h_Bnnzs_ocl.data(), nullptr, &events[2]);
-            queue->enqueueWriteBuffer(s.Ccols, CL_FALSE, 0, sizeof(int) * wellContribs.h_Ccols_ocl.size(), wellContribs.h_Ccols_ocl.data(), nullptr, &events[3]);
-            queue->enqueueWriteBuffer(s.Bcols, CL_FALSE, 0, sizeof(int) * wellContribs.h_Bcols_ocl.size(), wellContribs.h_Bcols_ocl.data(), nullptr, &events[4]);
-            queue->enqueueWriteBuffer(s.val_pointers, CL_FALSE, 0, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size(), wellContribs.h_val_pointers_ocl.data(), nullptr, &events[5]);
-            queue->enqueueWriteBuffer(s.toOrder, CL_FALSE, 0, sizeof(int) * toOrder.size(), toOrder.data(), nullptr, &events[6]);
-            event.waitForEvents(events);
-        }
-
-        if(!wellContribs.multisegments.empty()){
-            multisegments = std::move(wellContribs.multisegments);
-            num_ms_wells = multisegments.size();
-            x_msw.reserve(N);
-            y_msw.reserve(N);
-        }
-    }
-
-    void WellContributionsOCLContainer::update_on_gpu(Opm::WellContributions &wellContribs){
-        if(num_std_wells > 0){
-            if(num_std_wells != wellContribs.h_val_pointers_ocl.size() || num_blocks != wellContribs.h_Ccols_ocl.size()){
-                reinit(wellContribs);
-            }
-
-            cl::Event event;
-            std::vector<cl::Event> events(6);
-            queue->enqueueWriteBuffer(s.Cnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Cnnzs_ocl.size(), wellContribs.h_Cnnzs_ocl.data(), nullptr, &events[0]);
-            queue->enqueueWriteBuffer(s.Dnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Dnnzs_ocl.size(), wellContribs.h_Dnnzs_ocl.data(), nullptr, &events[1]);
-            queue->enqueueWriteBuffer(s.Bnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Bnnzs_ocl.size(), wellContribs.h_Bnnzs_ocl.data(), nullptr, &events[2]);
-            queue->enqueueWriteBuffer(s.Ccols, CL_FALSE, 0, sizeof(int) * wellContribs.h_Ccols_ocl.size(), wellContribs.h_Ccols_ocl.data(), nullptr, &events[3]);
-            queue->enqueueWriteBuffer(s.Bcols, CL_FALSE, 0, sizeof(int) * wellContribs.h_Bcols_ocl.size(), wellContribs.h_Bcols_ocl.data(), nullptr, &events[4]);
-            queue->enqueueWriteBuffer(s.val_pointers, CL_FALSE, 0, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size(), wellContribs.h_val_pointers_ocl.data(), nullptr, &events[5]);
-            event.waitForEvents(events);
-        }
-
-        if(!wellContribs.multisegments.empty()){
-            multisegments = std::move(wellContribs.multisegments);
-        }
-    }
-
-    void WellContributionsOCLContainer::setOpenCLContext(cl::Context *context_){
-        this->context = context_;
-    }
-
-    void WellContributionsOCLContainer::setOpenCLQueue(cl::CommandQueue *queue_){
-        this->queue = queue_;
-    }
-
-    void WellContributionsOCLContainer::setKernel(cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
-                                                                  cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
-                                                                  const unsigned int, const unsigned int, cl::Buffer&,
-                                                                  cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> *stdwell_apply_){
-        this->stdwell_apply = stdwell_apply_;
-    }
-
-    void WellContributionsOCLContainer::applyStdWells(cl::Buffer& x, cl::Buffer& y){
-        const unsigned int work_group_size = 32;
-        const unsigned int total_work_items = num_std_wells * work_group_size;
-        const unsigned int lmem1 = sizeof(double) * work_group_size;
-        const unsigned int lmem2 = sizeof(double) * dim_wells;
-
-        cl::Event event;
-        event = (*stdwell_apply)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
-                                 s.Cnnzs, s.Dnnzs, s.Bnnzs, s.Ccols, s.Bcols, x, y, s.toOrder, dim, dim_wells, s.val_pointers,
-                                 cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
-    }
-
-
-    void WellContributionsOCLContainer::applyMSWells(cl::Buffer& x, cl::Buffer& y) {
-        cl::Event event;
-        std::vector<cl::Event> events(2);
-
-        // copy vectors x and y from GPU to CPU
-        queue->enqueueReadBuffer(x, CL_FALSE, 0, sizeof(double) * N, x_msw.data(), nullptr, &events[0]);
-        queue->enqueueReadBuffer(y, CL_FALSE, 0, sizeof(double) * N, y_msw.data(), nullptr, &events[1]);
-        event.waitForEvents(events);
-
-        // actually apply MultisegmentWells
-        for(auto well: multisegments){
-            well->setReordering(toOrder.data(), true);
-            well->apply(x_msw.data(), y_msw.data());
-        }
-
-        // copy vector y from CPU to GPU
-        queue->enqueueWriteBuffer(y, CL_FALSE, 0, sizeof(double) * N, y_msw.data(), nullptr, &event);
-        event.wait();
-    }
-
-    void WellContributionsOCLContainer::apply(cl::Buffer& x, cl::Buffer& y){
-        if(num_std_wells > 0){
-            applyStdWells(x, y);
-        }
-
-        if(num_ms_wells > 0){
-            applyMSWells(x, y);
-        }
-    }
-
-    WellContributionsOCLContainer::~WellContributionsOCLContainer(){
-        if(num_ms_wells > 0){
-            for (auto ms : multisegments) {
-                delete ms;
-            }
-        }
-    }
-} // end namespace bda
diff --git a/opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp b/opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp
deleted file mode 100644
index 08d496141..000000000
--- a/opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
-  Copyright 2019 Equinor ASA
-
-  This file is part of the Open Porous Media project (OPM).
-
-  OPM is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  OPM is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef WELLCONTRIBUTIONSOCLCONTAINER_HPP
-#define WELLCONTRIBUTIONSOCLCONTAINER_HPP
-
-#include <opm/simulators/linalg/bda/opencl.hpp>
-#include <opm/simulators/linalg/bda/WellContributions.hpp>
-#include <opm/simulators/linalg/bda/MultisegmentWellContribution.hpp>
-
-namespace bda
-{
-    class WellContributionsOCLContainer
-    {
-    private:
-        int N, Nb;
-        unsigned int dim, dim_wells;
-        unsigned int num_blocks = 0;
-        unsigned int num_std_wells = 0;
-        unsigned int num_ms_wells = 0;           // number of MultisegmentWells in this object, must equal multisegments.size()
-
-        std::vector<int> toOrder;
-        std::vector<double> x_msw, y_msw;
-        std::vector<Opm::MultisegmentWellContribution*> multisegments;
-
-        typedef struct {
-            cl::Buffer Cnnzs, Dnnzs, Bnnzs;
-            cl::Buffer Ccols, Bcols;
-            cl::Buffer val_pointers, toOrder;
-        } GPU_storage;
-
-        GPU_storage s;
-        cl::Context *context;
-        cl::CommandQueue *queue;
-        cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
-                        cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
-                        const unsigned int, const unsigned int, cl::Buffer&,
-                        cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> *stdwell_apply;
-
-        void reinit(Opm::WellContributions &wellContribs);
-        void applyStdWells(cl::Buffer& x, cl::Buffer& y);
-        void applyMSWells(cl::Buffer& x, cl::Buffer& y);
-
-    public:
-        WellContributionsOCLContainer() {}
-        ~WellContributionsOCLContainer();
-        WellContributionsOCLContainer(const  WellContributionsOCLContainer&) = delete;
-        WellContributionsOCLContainer& operator=(const  WellContributionsOCLContainer&) = delete;
-        void apply(cl::Buffer& x, cl::Buffer& y);
-        void init(Opm::WellContributions &wellContribs, int N, int Nb);
-        void copy_to_gpu(Opm::WellContributions &wellContribs, int *toOrder_);
-        void update_on_gpu(Opm::WellContributions &wellContribs);
-        void setOpenCLContext(cl::Context *context);
-        void setOpenCLQueue(cl::CommandQueue *queue);
-        void setKernel(cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
-                                       cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
-                                       const unsigned int, const unsigned int, cl::Buffer&,
-                                       cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> *stdwell_apply_);
-    };
-} // end namespace bda
-
-#endif
diff --git a/opm/simulators/linalg/bda/openclSolverBackend.cpp b/opm/simulators/linalg/bda/openclSolverBackend.cpp
index 185b946af..d90c9ac6f 100644
--- a/opm/simulators/linalg/bda/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp
@@ -46,7 +46,142 @@ using Dune::Timer;
 template <unsigned int block_size>
 openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_) {
     prec = new Preconditioner(opencl_ilu_reorder, verbosity_);
-    wcontainer = new WContainer();
+
+    cl_int err = CL_SUCCESS;
+    std::ostringstream out;
+    try {
+        std::vector<cl::Platform> platforms;
+        cl::Platform::get(&platforms);
+        if (platforms.size() == 0) {
+            OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL platforms are found");
+        }
+        out << "Found " << platforms.size() << " OpenCL platforms" << "\n";
+
+        if (verbosity >= 1) {
+            std::string platform_info;
+            for (unsigned int i = 0; i < platforms.size(); ++i) {
+                platforms[i].getInfo(CL_PLATFORM_NAME, &platform_info);
+                out << "Platform name      : " << platform_info << "\n";
+                platforms[i].getInfo(CL_PLATFORM_VENDOR, &platform_info);
+                out << "Platform vendor    : " << platform_info << "\n";
+                platforms[i].getInfo(CL_PLATFORM_VERSION, &platform_info);
+                out << "Platform version   : " << platform_info << "\n";
+                platforms[i].getInfo(CL_PLATFORM_PROFILE, &platform_info);
+                out << "Platform profile   : " << platform_info << "\n";
+                platforms[i].getInfo(CL_PLATFORM_EXTENSIONS, &platform_info);
+                out << "Platform extensions: " << platform_info << "\n\n";
+            }
+        }
+        OpmLog::info(out.str());
+        out.str("");
+        out.clear();
+
+        if (platforms.size() <= platformID) {
+            OPM_THROW(std::logic_error, "Error chosen too high OpenCL platform ID");
+        } else {
+            std::string platform_info;
+            out << "Chosen:\n";
+            platforms[platformID].getInfo(CL_PLATFORM_NAME, &platform_info);
+            out << "Platform name      : " << platform_info << "\n";
+            platforms[platformID].getInfo(CL_PLATFORM_VERSION, &platform_info);
+            out << "Platform version   : " << platform_info << "\n";
+            OpmLog::info(out.str());
+            out.str("");
+            out.clear();
+        }
+
+        cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[platformID])(), 0};
+        context.reset(new cl::Context(CL_DEVICE_TYPE_GPU, properties));
+
+        devices = context->getInfo<CL_CONTEXT_DEVICES>();
+        if (devices.size() == 0){
+            OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL devices are found");
+        }
+        out << "Found " << devices.size() << " OpenCL devices" << "\n";
+
+        if (verbosity >= 1) {
+            for (unsigned int i = 0; i < devices.size(); ++i) {
+                std::string device_info;
+                std::vector<size_t> work_sizes;
+                std::vector<cl_device_partition_property> partitions;
+
+                devices[i].getInfo(CL_DEVICE_NAME, &device_info);
+                out << "CL_DEVICE_NAME            : " << device_info << "\n";
+                devices[i].getInfo(CL_DEVICE_VENDOR, &device_info);
+                out << "CL_DEVICE_VENDOR          : " << device_info << "\n";
+                devices[i].getInfo(CL_DRIVER_VERSION, &device_info);
+                out << "CL_DRIVER_VERSION         : " << device_info << "\n";
+                devices[i].getInfo(CL_DEVICE_BUILT_IN_KERNELS, &device_info);
+                out << "CL_DEVICE_BUILT_IN_KERNELS: " << device_info << "\n";
+                devices[i].getInfo(CL_DEVICE_PROFILE, &device_info);
+                out << "CL_DEVICE_PROFILE         : " << device_info << "\n";
+                devices[i].getInfo(CL_DEVICE_OPENCL_C_VERSION, &device_info);
+                out << "CL_DEVICE_OPENCL_C_VERSION: " << device_info << "\n";
+                devices[i].getInfo(CL_DEVICE_EXTENSIONS, &device_info);
+                out << "CL_DEVICE_EXTENSIONS      : " << device_info << "\n";
+
+                devices[i].getInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES, &work_sizes);
+                for (unsigned int j = 0; j < work_sizes.size(); ++j) {
+                    out << "CL_DEVICE_MAX_WORK_ITEM_SIZES[" << j << "]: " << work_sizes[j] << "\n";
+                }
+                devices[i].getInfo(CL_DEVICE_PARTITION_PROPERTIES, &partitions);
+                for (unsigned int j = 0; j < partitions.size(); ++j) {
+                    out << "CL_DEVICE_PARTITION_PROPERTIES[" << j << "]: " << partitions[j] << "\n";
+                }
+                partitions.clear();
+                devices[i].getInfo(CL_DEVICE_PARTITION_TYPE, &partitions);
+                for (unsigned int j = 0; j < partitions.size(); ++j) {
+                    out << "CL_DEVICE_PARTITION_PROPERTIES[" << j << "]: " << partitions[j] << "\n";
+                }
+
+                // C-style properties
+                cl_device_id tmp_id = devices[i]();
+                cl_ulong size;
+                clGetDeviceInfo(tmp_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &size, 0);
+                out << "CL_DEVICE_LOCAL_MEM_SIZE       : " << size / 1024 << " KB\n";
+                clGetDeviceInfo(tmp_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &size, 0);
+                out << "CL_DEVICE_GLOBAL_MEM_SIZE      : " << size / 1024 / 1024 / 1024 << " GB\n";
+                clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_ulong), &size, 0);
+                out << "CL_DEVICE_MAX_COMPUTE_UNITS    : " << size << "\n";
+                clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &size, 0);
+                out << "CL_DEVICE_MAX_MEM_ALLOC_SIZE   : " << size / 1024 / 1024 << " MB\n";
+                clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(cl_ulong), &size, 0);
+                out << "CL_DEVICE_MAX_WORK_GROUP_SIZE  : " << size << "\n";
+                clGetDeviceInfo(tmp_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &size, 0);
+                out << "CL_DEVICE_GLOBAL_MEM_SIZE      : " << size / 1024 / 1024 / 1024 << " GB\n\n";
+            }
+        }
+        OpmLog::info(out.str());
+        out.str("");
+        out.clear();
+
+        if (devices.size() <= deviceID){
+            OPM_THROW(std::logic_error, "Error chosen too high OpenCL device ID");
+        } else {
+            std::string device_info;
+            out << "Chosen:\n";
+            devices[deviceID].getInfo(CL_DEVICE_NAME, &device_info);
+            out << "CL_DEVICE_NAME            : " << device_info << "\n";
+            devices[deviceID].getInfo(CL_DEVICE_VERSION, &device_info);
+            out << "CL_DEVICE_VERSION         : " << device_info << "\n";
+            OpmLog::info(out.str());
+            out.str("");
+            out.clear();
+        }
+
+        cl::Event event;
+        queue.reset(new cl::CommandQueue(*context, devices[deviceID], 0, &err));
+       
+    } catch (const cl::Error& error) {
+        std::ostringstream oss;
+        oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
+        oss << getErrorString(error.err());
+        // rethrow exception
+        OPM_THROW(std::logic_error, oss.str());
+    } catch (const std::logic_error& error) {
+        // rethrow exception by OPM_THROW in the try{}, without this, a segfault occurs
+        throw error;
+    }
 }
 
 
@@ -177,11 +312,15 @@ void openclSolverBackend<block_size>::spmv_blocked_w(cl::Buffer vals, cl::Buffer
 }
 
 template <unsigned int block_size>
-void openclSolverBackend<block_size>::gpu_pbicgstab(BdaResult& res) {
+void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
     float it;
     double rho, rhop, beta, alpha, omega, tmp1, tmp2;
     double norm, norm_0;
 
+    if(wellContribs.getNumWells() > 0){
+        wellContribs.setKernel(stdwell_apply_k.get());
+    }
+
     Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
 
     // set r to the initial residual
@@ -236,7 +375,9 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(BdaResult& res) {
 
         // apply wellContributions
         t_well.start();
-        wcontainer->apply(d_pw, d_v);
+        if(wellContribs.getNumWells() > 0){
+            wellContribs.apply(d_pw, d_v, d_toOrder);
+        }
         t_well.stop();
 
         t_rest.start();
@@ -265,7 +406,9 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(BdaResult& res) {
 
         // apply wellContributions
         t_well.start();
-        wcontainer->apply(d_s, d_t);
+        if(wellContribs.getNumWells() > 0){
+            wellContribs.apply(d_s, d_t, d_toOrder);
+        }
         t_well.stop();
 
         t_rest.start();
@@ -313,7 +456,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(BdaResult& res) {
 
 
 template <unsigned int block_size>
-void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, WellContributions &wellContribs) {
+void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) {
     this->N = N_;
     this->nnz = nnz_;
     this->nnzb = nnz_ / block_size / block_size;
@@ -327,127 +470,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
     out.str("");
     out.clear();
 
-    cl_int err = CL_SUCCESS;
     try {
-        std::vector<cl::Platform> platforms;
-        cl::Platform::get(&platforms);
-        if (platforms.size() == 0) {
-            OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL platforms are found");
-        }
-        out << "Found " << platforms.size() << " OpenCL platforms" << "\n";
-
-        if (verbosity >= 1) {
-            std::string platform_info;
-            for (unsigned int i = 0; i < platforms.size(); ++i) {
-                platforms[i].getInfo(CL_PLATFORM_NAME, &platform_info);
-                out << "Platform name      : " << platform_info << "\n";
-                platforms[i].getInfo(CL_PLATFORM_VENDOR, &platform_info);
-                out << "Platform vendor    : " << platform_info << "\n";
-                platforms[i].getInfo(CL_PLATFORM_VERSION, &platform_info);
-                out << "Platform version   : " << platform_info << "\n";
-                platforms[i].getInfo(CL_PLATFORM_PROFILE, &platform_info);
-                out << "Platform profile   : " << platform_info << "\n";
-                platforms[i].getInfo(CL_PLATFORM_EXTENSIONS, &platform_info);
-                out << "Platform extensions: " << platform_info << "\n\n";
-            }
-        }
-        OpmLog::info(out.str());
-        out.str("");
-        out.clear();
-
-        if (platforms.size() <= platformID) {
-            OPM_THROW(std::logic_error, "Error chosen too high OpenCL platform ID");
-        } else {
-            std::string platform_info;
-            out << "Chosen:\n";
-            platforms[platformID].getInfo(CL_PLATFORM_NAME, &platform_info);
-            out << "Platform name      : " << platform_info << "\n";
-            platforms[platformID].getInfo(CL_PLATFORM_VERSION, &platform_info);
-            out << "Platform version   : " << platform_info << "\n";
-            OpmLog::info(out.str());
-            out.str("");
-            out.clear();
-        }
-
-        cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[platformID])(), 0};
-        context.reset(new cl::Context(CL_DEVICE_TYPE_GPU, properties));
-
-        std::vector<cl::Device> devices = context->getInfo<CL_CONTEXT_DEVICES>();
-        if (devices.size() == 0){
-            OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL devices are found");
-        }
-        out << "Found " << devices.size() << " OpenCL devices" << "\n";
-
-        if (verbosity >= 1) {
-            for (unsigned int i = 0; i < devices.size(); ++i) {
-                std::string device_info;
-                std::vector<size_t> work_sizes;
-                std::vector<cl_device_partition_property> partitions;
-
-                devices[i].getInfo(CL_DEVICE_NAME, &device_info);
-                out << "CL_DEVICE_NAME            : " << device_info << "\n";
-                devices[i].getInfo(CL_DEVICE_VENDOR, &device_info);
-                out << "CL_DEVICE_VENDOR          : " << device_info << "\n";
-                devices[i].getInfo(CL_DRIVER_VERSION, &device_info);
-                out << "CL_DRIVER_VERSION         : " << device_info << "\n";
-                devices[i].getInfo(CL_DEVICE_BUILT_IN_KERNELS, &device_info);
-                out << "CL_DEVICE_BUILT_IN_KERNELS: " << device_info << "\n";
-                devices[i].getInfo(CL_DEVICE_PROFILE, &device_info);
-                out << "CL_DEVICE_PROFILE         : " << device_info << "\n";
-                devices[i].getInfo(CL_DEVICE_OPENCL_C_VERSION, &device_info);
-                out << "CL_DEVICE_OPENCL_C_VERSION: " << device_info << "\n";
-                devices[i].getInfo(CL_DEVICE_EXTENSIONS, &device_info);
-                out << "CL_DEVICE_EXTENSIONS      : " << device_info << "\n";
-
-                devices[i].getInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES, &work_sizes);
-                for (unsigned int j = 0; j < work_sizes.size(); ++j) {
-                    out << "CL_DEVICE_MAX_WORK_ITEM_SIZES[" << j << "]: " << work_sizes[j] << "\n";
-                }
-                devices[i].getInfo(CL_DEVICE_PARTITION_PROPERTIES, &partitions);
-                for (unsigned int j = 0; j < partitions.size(); ++j) {
-                    out << "CL_DEVICE_PARTITION_PROPERTIES[" << j << "]: " << partitions[j] << "\n";
-                }
-                partitions.clear();
-                devices[i].getInfo(CL_DEVICE_PARTITION_TYPE, &partitions);
-                for (unsigned int j = 0; j < partitions.size(); ++j) {
-                    out << "CL_DEVICE_PARTITION_PROPERTIES[" << j << "]: " << partitions[j] << "\n";
-                }
-
-                // C-style properties
-                cl_device_id tmp_id = devices[i]();
-                cl_ulong size;
-                clGetDeviceInfo(tmp_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &size, 0);
-                out << "CL_DEVICE_LOCAL_MEM_SIZE       : " << size / 1024 << " KB\n";
-                clGetDeviceInfo(tmp_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &size, 0);
-                out << "CL_DEVICE_GLOBAL_MEM_SIZE      : " << size / 1024 / 1024 / 1024 << " GB\n";
-                clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_ulong), &size, 0);
-                out << "CL_DEVICE_MAX_COMPUTE_UNITS    : " << size << "\n";
-                clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &size, 0);
-                out << "CL_DEVICE_MAX_MEM_ALLOC_SIZE   : " << size / 1024 / 1024 << " MB\n";
-                clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(cl_ulong), &size, 0);
-                out << "CL_DEVICE_MAX_WORK_GROUP_SIZE  : " << size << "\n";
-                clGetDeviceInfo(tmp_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &size, 0);
-                out << "CL_DEVICE_GLOBAL_MEM_SIZE      : " << size / 1024 / 1024 / 1024 << " GB\n\n";
-            }
-        }
-        OpmLog::info(out.str());
-        out.str("");
-        out.clear();
-
-        if (devices.size() <= deviceID){
-            OPM_THROW(std::logic_error, "Error chosen too high OpenCL device ID");
-        } else {
-            std::string device_info;
-            out << "Chosen:\n";
-            devices[deviceID].getInfo(CL_DEVICE_NAME, &device_info);
-            out << "CL_DEVICE_NAME            : " << device_info << "\n";
-            devices[deviceID].getInfo(CL_DEVICE_VERSION, &device_info);
-            out << "CL_DEVICE_VERSION         : " << device_info << "\n";
-            OpmLog::info(out.str());
-            out.str("");
-            out.clear();
-        }
-
         cl::Program::Sources source(1, std::make_pair(axpy_s, strlen(axpy_s)));  // what does this '1' mean? cl::Program::Sources is of type 'std::vector<std::pair<const char*, long unsigned int> >'
         source.emplace_back(std::make_pair(dot_1_s, strlen(dot_1_s)));
         source.emplace_back(std::make_pair(norm_s, strlen(norm_s)));
@@ -457,16 +480,10 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
         source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
         source.emplace_back(std::make_pair(stdwell_apply_s, strlen(stdwell_apply_s)));
         program = cl::Program(*context, source);
-
         program.build(devices);
 
-        cl::Event event;
-        queue.reset(new cl::CommandQueue(*context, devices[deviceID], 0, &err));
-
         prec->setOpenCLContext(context.get());
         prec->setOpenCLQueue(queue.get());
-        wcontainer->setOpenCLContext(context.get());
-        wcontainer->setOpenCLQueue(queue.get());
 
         rb = new double[N];
         tmp = new double[N];
@@ -491,7 +508,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
         d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
         d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
 
-        wcontainer->init(wellContribs, N, Nb);
+        d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
 
         // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
         // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
@@ -510,7 +527,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
                                                   cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply")));
 
         prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());
-        wcontainer->setKernel(stdwell_apply_k.get());
 
     } catch (const cl::Error& error) {
         std::ostringstream oss;
@@ -534,12 +550,10 @@ void openclSolverBackend<block_size>::finalize() {
     delete[] vals_contiguous;
 #endif
     delete prec;
-    delete wcontainer;
 } // end finalize()
 
-
 template <unsigned int block_size>
-void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions &wellContribs) {
+void openclSolverBackend<block_size>::copy_system_to_gpu() {
     Timer t;
     cl::Event event;
 
@@ -558,11 +572,10 @@ void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions &well
     queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices);
     queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers);
     queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb);
+    queue->enqueueWriteBuffer(d_toOrder, CL_TRUE, 0, sizeof(int) * Nb, toOrder);
     queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
     event.wait();
 
-    wcontainer->copy_to_gpu(wellContribs, toOrder);
-
     if (verbosity > 2) {
         std::ostringstream out;
         out << "openclSolver::copy_system_to_gpu(): " << t.stop() << " s";
@@ -572,7 +585,7 @@ void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions &well
 
 // don't copy rowpointers and colindices, they stay the same
 template <unsigned int block_size>
-void openclSolverBackend<block_size>::update_system_on_gpu(WellContributions &wellContribs) {
+void openclSolverBackend<block_size>::update_system_on_gpu() {
     Timer t;
     cl::Event event;
 
@@ -592,8 +605,6 @@ void openclSolverBackend<block_size>::update_system_on_gpu(WellContributions &we
     queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
     event.wait();
 
-    wcontainer->update_on_gpu(wellContribs);
-
     if (verbosity > 2) {
         std::ostringstream out;
         out << "openclSolver::update_system_on_gpu(): " << t.stop() << " s";
@@ -660,12 +671,12 @@ bool openclSolverBackend<block_size>::create_preconditioner() {
 
 
 template <unsigned int block_size>
-void openclSolverBackend<block_size>::solve_system(BdaResult &res) {
+void openclSolverBackend<block_size>::solve_system(WellContributions &wellContribs, BdaResult &res) {
     Timer t;
 
     // actually solve
     try {
-        gpu_pbicgstab(res);
+        gpu_pbicgstab(wellContribs, res);
     } catch (const cl::Error& error) {
         std::ostringstream oss;
         oss << "openclSolverBackend::solve_system error: " << error.what() << "(" << error.err() << ")\n";
@@ -706,7 +717,7 @@ void openclSolverBackend<block_size>::get_result(double *x) {
 template <unsigned int block_size>
 SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
     if (initialized == false) {
-        initialize(N_, nnz_,  dim, vals, rows, cols, wellContribs);
+        initialize(N_, nnz_,  dim, vals, rows, cols);
         if (analysis_done == false) {
             if (!analyse_matrix()) {
                 return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
@@ -716,15 +727,15 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
         if (!create_preconditioner()) {
             return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
         }
-        copy_system_to_gpu(wellContribs);
+        copy_system_to_gpu();
     } else {
         update_system(vals, b);
         if (!create_preconditioner()) {
             return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
         }
-        update_system_on_gpu(wellContribs);
+        update_system_on_gpu();
     }
-    solve_system(res);
+    solve_system(wellContribs, res);
     return SolverStatus::BDA_SOLVER_SUCCESS;
 }
 
diff --git a/opm/simulators/linalg/bda/openclSolverBackend.hpp b/opm/simulators/linalg/bda/openclSolverBackend.hpp
index 6981c2801..1c36fefcb 100644
--- a/opm/simulators/linalg/bda/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp
@@ -25,9 +25,10 @@
 #include <opm/simulators/linalg/bda/BdaSolver.hpp>
 #include <opm/simulators/linalg/bda/ILUReorder.hpp>
 #include <opm/simulators/linalg/bda/WellContributions.hpp>
-#include <opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp>
 #include <opm/simulators/linalg/bda/BILU0.hpp>
 
+#include <tuple>
+
 namespace bda
 {
 
@@ -37,7 +38,6 @@ class openclSolverBackend : public BdaSolver<block_size>
 {
     typedef BdaSolver<block_size> Base;
     typedef BILU0<block_size> Preconditioner;
-    typedef WellContributionsOCLContainer WContainer;
 
     using Base::N;
     using Base::Nb;
@@ -54,19 +54,17 @@ private:
     double *rb = nullptr;                 // reordered b vector, the matrix is reordered, so b must also be
     double *vals_contiguous = nullptr;    // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
 
-    bool analysis_done = false;
-
     // OpenCL variables must be reusable, they are initialized in initialize()
     cl::Buffer d_Avals, d_Acols, d_Arows;        // (reordered) matrix in BSR format on GPU
     cl::Buffer d_x, d_b, d_rb, d_r, d_rw, d_p;   // vectors, used during linear solve
     cl::Buffer d_pw, d_s, d_t, d_v;              // vectors, used during linear solve
     cl::Buffer d_tmp;                            // used as tmp GPU buffer for dot() and norm()
+    cl::Buffer d_toOrder;
     double *tmp = nullptr;                       // used as tmp CPU buffer for dot() and norm()
 
     // shared pointers are also passed to other objects
+    std::vector<cl::Device> devices;
     cl::Program program;
-    std::shared_ptr<cl::Context> context;
-    std::shared_ptr<cl::CommandQueue> queue;
     std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
     std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
     std::unique_ptr<cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
@@ -80,8 +78,8 @@ private:
                                     cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > stdwell_apply_k;
 
     Preconditioner *prec = nullptr;                               // only supported preconditioner is BILU0
-    WContainer *wcontainer = nullptr;
     int *toOrder = nullptr, *fromOrder = nullptr;                 // BILU0 reorders rows of the matrix via these mappings
+    bool analysis_done = false;
     std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr;    // original matrix 
     BlockedMatrix<block_size> *rmat = nullptr;                    // reordered matrix, used for spmv
 
@@ -133,7 +131,7 @@ private:
     /// Solve linear system using ilu0-bicgstab
     /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res         summary of solver result
-    void gpu_pbicgstab(BdaResult& res);
+    void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
 
     /// Initialize GPU and allocate memory
     /// \param[in] N              number of nonzeroes, divide by dim*dim to get number of blocks
@@ -142,13 +140,13 @@ private:
     /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
     /// \param[in] rows           array of rowPointers, contains N/dim+1 values
     /// \param[in] cols           array of columnIndices, contains nnz values
-    void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols, WellContributions &wellContribs);
+    void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
 
     /// Clean memory
     void finalize();
 
     /// Copy linear system to GPU
-    void copy_system_to_gpu(WellContributions &wellContribs);
+    void copy_system_to_gpu();
 
     /// Reorder the linear system so it corresponds with the coloring
     /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
@@ -156,7 +154,7 @@ private:
     void update_system(double *vals, double *b);
 
     /// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
-    void update_system_on_gpu(WellContributions &wellContribs);
+    void update_system_on_gpu();
 
     /// Analyse sparsity pattern to extract parallelism
     /// \return true iff analysis was successful
@@ -169,9 +167,11 @@ private:
     /// Solve linear system
     /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res         summary of solver result
-    void solve_system(BdaResult &res);
+    void solve_system(WellContributions &wellContribs, BdaResult &res);
 
 public:
+    std::shared_ptr<cl::Context> context;
+    std::shared_ptr<cl::CommandQueue> queue;
 
     /// Construct a openclSolver
     /// \param[in] linear_solver_verbosity    verbosity of openclSolver
diff --git a/opm/simulators/wells/BlackoilWellModel_impl.hpp b/opm/simulators/wells/BlackoilWellModel_impl.hpp
index ab0a94f20..089895721 100644
--- a/opm/simulators/wells/BlackoilWellModel_impl.hpp
+++ b/opm/simulators/wells/BlackoilWellModel_impl.hpp
@@ -950,7 +950,6 @@ namespace Opm {
         // prepare for StandardWells
         wellContribs.setBlockSize(StandardWell<TypeTag>::numEq, StandardWell<TypeTag>::numStaticWellEq);
 
-#if HAVE_CUDA
         for(unsigned int i = 0; i < well_container_.size(); i++){
             auto& well = well_container_[i];
             std::shared_ptr<StandardWell<TypeTag> > derived = std::dynamic_pointer_cast<StandardWell<TypeTag> >(well);
@@ -963,7 +962,6 @@ namespace Opm {
 
         // allocate memory for data from StandardWells
         wellContribs.alloc();
-#endif
 
         for(unsigned int i = 0; i < well_container_.size(); i++){
             auto& well = well_container_[i];

From 09e38a38e944a7f807e853bb0a67bff061d611a7 Mon Sep 17 00:00:00 2001
From: Jose Eduardo Bueno <ducbueno@gmail.com>
Date: Thu, 19 Nov 2020 15:41:59 -0300
Subject: [PATCH 2/2] Fixed CMakeLists_files.cmake

---
 CMakeLists_files.cmake | 1 -
 1 file changed, 1 deletion(-)

diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake
index 852bbe082..f9312ede4 100644
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@@ -171,7 +171,6 @@ list (APPEND PUBLIC_HEADER_FILES
   opm/simulators/linalg/bda/openclSolverBackend.hpp
   opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
   opm/simulators/linalg/bda/WellContributions.hpp
-  opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp
   opm/simulators/linalg/amgcpr.hh
   opm/simulators/linalg/twolevelmethodcpr.hh
   opm/simulators/linalg/ExtractParallelGridInformationToISTL.hpp