diff --git a/opm/simulators/linalg/bda/WellContributions.hpp b/opm/simulators/linalg/bda/WellContributions.hpp
index 6f8b856b9..60142f014 100644
--- a/opm/simulators/linalg/bda/WellContributions.hpp
+++ b/opm/simulators/linalg/bda/WellContributions.hpp
@@ -63,22 +63,23 @@ public:
         B
     };
 
+    unsigned int dim;                        // number of columns in blocks in B and C, equal to StandardWell::numEq
+    unsigned int dim_wells;                  // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq
+
 #if HAVE_OPENCL
     std::vector<double> h_Cnnzs_ocl, h_Dnnzs_ocl, h_Bnnzs_ocl;
     std::vector<int> h_Ccols_ocl, h_Bcols_ocl;
     std::vector<unsigned int> h_val_pointers_ocl;
     std::vector<double> h_x_ocl, h_y_ocl;
-#endif
-
-private:
-    unsigned int dim;                        // number of columns in blocks in B and C, equal to StandardWell::numEq
-    unsigned int dim_wells;                  // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq
-    unsigned int num_ms_wells = 0;           // number of MultisegmentWells in this object, must equal multisegments.size()
-    unsigned int N;                          // number of rows (not blockrows) in vectors x and y
-    std::vector<MultisegmentWellContribution*> multisegments;
 
     int *toOrder = nullptr;
     bool reorder = false;
+#endif
+
+private:
+    unsigned int num_ms_wells = 0;           // number of MultisegmentWells in this object, must equal multisegments.size()
+    unsigned int N;                          // number of rows (not blockrows) in vectors x and y
+    std::vector<MultisegmentWellContribution*> multisegments;
 
     bool opencl_gpu = false;
     bool cuda_gpu = false;
diff --git a/opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp b/opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp
index 716d05e57..1c9e1d504 100644
--- a/opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp
+++ b/opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp
@@ -24,19 +24,18 @@
 #include <dune/common/timer.hh>
 
 #include <opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp>
-
+#include<iostream>
 
 namespace bda
 {
     using Opm::OpmLog;
     using Dune::Timer;
 
-    void WellContributionsOCLContainer::initBuffers(WellContributions &wellContribs)
-    {
+    void WellContributionsOCLContainer::init(Opm::WellContributions &wellContribs, int Nb_){
+        Nb = Nb_;
         dim = wellContribs.dim;
         dim_wells = wellContribs.dim_wells;
         num_std_wells = wellContribs.h_val_pointers_ocl.size() - 1;
-        toOrder.insert(toOrder.end(), wellContribs.toOrder, wellContribs.toOrder + wellContribs.h_Ccols_ocl.size());
 
         s.Cnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Cnnzs_ocl.size());
         s.Dnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Dnnzs_ocl.size());
@@ -44,10 +43,12 @@ namespace bda
         s.Ccols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * wellContribs.h_Ccols_ocl.size());
         s.Bcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * wellContribs.h_Bcols_ocl.size());
         s.val_pointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size());
-        s.toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * toOrder.size());
+        s.toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
     }
 
-    void WellcontributionsOCLContainer::copy_to_gpu(WellContributions &wellContribs){
+    void WellContributionsOCLContainer::copy_to_gpu(Opm::WellContributions &wellContribs){
+        toOrder.assign(wellContribs.toOrder, wellContribs.toOrder + Nb); // replace rather than append, so repeated calls keep exactly Nb entries
+
         cl::Event event;
         queue->enqueueWriteBuffer(s.Cnnzs, CL_TRUE, 0, sizeof(double) * wellContribs.h_Cnnzs_ocl.size(), wellContribs.h_Cnnzs_ocl.data());
         queue->enqueueWriteBuffer(s.Dnnzs, CL_TRUE, 0, sizeof(double) * wellContribs.h_Dnnzs_ocl.size(), wellContribs.h_Dnnzs_ocl.data());
@@ -59,7 +60,7 @@ namespace bda
         event.wait();
     }
 
-    void WellcontributionsOCLContainer::update_on_gpu(WellContributions &wellContribs){
+    void WellContributionsOCLContainer::update_on_gpu(Opm::WellContributions &wellContribs){
         cl::Event event;
         queue->enqueueWriteBuffer(s.Cnnzs, CL_TRUE, 0, sizeof(double) * wellContribs.h_Cnnzs_ocl.size(), wellContribs.h_Cnnzs_ocl.data());
         queue->enqueueWriteBuffer(s.Dnnzs, CL_TRUE, 0, sizeof(double) * wellContribs.h_Dnnzs_ocl.size(), wellContribs.h_Dnnzs_ocl.data());
@@ -90,7 +91,7 @@ namespace bda
 
         cl::Event event;
         event = (*stdwell_apply)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
-                                 s.Cnnzs, s.Dnnzs, s.Bnnzs, s.Ccols, s.Bcols, s.toOrder,x, y, dim, dim_wells, s.val_pointers,
+                                 s.Cnnzs, s.Dnnzs, s.Bnnzs, s.Ccols, s.Bcols, x, y, s.toOrder, dim, dim_wells, s.val_pointers,
                                  cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
     }
 
diff --git a/opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp b/opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp
index 70e85fb90..3f6182a20 100644
--- a/opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp
+++ b/opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp
@@ -31,6 +31,7 @@ namespace bda
         unsigned int dim, dim_wells;
         unsigned int num_std_wells = 0;
         unsigned int num_ms_wells = 0;           // number of MultisegmentWells in this object, must equal multisegments.size()
+        int Nb = 0;                              // set by init(); sizes the toOrder buffer and the host-side toOrder copy
         std::vector<int> toOrder;
 
         typedef struct {
@@ -50,16 +51,15 @@ namespace bda
         void applyStdWells(cl::Buffer& x, cl::Buffer& y);
 
     public:
-        WellContributionsOCLContainer();
-        ~WellContributionsOCLContainer();
+        WellContributionsOCLContainer() = default;   // state is filled in later through init() and the set*() methods
+        ~WellContributionsOCLContainer() = default;
 
         void apply(cl::Buffer& x, cl::Buffer& y);
-        void initBuffers(WellContributions &wellContribs);
-        void copy_to_gpu(WellContributions &wellContribs);
-        void update_on_gpu(WellContributions &wellContribs);
+        void init(Opm::WellContributions &wellContribs, int Nb);
+        void copy_to_gpu(Opm::WellContributions &wellContribs);
+        void update_on_gpu(Opm::WellContributions &wellContribs);
         void setOpenCLContext(cl::Context *context);
         void setOpenCLQueue(cl::CommandQueue *queue);
-        void setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group);
         void setKernel(cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                        cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                        const unsigned int, const unsigned int, cl::Buffer&,
diff --git a/opm/simulators/linalg/bda/openclKernels.hpp b/opm/simulators/linalg/bda/openclKernels.hpp
index d7364c9ec..c202147d1 100644
--- a/opm/simulators/linalg/bda/openclKernels.hpp
+++ b/opm/simulators/linalg/bda/openclKernels.hpp
@@ -436,7 +436,7 @@ namespace bda
             for (unsigned int j = 0; j < dim_wells; ++j){
                 temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
             }
-            colIdx = toOrder[Ccols[bb]];
+            int colIdx = toOrder[Ccols[bb]];
             y[colIdx*dim + c] -= temp;
         }
     }
diff --git a/opm/simulators/linalg/bda/openclSolverBackend.cpp b/opm/simulators/linalg/bda/openclSolverBackend.cpp
index 50a549faf..265cbd0a8 100644
--- a/opm/simulators/linalg/bda/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp
@@ -236,12 +236,12 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(BdaResult& res) {
 
         // v = A * pw
         t_spmv.start();
-        wcontainer->apply(d_pw, d_v);
+        spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_pw, d_v);
         t_spmv.stop();
 
         // apply wellContributions
         t_well.start();
-        stdwell_w(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_pw, d_v, d_val_pointers);
+        wcontainer->apply(d_pw, d_v);
         t_well.stop();
 
         t_rest.start();
@@ -496,7 +496,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
         d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
         d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
 
-        wcontainer->initBuffers(wellContribs);
+        wcontainer->init(wellContribs, Nb);
 
         // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
         // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
@@ -566,6 +566,7 @@ void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions &well
     queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
     event.wait();
 
+    wellContribs.setReordering(toOrder, true);
     wcontainer->copy_to_gpu(wellContribs);
 
     if (verbosity > 2) {
diff --git a/opm/simulators/linalg/bda/openclSolverBackend.hpp b/opm/simulators/linalg/bda/openclSolverBackend.hpp
index ec103f725..e01bd4de1 100644
--- a/opm/simulators/linalg/bda/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp
@@ -129,8 +129,6 @@ private:
     /// \param[out] b       output vector
     void spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b);
 
-    void stdwell_w(cl::Buffer Cnnzs, cl::Buffer Dnnzs, cl::Buffer Bnnzs, cl::Buffer Ccols, cl::Buffer Bcols, cl::Buffer x, cl::Buffer y, cl::Buffer val_pointers);
-
     /// Solve linear system using ilu0-bicgstab
     /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res         summary of solver result
@@ -170,7 +168,7 @@ private:
     /// Solve linear system
     /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res         summary of solver result
-    void solve_system(WellContributions &wellContribs, BdaResult &res);
+    void solve_system(BdaResult &res);
 
 public: