From 56c1c0862cd77db6c89cd98c9d6220eb7468850d Mon Sep 17 00:00:00 2001
From: Jose Eduardo Bueno
Date: Fri, 17 Jul 2020 17:00:37 -0300
Subject: [PATCH] Added StandardWell functionality to OpenCL backend

---
 opm/simulators/linalg/ISTLSolverEbos.hpp      |  13 +-
 .../linalg/bda/WellContributions.cpp          | 198 ++++++++++++++----
 .../linalg/bda/WellContributions.hpp          |  86 ++++----
 opm/simulators/linalg/bda/openclKernels.hpp   | 106 +++++++++-
 .../linalg/bda/openclSolverBackend.cpp        |  46 ++--
 .../linalg/bda/openclSolverBackend.hpp        |  14 +-
 6 files changed, 350 insertions(+), 113 deletions(-)

diff --git a/opm/simulators/linalg/ISTLSolverEbos.hpp b/opm/simulators/linalg/ISTLSolverEbos.hpp
index dc324e583..e05dbccec 100644
--- a/opm/simulators/linalg/ISTLSolverEbos.hpp
+++ b/opm/simulators/linalg/ISTLSolverEbos.hpp
@@ -181,7 +181,7 @@ DenseMatrix transposeDenseMatrix(const DenseMatrix& M)
 #else
             const std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
             if (gpu_mode.compare("none") != 0) {
-                OPM_THROW(std::logic_error,"Error cannot use GPU solver since neither CUDA nor OpenCL was not found by cmake");
+                OPM_THROW(std::logic_error,"Error cannot use GPU solver since neither CUDA nor OpenCL were found by cmake");
             }
 #endif
             extractParallelGridInformationToISTL(simulator_.vanguard().grid(), parallelInformation_);
@@ -465,7 +465,8 @@ DenseMatrix transposeDenseMatrix(const DenseMatrix& M)
 #if HAVE_CUDA || HAVE_OPENCL
             bool use_gpu = bdaBridge->getUseGpu();
             if (use_gpu) {
-                WellContributions wellContribs;
+                const std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
+                WellContributions wellContribs(gpu_mode);
                 if (!useWellConn_) {
                     simulator_.problem().wellModel().getWellContributions(wellContribs);
                 }
@@ -478,7 +479,13 @@ DenseMatrix transposeDenseMatrix(const DenseMatrix& M)
                 // CPU fallback
                 use_gpu = bdaBridge->getUseGpu(); // update value, BdaBridge might have disabled cusparseSolver
                 if (use_gpu) {
-                    OpmLog::warning("cusparseSolver did not converge, now trying Dune to solve current linear system...");
+                    if(gpu_mode.compare("cusparse") == 0){
+                        OpmLog::warning("cusparseSolver did not converge, now trying Dune to solve current linear system...");
+                    }
+
+                    if(gpu_mode.compare("opencl") == 0){
+                        OpmLog::warning("openclSolver did not converge, now trying Dune to solve current linear system...");
+                    }
                 }

                 // call Dune
diff --git a/opm/simulators/linalg/bda/WellContributions.cpp b/opm/simulators/linalg/bda/WellContributions.cpp
index 1533003de..a46f54c3c 100644
--- a/opm/simulators/linalg/bda/WellContributions.cpp
+++ b/opm/simulators/linalg/bda/WellContributions.cpp
@@ -24,79 +24,202 @@
 #include
 #include
+#include

 #include "opm/simulators/linalg/bda/WellContributions.hpp"

 namespace Opm
 {

+WellContributions::WellContributions(std::string gpu_mode){
+    if(gpu_mode.compare("cusparse") == 0){
+        cuda_gpu = true;
+    }
+
+    if(gpu_mode.compare("opencl") == 0){
+        opencl_gpu = true;
+    }
+}

 void WellContributions::alloc()
 {
     if (num_std_wells > 0) {
 #if HAVE_CUDA
-        allocStandardWells();
-#else
-        OPM_THROW(std::logic_error, "Error cannot allocate on GPU for StandardWells because CUDA was not found by cmake");
+        if(cuda_gpu){
+            allocStandardWells();
+        }
+#endif
+
+#if HAVE_OPENCL
+        if(opencl_gpu){
+            h_Cnnzs_ocl = new double[num_blocks * dim * dim_wells];
+            h_Dnnzs_ocl = new double[num_std_wells * dim_wells * dim_wells];
+            h_Bnnzs_ocl = new double[num_blocks * dim * dim_wells];
+            h_Ccols_ocl = new int[num_blocks];
+            h_Bcols_ocl = new int[num_blocks];
+            val_pointers = new unsigned int[num_std_wells + 1];
+
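+            // alloc() only creates the host-side arrays; the matching cl::Buffers
+            // are created in init() once the OpenCL context is known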
+            allocated = true;
+        }
+#endif
+
+#if !HAVE_CUDA && !HAVE_OPENCL
+        OPM_THROW(std::logic_error, "Error cannot allocate on GPU because neither CUDA nor OpenCL were found by cmake");
+#endif
     }
 }

 WellContributions::~WellContributions()
 {
-#if HAVE_CUDA
-    freeCudaMemory();
-#endif
-
-    if (h_x) {
-        delete[] h_x;
-        delete[] h_y;
-    }
-
     // delete MultisegmentWellContributions
     for (auto ms : multisegments) {
         delete ms;
     }
     multisegments.clear();
-}

+#if HAVE_CUDA
+    if(cuda_gpu){
+        freeCudaMemory(); // should come before 'delete[] h_x'
+    }
+#endif

 #if HAVE_OPENCL
-void WellContributions::apply(cl::Buffer& d_x, cl::Buffer& d_y)
-{
+    if (h_x_ocl) {
+        delete[] h_x_ocl;
+        delete[] h_y_ocl;
+    }
+    if(opencl_gpu){
+        if(num_std_wells > 0){
+            delete[] h_Cnnzs_ocl;
+            delete[] h_Dnnzs_ocl;
+            delete[] h_Bnnzs_ocl;
+            delete[] h_Ccols_ocl;
+            delete[] h_Bcols_ocl;
+            delete[] val_pointers;
+        }
+    }
+#endif
+}
+
+#if HAVE_OPENCL
+
+void WellContributions::init(cl::Context *context){
+    d_Cnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
+    d_Dnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
+    d_Bnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
+    d_Ccols_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
+    d_Bcols_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
+    d_val_pointers_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
+}
+
+void WellContributions::copyDataToGPU(cl::CommandQueue *queue){
+    cl::Event event;
+
+    queue->enqueueWriteBuffer(d_Cnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_blocks * dim * dim_wells, h_Cnnzs_ocl);
+    queue->enqueueWriteBuffer(d_Dnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_std_wells * dim_wells * dim_wells, h_Dnnzs_ocl);
+    queue->enqueueWriteBuffer(d_Bnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_blocks * dim * dim_wells, h_Bnnzs_ocl);
+    queue->enqueueWriteBuffer(d_Ccols_ocl, CL_TRUE, 0, sizeof(int) * num_blocks, h_Ccols_ocl);
+    queue->enqueueWriteBuffer(d_Bcols_ocl, CL_TRUE, 0, sizeof(int) * num_blocks, h_Bcols_ocl);
+    queue->enqueueWriteBuffer(d_val_pointers_ocl, CL_TRUE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers, nullptr, &event);
+    event.wait();
+}
+
+void WellContributions::applyMSWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y) {
     // apply MultisegmentWells
     if (num_ms_wells > 0) {
-        // allocate pinned memory on host if not yet done
-        if (h_x == nullptr) {
-            h_x = new double[N];
-            h_y = new double[N];
+        // allocate host memory for the vector copies if not yet done
+        if (h_x_ocl == nullptr) {
+            h_x_ocl = new double[N];
+            h_y_ocl = new double[N];
         }

         // copy vectors x and y from GPU to CPU
-        queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, h_x);
-        queue->enqueueReadBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y);
+        queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, h_x_ocl);
+        queue->enqueueReadBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y_ocl);

         // actually apply MultisegmentWells
         for (MultisegmentWellContribution *well : multisegments) {
-            well->apply(h_x, h_y);
+            well->apply(h_x_ocl, h_y_ocl);
         }

         // copy vector y from CPU to GPU
-        queue->enqueueWriteBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y);
-    }
-
-
-    // apply StandardWells
-    if (num_std_wells > 0) {
-        OPM_THROW(std::logic_error, "Error StandardWells are not supported by openclSolver");
+        queue->enqueueWriteBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y_ocl);
     }
 }
+
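+// apply all StandardWells on GPU
+// performs y -= (C^T * (D^-1 * (B*x))) for all StandardWells,
+// using one work-group of 32 work-items per well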
+void WellContributions::applyStdWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y, kernel_type *kernel){
+    const unsigned int work_group_size = 32;
+    const unsigned int total_work_items = num_std_wells * work_group_size;
+    const unsigned int lmem1 = sizeof(double) * work_group_size;
+    const unsigned int lmem2 = sizeof(double) * dim_wells;
+
+    cl::Event event;
+    event = (*kernel)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
+                      d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells,
+                      d_val_pointers_ocl, cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
+    event.wait();
+}
+
+void WellContributions::apply(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y, kernel_type *kernel){
+    if(num_std_wells > 0){
+        applyStdWell(queue, d_x, d_y, kernel);
+    }
+
+    if(num_ms_wells > 0){
+        applyMSWell(queue, d_x, d_y);
+    }
+}
+
 #endif

-void WellContributions::addMatrix([[maybe_unused]] MatrixType type, [[maybe_unused]]int *colIndices, [[maybe_unused]] double *values, [[maybe_unused]] unsigned int val_size)
+void WellContributions::addMatrix([[maybe_unused]] MatrixType type, [[maybe_unused]] int *colIndices, [[maybe_unused]] double *values, [[maybe_unused]] unsigned int val_size)
 {
+    if (!allocated) {
+        OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
+    }
+
 #if HAVE_CUDA
-    addMatrixGpu(type, colIndices, values, val_size);
-#else
-    OPM_THROW(std::logic_error, "Error cannot add StandardWell matrix on GPU because CUDA was not found by cmake");
+    if(cuda_gpu){
+        addMatrixGpu(type, colIndices, values, val_size);
+    }
+#endif
+
+#if HAVE_OPENCL
+    if(opencl_gpu){
+        switch (type) {
+        case MatrixType::C:
+            std::copy(colIndices, colIndices + val_size, h_Ccols_ocl + num_blocks_so_far);
+            std::copy(values, values + val_size*dim*dim_wells, h_Cnnzs_ocl + num_blocks_so_far*dim*dim_wells);
+            break;
+
+        case MatrixType::D:
+            std::copy(values, values + dim_wells*dim_wells, h_Dnnzs_ocl + num_std_wells_so_far*dim_wells*dim_wells);
+            break;
+
+        case MatrixType::B:
+            std::copy(colIndices, colIndices + val_size, h_Bcols_ocl + num_blocks_so_far);
+            std::copy(values, values + val_size*dim*dim_wells, h_Bnnzs_ocl + num_blocks_so_far*dim*dim_wells);
+            val_pointers[num_std_wells_so_far] = num_blocks_so_far;
+
+            if(num_std_wells_so_far == num_std_wells - 1){
+                val_pointers[num_std_wells] = num_blocks;
+            }
+            break;
+
+        default:
+            OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributions::addMatrix()");
+        }
+
+        if (MatrixType::B == type) {
+            num_blocks_so_far += val_size;
+            num_std_wells_so_far++;
+        }
+    }
+
+#endif
+
+#if !HAVE_CUDA && !HAVE_OPENCL
+    OPM_THROW(std::logic_error, "Error cannot add StandardWell matrix on GPU because neither CUDA nor OpenCL were found by cmake");
 #endif
 }

@@ -107,23 +230,23 @@ void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
     dim_wells = dim_wells_;
 }

-void WellContributions::addNumBlocks(unsigned int nnz)
+void WellContributions::addNumBlocks(unsigned int numBlocks)
 {
     if (allocated) {
         OPM_THROW(std::logic_error, "Error cannot add more sizes after allocated in WellContributions");
     }
-    num_blocks += nnz;
+    num_blocks += numBlocks;
     num_std_wells++;
 }

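+// Illustrative call order for StandardWells on the OpenCL backend
+// ('context', 'queue' and 'kernel' are owned by openclSolverBackend):
+//   WellContributions wellContribs("opencl");
+//   wellContribs.setBlockSize(dim, dim_wells);
+//   wellContribs.addNumBlocks(numBlocks);    // once per StandardWell
+//   wellContribs.alloc();                    // host-side arrays
+//   wellContribs.addMatrix(...);             // C, D and B blocks for each well
+//   wellContribs.init(context);              // device-side buffers
+//   wellContribs.copyDataToGPU(queue);
+//   wellContribs.apply(queue, d_x, d_y, kernel);
+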
-void WellContributions::addMultisegmentWellContribution(unsigned int dim_, unsigned int dim_wells_,
+void WellContributions::addMultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
         unsigned int Nb, unsigned int Mb,
         unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
         unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
         std::vector<double> &Cvalues)
 {
-    this->N = Nb * dim_;
-    MultisegmentWellContribution *well = new MultisegmentWellContribution(dim, dim_wells_, Nb, Mb, BnumBlocks, Bvalues, BcolIndices, BrowPointers, DnumBlocks, Dvalues, DcolPointers, DrowIndices, Cvalues);
+    this->N = Nb * dim;
+    MultisegmentWellContribution *well = new MultisegmentWellContribution(dim, dim_wells, Nb, Mb, BnumBlocks, Bvalues, BcolIndices, BrowPointers, DnumBlocks, Dvalues, DcolPointers, DrowIndices, Cvalues);
     multisegments.emplace_back(well);
     ++num_ms_wells;
 }
@@ -138,12 +261,5 @@ void WellContributions::setReordering(int *toOrder_, bool reorder_)
     }
 }

-#if HAVE_OPENCL
-void WellContributions::setOpenCLQueue(cl::CommandQueue *queue_)
-{
-    this->queue = queue_;
-}
-#endif
-
 } //namespace Opm

diff --git a/opm/simulators/linalg/bda/WellContributions.hpp b/opm/simulators/linalg/bda/WellContributions.hpp
index cd8b9064e..a8c6b1ed7 100644
--- a/opm/simulators/linalg/bda/WellContributions.hpp
+++ b/opm/simulators/linalg/bda/WellContributions.hpp
@@ -57,7 +57,6 @@ class WellContributions
 {

 public:
-
     /// StandardWell has C, D and B matrices that need to be copied
     enum class MatrixType {
         C,
@@ -78,17 +77,16 @@ private:
     bool allocated = false;

     std::vector<MultisegmentWellContribution*> multisegments;
+    int *toOrder = nullptr;
+    bool reorder = false;
+
+    bool opencl_gpu = false;
+    bool cuda_gpu = false;
+
 #if HAVE_CUDA
     cudaStream_t stream;
-#endif

-#if HAVE_OPENCL
-    cl::CommandQueue *queue = nullptr;
-#endif
-
-#if HAVE_CUDA
     // data for StandardWells, could remain nullptrs if not used
-    // StandardWells are only supported for cusparseSolver now
     double *d_Cnnzs = nullptr;
     double *d_Dnnzs = nullptr;
     double *d_Bnnzs = nullptr;
@@ -97,13 +95,29 @@ private:
     double *d_z1 = nullptr;
     double *d_z2 = nullptr;
     unsigned int *d_val_pointers = nullptr;
+    double *h_x = nullptr; // CUDA pinned memory for GPU memcpy
+    double *h_y = nullptr;
 #endif

-    double *h_x = nullptr, *h_y = nullptr; // CUDA pinned memory for GPU memcpy
+#if HAVE_OPENCL
+    double *h_Cnnzs_ocl = nullptr;
+    double *h_Dnnzs_ocl = nullptr;
+    double *h_Bnnzs_ocl = nullptr;
+    int *h_Ccols_ocl = nullptr;
+    int *h_Bcols_ocl = nullptr;
+    double *h_x_ocl = nullptr;
+    double *h_y_ocl = nullptr;

-    int *toOrder = nullptr;
-    bool reorder = false;
+    cl::Buffer d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
+    cl::Buffer d_Ccols_ocl, d_Bcols_ocl, d_val_pointers_ocl;

+    typedef cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type;
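+    // kernel_type must match the parameter list of the add_well_contributions
+    // kernel defined in openclKernels.hpp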
void freeCudaMemory(); - -public: - - /// Set a cudaStream to be used - /// \param[in] stream the cudaStream that is used to launch the kernel in -#if HAVE_CUDA - void setCudaStream(cudaStream_t stream); #endif - /// Create a new WellContributions, implementation is empty - WellContributions() {}; +#if HAVE_OPENCL + void applyStdWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y, kernel_type *kernel); + void applyMSWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y); +#endif - /// Destroy a WellContributions, and free memory - ~WellContributions(); +public: +#if HAVE_CUDA + /// Set a cudaStream to be used + /// \param[in] stream the cudaStream that is used to launch the kernel in + void setCudaStream(cudaStream_t stream); /// Apply all Wells in this object /// performs y -= (C^T * (D^-1 * (B*x))) for all Wells /// \param[in] d_x vector x, must be on GPU /// \param[inout] d_y vector y, must be on GPU -#if HAVE_CUDA void apply(double *d_x, double *d_y); + + unsigned int getNumWells(){ + return num_std_wells + num_ms_wells; + } #endif + #if HAVE_OPENCL - void apply(cl::Buffer& x, cl::Buffer& y); + void init(cl::Context *context); + void copyDataToGPU(cl::CommandQueue *queue); + void apply(cl::CommandQueue *queue, cl::Buffer& x, cl::Buffer& y, kernel_type *kernel); #endif + /// Create a new WellContributions + WellContributions(std::string gpu_mode); + + /// Destroy a WellContributions, and free memory + ~WellContributions(); + + /// Allocate memory for the StandardWells void alloc(); @@ -182,24 +207,11 @@ public: unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices, std::vector &Cvalues); - /// Return the number of wells added to this object - /// \return the number of wells added to this object - unsigned int getNumWells() { - return num_std_wells + num_ms_wells; - } - - /// If the rows of the matrix are reordered, the columnindices of the matrixdata are incorrect /// Those indices need to be mapped via toOrder /// \param[in] toOrder array with mappings /// \param[in] reorder whether the columnindices need to be reordered or not void setReordering(int *toOrder, bool reorder); - -#if HAVE_OPENCL - /// This object copies some cl::Buffers, it requires a CommandQueue to do that - /// \param[in] queue the opencl commandqueue to be used - void setOpenCLQueue(cl::CommandQueue *queue); -#endif }; } //namespace Opm diff --git a/opm/simulators/linalg/bda/openclKernels.hpp b/opm/simulators/linalg/bda/openclKernels.hpp index 17d8598f7..30dbfcd63 100644 --- a/opm/simulators/linalg/bda/openclKernels.hpp +++ b/opm/simulators/linalg/bda/openclKernels.hpp @@ -23,7 +23,7 @@ namespace bda { - const char* axpy_s = R"( + inline const char* axpy_s = R"( __kernel void axpy( __global double *in, const double a, @@ -42,7 +42,7 @@ namespace bda // returns partial sums, instead of the final dot product - const char* dot_1_s = R"( + inline const char* dot_1_s = R"( __kernel void dot_1( __global double *in1, __global double *in2, @@ -83,7 +83,7 @@ namespace bda // returns partial sums, instead of the final norm // the square root must be computed on CPU - const char* norm_s = R"( + inline const char* norm_s = R"( __kernel void norm( __global double *in, __global double *out, @@ -122,7 +122,7 @@ namespace bda // p = (p - omega * v) * beta + r - const char* custom_s = R"( + inline const char* custom_s = R"( __kernel void custom( __global double *p, __global double *v, @@ -150,7 +150,7 @@ namespace bda // algorithm based on: // Optimization of Block Sparse Matrix-Vector 
     // Optimization of Block Sparse Matrix-Vector Multiplication on Shared-Memory Parallel Architectures,
     // Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
-    const char* spmv_blocked_s = R"(
+    inline const char* spmv_blocked_s = R"(
     __kernel void spmv_blocked(
         __global const double *vals,
         __global const int *cols,
@@ -220,7 +220,7 @@ namespace bda

     // ILU apply part 1: forward substitution
     // solves L*x=y where L is a lower triangular sparse blocked matrix
-    const char* ILU_apply1_s = R"(
+    inline const char* ILU_apply1_s = R"(
     __kernel void ILU_apply1(
         __global const double *Lvals,
         __global const unsigned int *Lcols,
@@ -290,7 +290,7 @@ namespace bda

     // ILU apply part 2: backward substitution
-    // solves U*x=y where L is a lower triangular sparse blocked matrix
-    const char* ILU_apply2_s = R"(
+    // solves U*x=y where U is an upper triangular sparse blocked matrix
+    inline const char* ILU_apply2_s = R"(
     __kernel void ILU_apply2(
         __global const double *Uvals,
         __global const int *Ucols,
@@ -366,7 +366,99 @@ namespace bda
         }
     )";

+    inline const char* add_well_contributions_s = R"(
+    #pragma OPENCL EXTENSION cl_khr_fp64: enable
+    #pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable
+
+    // atomic add for doubles, built on a 64-bit compare-and-swap loop
+    void atomicAdd(volatile __global double *val, const double delta){
+        union{
+            double f;
+            ulong i;
+        } old;
+
+        union{
+            double f;
+            ulong i;
+        } new;
+
+        do{
+            old.f = *val;
+            new.f = old.f + delta;
+        } while(atom_cmpxchg((volatile __global ulong *)val, old.i, new.i) != old.i);
+    }
+
+    // one work-group of 32 work-items handles one StandardWell:
+    // z1 = B * x, z2 = D^-1 * z1, then y is updated with C^T * z2
+    __kernel void add_well_contributions(__global const double *valsC,
+                                         __global const double *valsD,
+                                         __global const double *valsB,
+                                         __global const int *colsC,
+                                         __global const int *colsB,
+                                         __global const double *x,
+                                         __global double *y,
+                                         const unsigned int blnc,
+                                         const unsigned int blnr,
+                                         __global const unsigned int *rowptr,
+                                         __local double *localSum,
+                                         __local double *z1,
+                                         __local double *z2){
+        int wgId = get_group_id(0);
+        int wiId = get_local_id(0);
+        int valSize = rowptr[wgId + 1] - rowptr[wgId];   // number of blocks of this well
+        int valsPerBlock = blnc*blnr;
+        int numActiveWorkItems = (32/valsPerBlock)*valsPerBlock;
+        int numBlocksPerWarp = 32/valsPerBlock;
+        int c = wiId % blnc;           // column index inside a block
+        int r = (wiId/blnc) % blnr;    // row index inside a block
+        double temp;
+
+        // partial sums of z1 = B * x
+        localSum[wiId] = 0;
+        if(wiId < numActiveWorkItems){
+            int b = wiId/valsPerBlock + rowptr[wgId];
+            while(b < valSize + rowptr[wgId]){
+                int colIdx = colsB[b];
+                localSum[wiId] += valsB[b*blnc*blnr + r*blnc + c]*x[colIdx*blnc + c];
+                b += numBlocksPerWarp;
+            }
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // reduce the partial sums into z1
+        int stride = valsPerBlock;
+        if(wiId < stride){
+            localSum[wiId] += localSum[wiId + stride];
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        if(c == 0 && wiId < valsPerBlock){
+            for(stride = 2; stride > 0; stride /= 2){
+                localSum[wiId] += localSum[wiId + stride];
+            }
+            z1[r] = localSum[wiId];
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // z2 = D^-1 * z1
+        if(wiId < blnr){
+            temp = 0.0;
+            for(unsigned int i = 0; i < blnr; ++i){
+                temp += valsD[wgId*blnr*blnr + wiId*blnr + i]*z1[i];
+            }
+            z2[wiId] = temp;
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // update y with C^T * z2
+        if(wiId < blnc*valSize){
+            temp = 0.0;
+            int bb = wiId/blnc + rowptr[wgId];
+            int colIdx = colsC[bb];
+            for (unsigned int j = 0; j < blnr; ++j){
+                temp += valsC[bb*blnc*blnr + j*blnc + c]*z2[j];
+            }
+            atomicAdd(&y[colIdx*blnc + c], temp);
+        }
+    }
+    )";

 } // end namespace bda

diff --git a/opm/simulators/linalg/bda/openclSolverBackend.cpp b/opm/simulators/linalg/bda/openclSolverBackend.cpp
index 5bbf4bbe8..8124d9885 100644
--- a/opm/simulators/linalg/bda/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include

 #include
 #include
@@ -70,7 +71,7 @@ unsigned int openclSolverBackend<block_size>::ceilDivision(const unsigned int A,
 template <unsigned int block_size>
 double openclSolverBackend<block_size>::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out)
 {
-    const unsigned int work_group_size = 1024;
+    const unsigned int work_group_size = 256;
     const unsigned int num_work_groups = ceilDivision(N, work_group_size);
     const unsigned int total_work_items = num_work_groups * work_group_size;
     const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
@@ -98,7 +99,7 @@ double openclSolverBackend<block_size>::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out)
 template <unsigned int block_size>
 double openclSolverBackend<block_size>::norm_w(cl::Buffer in, cl::Buffer out)
 {
-    const unsigned int work_group_size = 1024;
+    const unsigned int work_group_size = 256;
     const unsigned int num_work_groups = ceilDivision(N, work_group_size);
     const unsigned int total_work_items = num_work_groups * work_group_size;
     const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
@@ -179,19 +180,14 @@ void openclSolverBackend<block_size>::spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b)
     }
 }

-
 template <unsigned int block_size>
 void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
-    float it;
     double rho, rhop, beta, alpha, omega, tmp1, tmp2;
     double norm, norm_0;

     Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);

-    wellContribs.setOpenCLQueue(queue.get());
-    wellContribs.setReordering(toOrder, true);
-
     // set r to the initial residual
     // if initial x guess is not 0, must call applyblockedscaleadd(), not implemented
     //applyblockedscaleadd(-1.0, mat, x, r);
@@ -212,6 +208,8 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
     queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &event);
     event.wait();

+    wellContribs.setReordering(toOrder, true);
+
     norm = norm_w(d_r, d_tmp);
     norm_0 = norm;

@@ -243,11 +241,9 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
         t_spmv.stop();

         // apply wellContributions
-        if (wellContribs.getNumWells() > 0) {
-            t_well.start();
-            wellContribs.apply(d_pw, d_v);
-            t_well.stop();
-        }
+        t_well.start();
+        wellContribs.apply(queue.get(), d_pw, d_v, add_well_contributions_k.get());
+        t_well.stop();

         t_rest.start();
         tmp1 = dot_w(d_rw, d_v, d_tmp);
@@ -274,11 +270,9 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
         t_spmv.stop();

         // apply wellContributions
-        if (wellContribs.getNumWells() > 0) {
-            t_well.start();
-            wellContribs.apply(d_s, d_t);
-            t_well.stop();
-        }
+        t_well.start();
+        wellContribs.apply(queue.get(), d_s, d_t, add_well_contributions_k.get());
+        t_well.stop();

         t_rest.start();
         tmp1 = dot_w(d_t, d_r, d_tmp);
@@ -314,7 +308,8 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
     }
     if (verbosity >= 4) {
         std::ostringstream out;
-        out << "openclSolver::ily_apply: " << t_prec.elapsed() << " s\n";
+        out << "openclSolver::ilu_apply: " << t_prec.elapsed() << " s\n";
+        out << "wellContributions::apply: " << t_well.elapsed() << " s\n";
         out << "openclSolver::spmv: " << t_spmv.elapsed() << " s\n";
         out << "openclSolver::rest: " << t_rest.elapsed() << " s\n";
         out << "openclSolver::total_solve: " << res.elapsed << " s\n";
@@ -324,7 +319,7 @@

 template <unsigned int block_size>
-void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) {
+void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, WellContributions& wellContribs) {
     this->N = N_;
     this->nnz = nnz_;
     this->nnzb = nnz_ / block_size / block_size;
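+    // nnzb counts nonzero *blocks*; each block stores block_size x block_size doubles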
@@ -466,6 +461,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
         source.emplace_back(std::make_pair(spmv_blocked_s, strlen(spmv_blocked_s)));
         source.emplace_back(std::make_pair(ILU_apply1_s, strlen(ILU_apply1_s)));
         source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
+        source.emplace_back(std::make_pair(add_well_contributions_s, strlen(add_well_contributions_s)));
         cl::Program program_ = cl::Program(*context, source);

         program_.build(devices);
@@ -481,7 +477,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
 #if COPY_ROW_BY_ROW
         vals_contiguous = new double[N];
 #endif
-
         mat.reset(new BlockedMatrix(Nb, nnzb, vals, cols, rows));

         d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
@@ -500,6 +495,8 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
         d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
         d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));

+        wellContribs.init(context.get());
+
         // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
         // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
@@ -511,6 +508,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
         spmv_blocked_k.reset(new cl::make_kernel(cl::Kernel(program_, "spmv_blocked")));
         ILU_apply1_k.reset(new cl::make_kernel(cl::Kernel(program_, "ILU_apply1")));
         ILU_apply2_k.reset(new cl::make_kernel(cl::Kernel(program_, "ILU_apply2")));
+        add_well_contributions_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program_, "add_well_contributions")));

         prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());

@@ -541,7 +539,7 @@ void openclSolverBackend<block_size>::finalize() {


 template <unsigned int block_size>
-void openclSolverBackend<block_size>::copy_system_to_gpu() {
+void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions& wellContribs) {
     Timer t;
     cl::Event event;

@@ -563,6 +561,8 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
     queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
     event.wait();

+    wellContribs.copyDataToGPU(queue.get());
+
     if (verbosity > 2) {
         std::ostringstream out;
         out << "openclSolver::copy_system_to_gpu(): " << t.stop() << " s";
@@ -706,7 +706,7 @@ void openclSolverBackend<block_size>::get_result(double *x) {

 template <unsigned int block_size>
 SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
     if (initialized == false) {
-        initialize(N_, nnz_, dim, vals, rows, cols);
+        initialize(N_, nnz_, dim, vals, rows, cols, wellContribs);
         if (analysis_done == false) {
             if (!analyse_matrix()) {
                 return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
@@ -716,7 +716,7 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
         if (!create_preconditioner()) {
             return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
         }
-        copy_system_to_gpu();
+        copy_system_to_gpu(wellContribs);
     } else {
         update_system(vals, b);
         if (!create_preconditioner()) {
diff --git a/opm/simulators/linalg/bda/openclSolverBackend.hpp b/opm/simulators/linalg/bda/openclSolverBackend.hpp
index 011f25d13..0cbbf937e 100644
--- a/opm/simulators/linalg/bda/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp
@@ -64,6 +64,13 @@ private:
     cl::Buffer d_tmp;      // used as tmp GPU buffer for dot() and norm()
     double *tmp = nullptr; // used as tmp CPU buffer for dot() and norm()

+    //unsigned int num_blocks, dim_, dim_wells, num_std_wells;
+    //unsigned int *h_val_pointers;
+    //int *h_Ccols, *h_Bcols;
+    //double *h_Cnnzs, *h_Dnnzs, *h_Bnnzs;
+    //cl::Buffer d_Cnnzs, d_Dnnzs, d_Bnnzs;
+    //cl::Buffer d_Ccols, d_Bcols, d_val_pointers;
+
     // shared pointers are also passed to other objects
     std::shared_ptr<cl::Context> context;
     std::shared_ptr<cl::CommandQueue> queue;
@@ -74,6 +81,7 @@ private:
     std::unique_ptr > spmv_blocked_k;
     std::shared_ptr > ILU_apply1_k;
    std::shared_ptr > ILU_apply2_k;
+    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > add_well_contributions_k;

     Preconditioner *prec = nullptr; // only supported preconditioner is BILU0
     int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
@@ -127,6 +135,8 @@ private:
     /// \param[out] b  output vector
     void spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b);

+    //void add_well_contributions_w(cl::Buffer valsC, cl::Buffer valsD, cl::Buffer valsB, cl::Buffer colsC, cl::Buffer colsB, cl::Buffer x, cl::Buffer y, cl::Buffer val_pointers);
+
     /// Solve linear system using ilu0-bicgstab
     /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res         summary of solver result
@@ -139,13 +149,13 @@ private:
     /// \param[in] vals   array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
     /// \param[in] rows   array of rowPointers, contains N/dim+1 values
     /// \param[in] cols   array of columnIndices, contains nnz values
-    void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
+    /// \param[in] wellContribs   WellContributions object; its OpenCL buffers are created in the solver's context
+    void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols, WellContributions& wellContribs);

     /// Clean memory
     void finalize();

-    /// Copy linear system to GPU
-    void copy_system_to_gpu();
+    /// Copy linear system to GPU, including the data of the WellContributions
+    void copy_system_to_gpu(WellContributions& wellContribs);

     /// Reorder the linear system so it corresponds with the coloring
     /// \param[in] vals   array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values