changed: refactor BDA well contributions

Split the well contributions into API-specific classes for CUDA and OpenCL. This is done because 1) it is cleaner, and 2) it avoids pulling OpenCL code into the CUDA classes, which leads to clashes between the NVIDIA headers and opencl.hpp. There are still too many API-specific things in the interface between the bda components to work purely through a virtual interface, so we still have to cast to the relevant implementation in various places.
2025-02-25 18:55:30 -06:00 · 2021-11-11 14:45:12 +01:00 · 2021-11-11 14:45:12 +01:00 · e25caba8ed
commit e25caba8ed
parent 352d31a1e9
13 changed files with 388 additions and 279 deletions
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@ -49,6 +49,7 @@ list (APPEND MAIN_SOURCE_FILES
  opm/simulators/linalg/FlexibleSolver6.cpp
  opm/simulators/linalg/PropertyTree.cpp
  opm/simulators/linalg/setupPropertyTree.cpp
+  opm/simulators/linalg/bda/WellContributions.cpp
  opm/simulators/utils/PartiallySupportedFlowKeywords.cpp
  opm/simulators/utils/readDeck.cpp
  opm/simulators/utils/UnsupportedFlowKeywords.cpp
@ -93,8 +94,7 @@ list (APPEND MAIN_SOURCE_FILES

 if(CUDA_FOUND)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cusparseSolverBackend.cu)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cu)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cuWellContributions.cu)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
 endif()
@ -106,6 +106,7 @@ if(OPENCL_FOUND)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclKernels.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclWellContributions.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
@ -265,6 +266,7 @@ list (APPEND PUBLIC_HEADER_FILES
  opm/simulators/linalg/bda/opencl.hpp
  opm/simulators/linalg/bda/openclKernels.hpp
  opm/simulators/linalg/bda/openclSolverBackend.hpp
+  opm/simulators/linalg/bda/openclWellContributions.hpp
  opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
  opm/simulators/linalg/bda/WellContributions.hpp
  opm/simulators/linalg/amgcpr.hh
--- a/opm/simulators/linalg/ISTLSolverEbos.hpp
+++ b/opm/simulators/linalg/ISTLSolverEbos.hpp
@ -262,18 +262,18 @@ namespace Opm
            bool use_fpga = bdaBridge->getUseFpga();
            if (use_gpu || use_fpga) {
                const std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
-                WellContributions wellContribs(accelerator_mode, useWellConn_);
-                bdaBridge->initWellContributions(wellContribs);
+                auto wellContribs = WellContributions::create(accelerator_mode, useWellConn_);
+                bdaBridge->initWellContributions(*wellContribs);

                // the WellContributions can only be applied separately with CUDA or OpenCL, not with an FPGA or amgcl
 #if HAVE_CUDA || HAVE_OPENCL
                if (!useWellConn_) {
-                    simulator_.problem().wellModel().getWellContributions(wellContribs);
+                    simulator_.problem().wellModel().getWellContributions(*wellContribs);
                }
 #endif

                // Const_cast needed since the CUDA stuff overwrites values for better matrix condition..
-                bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), *rhs_, wellContribs, result);
+                bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), *rhs_, *wellContribs, result);
                if (result.converged) {
                    // get result vector x from non-Dune backend, iff solve was successful
                    bdaBridge->get_result(x);
--- a/opm/simulators/linalg/bda/BdaBridge.cpp
+++ b/opm/simulators/linalg/bda/BdaBridge.cpp
@ -35,6 +35,7 @@

 #if HAVE_OPENCL
 #include <opm/simulators/linalg/bda/openclSolverBackend.hpp>
+#include <opm/simulators/linalg/bda/openclWellContributions.hpp>
 #endif

 #if HAVE_FPGA
@ -273,7 +274,7 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions([[
    if(accelerator_mode.compare("opencl") == 0){
 #if HAVE_OPENCL
        const auto openclBackend = static_cast<const Opm::Accelerator::openclSolverBackend<block_size>*>(backend.get());
-        wellContribs.setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get());
+        static_cast<WellContributionsOCL&>(wellContribs).setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get());
 #else
        OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
 #endif
--- a/opm/simulators/linalg/bda/WellContributions.cpp
+++ b/opm/simulators/linalg/bda/WellContributions.cpp
@ -25,178 +25,68 @@

 #include <opm/simulators/linalg/bda/WellContributions.hpp>

+#ifdef HAVE_OPENCL
+#include <opm/simulators/linalg/bda/openclWellContributions.hpp>
+#endif
+
+#ifdef HAVE_CUDA
+#include <opm/simulators/linalg/bda/cuWellContributions.hpp>
+#endif
+
 namespace Opm
 {

-#if HAVE_OPENCL
-using Opm::Accelerator::OpenclKernels;
-#endif
-
-WellContributions::WellContributions(std::string accelerator_mode, bool useWellConn){
+std::unique_ptr<WellContributions>
+WellContributions::create(const std::string& accelerator_mode, bool useWellConn)
+{
    if(accelerator_mode.compare("cusparse") == 0){
-        cuda_gpu = true;
+#if HAVE_CUDA
+    return std::make_unique<WellContributionsCuda>();
+#else
+    OPM_THROW(std::runtime_error, "Cannot initialize well contributions: CUDA is not enabled");
+#endif
    }
    else if(accelerator_mode.compare("opencl") == 0){
-        opencl_gpu = true;
+#if HAVE_OPENCL
+        return std::make_unique<WellContributionsOCL>();
+#else
+        OPM_THROW(std::runtime_error, "Cannot initialize well contributions: OpenCL is not enabled");
+#endif
    }
    else if(accelerator_mode.compare("fpga") == 0){
        // unused for FPGA, but must be defined to avoid error
+        return std::make_unique<WellContributions>();
    }
    else if(accelerator_mode.compare("amgcl") == 0){
        if (!useWellConn) {
            OPM_THROW(std::logic_error, "Error amgcl requires --matrix-add-well-contributions=true");
        }
+        return std::make_unique<WellContributions>();
    }
    else{
        OPM_THROW(std::logic_error, "Invalid accelerator mode");
    }
 }

-WellContributions::~WellContributions()
+void WellContributions::addMatrix([[maybe_unused]] MatrixType type,
+                                  [[maybe_unused]] int* colIndices,
+                                  [[maybe_unused]] double* values,
+                                  [[maybe_unused]] unsigned int val_size)
 {
-    multisegments.clear();
-
-#if HAVE_CUDA
-    if(cuda_gpu){
-        freeCudaMemory(); // should come before 'delete[] h_x'
-    }
+#if !HAVE_CUDA && !HAVE_OPENCL
+    OPM_THROW(std::logic_error, "Error cannot add StandardWell matrix on GPU because neither CUDA nor OpenCL were found by cmake");
 #endif

-#if HAVE_OPENCL
-    if(opencl_gpu){
-        if(num_ms_wells > 0){
-            delete[] h_x;
-            delete[] h_y;
-        }
-    }
-#endif
-}
-
-#if HAVE_OPENCL
-void WellContributions::setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_){
-    this->context = context_;
-    this->queue = queue_;
-}
-
-void WellContributions::setKernel(Opm::Accelerator::stdwell_apply_kernel_type *kernel_,
-                                  Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_){
-    this->kernel = kernel_;
-    this->kernel_no_reorder = kernel_no_reorder_;
-}
-
-void WellContributions::setReordering(int *h_toOrder_, bool reorder_)
-{
-    this->h_toOrder = h_toOrder_;
-    this->reorder = reorder_;
-}
-
-void WellContributions::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
-    if (reorder) {
-        OpenclKernels::apply_stdwells_reorder(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
-            d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
-    } else {
-        OpenclKernels::apply_stdwells_no_reorder(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
-            d_x, d_y, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
-    }
-}
-
-void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){
-    if(h_x == nullptr){
-        h_x = new double[N];
-        h_y = new double[N];
-    }
-
-    events.resize(2);
-    queue->enqueueReadBuffer(d_x, CL_FALSE, 0, sizeof(double) * N, h_x, nullptr, &events[0]);
-    queue->enqueueReadBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y, nullptr, &events[1]);
-    cl::WaitForEvents(events);
-    events.clear();
-
-    // actually apply MultisegmentWells
-    for (auto& well : multisegments) {
-        well->setReordering(h_toOrder, reorder);
-        well->apply(h_x, h_y);
-    }
-
-    // copy vector y from CPU to GPU
-    events.resize(1);
-    queue->enqueueWriteBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y, nullptr, &events[0]);
-    events[0].wait();
-    events.clear();
-}
-
-void WellContributions::apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
-    if(num_std_wells > 0){
-        apply_stdwells(d_x, d_y, d_toOrder);
-    }
-
-    if(num_ms_wells > 0){
-        apply_mswells(d_x, d_y);
-    }
-}
-#endif
-
-void WellContributions::addMatrix([[maybe_unused]] MatrixType type, [[maybe_unused]] int *colIndices, [[maybe_unused]] double *values, [[maybe_unused]] unsigned int val_size)
-{
    if (!allocated) {
        OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
    }

-#if HAVE_CUDA
-    if(cuda_gpu){
-        addMatrixGpu(type, colIndices, values, val_size);
-    }
-#endif
-
-#if HAVE_OPENCL
-    if(opencl_gpu){
-        switch (type) {
-        case MatrixType::C:
-            events.resize(2);
-            queue->enqueueWriteBuffer(*d_Cnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
-            queue->enqueueWriteBuffer(*d_Ccols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
-            cl::WaitForEvents(events);
-            events.clear();
-            break;
-
-        case MatrixType::D:
-            events.resize(1);
-            queue->enqueueWriteBuffer(*d_Dnnzs_ocl, CL_FALSE, sizeof(double) * num_std_wells_so_far * dim_wells * dim_wells, sizeof(double) * dim_wells * dim_wells, values, nullptr, &events[0]);
-            events[0].wait();
-            events.clear();
-            break;
-
-        case MatrixType::B:
-            events.resize(2);
-            queue->enqueueWriteBuffer(*d_Bnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
-            queue->enqueueWriteBuffer(*d_Bcols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
-            cl::WaitForEvents(events);
-            events.clear();
-
-            val_pointers[num_std_wells_so_far] = num_blocks_so_far;
-            if (num_std_wells_so_far == num_std_wells - 1) {
-                val_pointers[num_std_wells] = num_blocks;
-                events.resize(1);
-                queue->enqueueWriteBuffer(*d_val_pointers_ocl, CL_FALSE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers.data(), nullptr, &events[0]);
-                events[0].wait();
-                events.clear();
-            }
-            break;
-
-        default:
-            OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributions::addMatrix()");
-        }
-    }
-#endif
+    this->APIaddMatrix(type, colIndices, values, val_size);

    if(MatrixType::B == type) {
        num_blocks_so_far += val_size;
        num_std_wells_so_far++;
    }
-
-#if !HAVE_CUDA && !HAVE_OPENCL
-    OPM_THROW(std::logic_error, "Error cannot add StandardWell matrix on GPU because neither CUDA nor OpenCL were found by cmake");
-#endif
 }

 void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
@ -225,22 +115,7 @@ void WellContributions::alloc()
    if (num_std_wells > 0) {
        val_pointers.resize(num_std_wells+1);

-#if HAVE_CUDA
-        if(cuda_gpu){
-            allocStandardWells();
-        }
-#endif
-
-#if HAVE_OPENCL
-        if(opencl_gpu){
-            d_Cnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
-            d_Dnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
-            d_Bnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
-            d_Ccols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
-            d_Bcols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
-            d_val_pointers_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
-        }
-#endif
+        this->APIalloc();
        allocated = true;
    }
 }
--- a/opm/simulators/linalg/bda/WellContributions.hpp
+++ b/opm/simulators/linalg/bda/WellContributions.hpp
@ -20,15 +20,6 @@
 #ifndef WELLCONTRIBUTIONS_HEADER_INCLUDED
 #define WELLCONTRIBUTIONS_HEADER_INCLUDED

-#if HAVE_CUDA
-#include <cuda_runtime.h>
-#endif
-
-#if HAVE_OPENCL
-#include <opm/simulators/linalg/bda/opencl.hpp>
-#include <opm/simulators/linalg/bda/openclKernels.hpp>
-#endif
-
 #include <memory>
 #include <vector>

@ -61,6 +52,8 @@ namespace Opm
 class WellContributions
 {
 public:
+    static std::unique_ptr<WellContributions> create(const std::string& accelerator_mode, bool useWellConn);
+
 #if DUNE_VERSION_NEWER(DUNE_ISTL, 2, 7)
    using UMFPackIndex = SuiteSparse_long;
 #else
@ -73,9 +66,7 @@ public:
        B
    };

-private:
-    bool opencl_gpu = false;
-    bool cuda_gpu = false;
+protected:
    bool allocated = false;

    unsigned int N;                          // number of rows (not blockrows) in vectors x and y
@ -88,80 +79,9 @@ private:
    unsigned int num_std_wells_so_far = 0;   // keep track of where next data is written
    std::vector<unsigned int> val_pointers;    // val_pointers[wellID] == index of first block for this well in Ccols and Bcols

-    double *h_x = nullptr;
-    double *h_y = nullptr;
    std::vector<std::unique_ptr<MultisegmentWellContribution>> multisegments;

-#if HAVE_OPENCL
-    cl::Context *context;
-    cl::CommandQueue *queue;
-    Opm::Accelerator::stdwell_apply_kernel_type *kernel;
-    Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder;
-    std::vector<cl::Event> events;
-
-    std::unique_ptr<cl::Buffer> d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
-    std::unique_ptr<cl::Buffer> d_Ccols_ocl, d_Bcols_ocl;
-    std::unique_ptr<cl::Buffer> d_val_pointers_ocl;
-
-    bool reorder = false;
-    int *h_toOrder = nullptr;
-#endif
-
-#if HAVE_CUDA
-    cudaStream_t stream;
-
-    // data for StandardWells, could remain nullptrs if not used
-    double *d_Cnnzs = nullptr;
-    double *d_Dnnzs = nullptr;
-    double *d_Bnnzs = nullptr;
-    int *d_Ccols = nullptr;
-    int *d_Bcols = nullptr;
-    double *d_z1 = nullptr;
-    double *d_z2 = nullptr;
-    unsigned int *d_val_pointers = nullptr;
-
-    /// Allocate GPU memory for StandardWells
-    void allocStandardWells();
-
-    /// Free GPU memory allocated with cuda.
-    void freeCudaMemory();
-
-    /// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
-    /// \param[in] type        indicate if C, D or B is sent
-    /// \param[in] colIndices  columnindices of blocks in C or B, ignored for D
-    /// \param[in] values      array of nonzeroes
-    /// \param[in] val_size    number of blocks in C or B, ignored for D
-    void addMatrixGpu(MatrixType type, int *colIndices, double *values, unsigned int val_size);
-#endif
-
 public:
-#if HAVE_CUDA
-    /// Set a cudaStream to be used
-    /// \param[in] stream           the cudaStream that is used to launch the kernel in
-    void setCudaStream(cudaStream_t stream);
-
-    /// Apply all Wells in this object
-    /// performs y -= (C^T * (D^-1 * (B*x))) for all Wells
-    /// \param[in] d_x        vector x, must be on GPU
-    /// \param[inout] d_y     vector y, must be on GPU
-    void apply(double *d_x, double *d_y);
-#endif
-
-#if HAVE_OPENCL
-    void setKernel(Opm::Accelerator::stdwell_apply_kernel_type *kernel_,
-                   Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_);
-    void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
-
-    /// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
-    /// Those indices need to be mapped via toOrder
-    /// \param[in] toOrder    array with mappings
-    /// \param[in] reorder    whether reordering is actually used or not
-    void setReordering(int *toOrder, bool reorder);
-    void apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
-    void apply_mswells(cl::Buffer d_x, cl::Buffer d_y);
-    void apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
-#endif
-
    unsigned int getNumWells(){
        return num_std_wells + num_ms_wells;
    }
@ -173,13 +93,8 @@ public:
    /// Allocate memory for the StandardWells
    void alloc();

-    /// Create a new WellContributions
-    /// \param[in] accelerator_mode    string indicating which solverBackend is used
-    /// \param[in] useWellConn         true iff wellcontributions are added to the matrix
-    WellContributions(std::string accelerator_mode, bool useWellConn);
-
-    /// Destroy a WellContributions, and free memory
-    ~WellContributions();
+    /// Empty destructor.
+    virtual ~WellContributions() = default;

    /// Indicate how large the blocks of the StandardWell (C and B) are
    /// \param[in] dim         number of columns
@ -212,6 +127,12 @@ public:
                                         unsigned int DnumBlocks, double *Dvalues,
                                         UMFPackIndex *DcolPointers, UMFPackIndex *DrowIndices,
                                         std::vector<double> &Cvalues);
+protected:
+    //! \brief API specific allocation.
+    virtual void APIalloc() {}
+
+    /// Api specific upload of matrix.
+    virtual void APIaddMatrix(MatrixType, int*, double*, unsigned int) {}
 };
 } //namespace Opm

--- a/opm/simulators/linalg/bda/cuWellContributions.cu
+++ b/opm/simulators/linalg/bda/cuWellContributions.cu
@ -22,12 +22,13 @@
 #include <cstdlib>
 #include <cstring>

+#include "opm/simulators/linalg/bda/cuWellContributions.hpp"
+
 #include "opm/simulators/linalg/bda/cuda_header.hpp"
 #include <cuda_runtime.h>

 #include <opm/common/OpmLog/OpmLog.hpp>
 #include <opm/common/ErrorMacros.hpp>
-#include "opm/simulators/linalg/bda/WellContributions.hpp"

 namespace Opm
 {
@ -125,18 +126,8 @@ __global__ void apply_well_contributions(

 }

-void WellContributions::allocStandardWells()
+WellContributionsCuda::~WellContributionsCuda()
 {
-    cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
-    cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells);
-    cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
-    cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
-    cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
-    cudaMalloc((void**)&d_val_pointers, sizeof(unsigned int) * (num_std_wells + 1));
-    cudaCheckLastError("apply_gpu malloc failed");
-}
-
-void WellContributions::freeCudaMemory() {
    // delete data for StandardWell
    if (num_std_wells > 0) {
        cudaFree(d_Cnnzs);
@ -154,10 +145,20 @@ void WellContributions::freeCudaMemory() {
    }
 }

+/// CUDA-specific allocation hook.
+/// Invoked by WellContributions::alloc() when num_std_wells > 0; allocates the
+/// buffers for the StandardWell C, D and B block values, the C/B column
+/// indices and the per-well val_pointers array via cudaMalloc.
+void WellContributionsCuda::APIalloc()
+{
+    // C and B: num_blocks blocks of dim * dim_wells doubles each
+    cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
+    // D: one dim_wells * dim_wells block per StandardWell
+    cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells);
+    cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
+    // one column index per block for C and for B
+    cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
+    cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
+    // num_std_wells + 1 entries: one start index per well plus a closing entry
+    cudaMalloc((void**)&d_val_pointers, sizeof(unsigned int) * (num_std_wells + 1));
+    // NOTE(review): the individual cudaMalloc return codes are ignored; this single
+    // call is presumably what reports a failure of any of them - confirm in cuda_header.hpp
+    cudaCheckLastError("apply_gpu malloc failed");
+}

 // Apply the WellContributions, similar to StandardWell::apply()
 // y -= (C^T *(D^-1*(   B*x)))
-void WellContributions::apply(double *d_x, double *d_y)
+void WellContributionsCuda::apply(double *d_x, double *d_y)
 {
    // apply MultisegmentWells

@ -194,7 +195,7 @@ void WellContributions::apply(double *d_x, double *d_y)
 }


-void WellContributions::addMatrixGpu(MatrixType type, int *colIndices, double *values, unsigned int val_size)
+void WellContributionsCuda::APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size)
 {
    switch (type) {
    case MatrixType::C:
@ -219,7 +220,7 @@ void WellContributions::addMatrixGpu(MatrixType type, int *colIndices, double *v
    cudaCheckLastError("WellContributions::addMatrix() failed");
 }

-void WellContributions::setCudaStream(cudaStream_t stream_)
+void WellContributionsCuda::setCudaStream(cudaStream_t stream_)
 {
    this->stream = stream_;
    for (auto& well : multisegments) {
--- a/opm/simulators/linalg/bda/cuWellContributions.hpp
+++ b/opm/simulators/linalg/bda/cuWellContributions.hpp
@ -0,0 +1,78 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef WELLCONTRIBUTIONS_CUDA_HEADER_INCLUDED
+#define WELLCONTRIBUTIONS_CUDA_HEADER_INCLUDED
+
+#include <opm/simulators/linalg/bda/WellContributions.hpp>
+
+#include <cuda_runtime.h>
+
+#include <memory>
+#include <vector>
+
+
+namespace Opm
+{
+
+/// CUDA implementation of WellContributions.
+/// Holds the CUDA stream and the raw pointers to the StandardWell buffers, and
+/// overrides the APIalloc()/APIaddMatrix() hooks of the base class.
+/// Callers holding a WellContributions& must static_cast to this type to reach
+/// setCudaStream() and apply(), since neither is part of the base interface.
+class WellContributionsCuda : public WellContributions
+{
+public:
+    /// Frees the buffers allocated with cudaMalloc for the StandardWells.
+    ~WellContributionsCuda() override;
+
+    /// Set a cudaStream to be used
+    /// \param[in] stream           the cudaStream that is used to launch the kernel in
+    void setCudaStream(cudaStream_t stream);
+
+    /// Apply all Wells in this object
+    /// performs y -= (C^T * (D^-1 * (B*x))) for all Wells
+    /// \param[in] d_x        vector x, must be on GPU
+    /// \param[inout] d_y     vector y, must be on GPU
+    void apply(double *d_x, double *d_y);
+
+protected:
+    /// Allocate memory for the StandardWells
+    void APIalloc() override;
+
+    /// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
+    /// \param[in] type        indicate if C, D or B is sent
+    /// \param[in] colIndices  columnindices of blocks in C or B, ignored for D
+    /// \param[in] values      array of nonzeroes
+    /// \param[in] val_size    number of blocks in C or B, ignored for D
+    void APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size) override;
+
+    // NOTE(review): has no default initializer; it is only assigned in setCudaStream()
+    cudaStream_t stream;
+
+    // data for StandardWells, could remain nullptrs if not used
+    double *d_Cnnzs = nullptr;
+    double *d_Dnnzs = nullptr;
+    double *d_Bnnzs = nullptr;
+    int *d_Ccols = nullptr;
+    int *d_Bcols = nullptr;
+    double *d_z1 = nullptr;
+    double *d_z2 = nullptr;
+    unsigned int *d_val_pointers = nullptr;
+    // NOTE(review): presumably host-side copies of x and y used when applying
+    // MultisegmentWells on the CPU - confirm against cuWellContributions.cu
+    double* h_x = nullptr;
+    double* h_y = nullptr;
+
+};
+
+} //namespace Opm
+
+#endif
--- a/opm/simulators/linalg/bda/cusparseSolverBackend.cu
+++ b/opm/simulators/linalg/bda/cusparseSolverBackend.cu
@ -26,6 +26,7 @@
 #include <dune/common/timer.hh>

 #include <opm/simulators/linalg/bda/cusparseSolverBackend.hpp>
+#include <opm/simulators/linalg/bda/cuWellContributions.hpp>
 #include <opm/simulators/linalg/bda/BdaResult.hpp>
 #include <opm/simulators/linalg/bda/cuda_header.hpp>

@ -72,7 +73,7 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
    float it;

    if (wellContribs.getNumWells() > 0) {
-        wellContribs.setCudaStream(stream);
+        static_cast<WellContributionsCuda&>(wellContribs).setCudaStream(stream);
    }

    cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
@ -116,7 +117,7 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon

        // apply wellContributions
        if (wellContribs.getNumWells() > 0) {
-            wellContribs.apply(d_pw, d_v);
+            static_cast<WellContributionsCuda&>(wellContribs).apply(d_pw, d_v);
        }

        cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
@ -147,7 +148,7 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon

        // apply wellContributions
        if (wellContribs.getNumWells() > 0) {
-            wellContribs.apply(d_s, d_t);
+            static_cast<WellContributionsCuda&>(wellContribs).apply(d_s, d_t);
        }

        cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
--- a/opm/simulators/linalg/bda/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp
@ -26,6 +26,7 @@
 #include <dune/common/timer.hh>

 #include <opm/simulators/linalg/bda/openclSolverBackend.hpp>
+#include <opm/simulators/linalg/bda/openclWellContributions.hpp>

 #include <opm/simulators/linalg/bda/BdaResult.hpp>
 #include <opm/simulators/linalg/bda/Reorder.hpp>
@ -262,7 +263,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
        // apply wellContributions
        t_well.start();
        if(wellContribs.getNumWells() > 0){
-            wellContribs.apply(d_pw, d_v, d_toOrder);
+            static_cast<WellContributionsOCL&>(wellContribs).apply(d_pw, d_v, d_toOrder);
        }
        t_well.stop();

@ -293,7 +294,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
        // apply wellContributions
        t_well.start();
        if(wellContribs.getNumWells() > 0){
-            wellContribs.apply(d_s, d_t, d_toOrder);
+            static_cast<WellContributionsOCL&>(wellContribs).apply(d_s, d_t, d_toOrder);
        }
        t_well.stop();

@ -520,10 +521,10 @@ void openclSolverBackend<block_size>::update_system(double *vals, double *b, Wel
    mat->nnzValues = vals;
    if (opencl_ilu_reorder != ILUReorder::NONE) {
        reorderBlockedVectorByPattern<block_size>(mat->Nb, b, fromOrder, rb);
-        wellContribs.setReordering(toOrder, true);
+        static_cast<WellContributionsOCL&>(wellContribs).setReordering(toOrder, true);
    } else {
        rb = b;
-        wellContribs.setReordering(nullptr, false);
+        static_cast<WellContributionsOCL&>(wellContribs).setReordering(nullptr, false);
    }

    if (verbosity > 2) {
--- a/opm/simulators/linalg/bda/openclWellContributions.cpp
+++ b/opm/simulators/linalg/bda/openclWellContributions.cpp
@ -0,0 +1,154 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h> // CMake
+
+#include <opm/simulators/linalg/bda/openclWellContributions.hpp>
+
+#include <cstdlib>
+#include <cstring>
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+
+
+namespace Opm
+{
+
+using Accelerator::OpenclKernels;
+
+/// Remember the OpenCL context and command queue to use for this object.
+/// Only the pointers are stored. APIalloc() dereferences the context and the
+/// transfer functions dereference the queue, so this must be called before
+/// alloc(), addMatrix() or apply().
+/// \param[in] context_   OpenCL context in which the buffers are created
+/// \param[in] queue_     command queue on which reads and writes are enqueued
+void WellContributionsOCL::setOpenCLEnv(cl::Context* context_, cl::CommandQueue* queue_) {
+    this->context = context_;
+    this->queue = queue_;
+ }
+
+/// Store pointers to the two StandardWell apply kernels (with and without reordering).
+/// NOTE(review): no other function in this file reads these pointers;
+/// apply_stdwells() goes through OpenclKernels instead - confirm whether this
+/// setter is still needed.
+/// \param[in] kernel_              kernel used when reordering is active
+/// \param[in] kernel_no_reorder_   kernel used when no reordering is active
+void WellContributionsOCL::setKernel(Accelerator::stdwell_apply_kernel_type* kernel_,
+                                     Accelerator::stdwell_apply_no_reorder_kernel_type* kernel_no_reorder_) {
+    this->kernel = kernel_;
+    this->kernel_no_reorder = kernel_no_reorder_;
+}
+
+/// Record the reordering state used by the apply functions.
+/// The flag selects the kernel variant in apply_stdwells(); both values are
+/// forwarded to every MultisegmentWell in apply_mswells().
+/// \param[in] h_toOrder_   host array with the mapping, nullptr when reordering is off
+/// \param[in] reorder_     whether reordering is actually used or not
+void WellContributionsOCL::setReordering(int* h_toOrder_, bool reorder_)
+{
+    this->h_toOrder = h_toOrder_;
+    this->reorder = reorder_;
+}
+
+/// Apply all StandardWells by delegating to the matching OpenclKernels routine.
+/// \param[in] d_x          vector x
+/// \param[inout] d_y       vector y
+/// \param[in] d_toOrder    reordering map, only passed on when reordering is active
+void WellContributionsOCL::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
+    // the two variants differ only in whether d_toOrder is handed to the kernel
+    if (reorder) {
+        OpenclKernels::apply_stdwells_reorder(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
+            d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
+    } else {
+        OpenclKernels::apply_stdwells_no_reorder(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
+            d_x, d_y, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
+    }
+}
+
+/// Apply all MultisegmentWells on the host.
+/// Reads x and y back into host vectors, lets every MultisegmentWell operate
+/// on them, then writes the updated y back into d_y.
+/// \param[in] d_x        vector x
+/// \param[inout] d_y     vector y
+void WellContributionsOCL::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){
+    // size the host vectors on first use; later calls reuse them
+    if (h_x.empty()) {
+        h_x.resize(N);
+        h_y.resize(N);
+    }
+
+    // both reads are enqueued non-blocking (CL_FALSE) and awaited together
+    events.resize(2);
+    queue->enqueueReadBuffer(d_x, CL_FALSE, 0, sizeof(double) * N, h_x.data(), nullptr, &events[0]);
+    queue->enqueueReadBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y.data(), nullptr, &events[1]);
+    cl::WaitForEvents(events);
+    events.clear();
+
+    // actually apply MultisegmentWells
+    for (auto& well : multisegments) {
+        // pass the current reordering state to each well before applying it
+        well->setReordering(h_toOrder, reorder);
+        well->apply(h_x.data(), h_y.data());
+    }
+
+    // copy vector y from CPU to GPU
+    events.resize(1);
+    queue->enqueueWriteBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y.data(), nullptr, &events[0]);
+    // wait for the write to finish before returning
+    events[0].wait();
+    events.clear();
+}
+
+/// Apply all wells in this object: StandardWells first, then MultisegmentWells.
+/// Each part is skipped when there are no wells of that kind.
+/// \param[in] d_x          vector x
+/// \param[inout] d_y       vector y
+/// \param[in] d_toOrder    reordering map, forwarded to apply_stdwells()
+void WellContributionsOCL::apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
+    if(num_std_wells > 0){
+        apply_stdwells(d_x, d_y, d_toOrder);
+    }
+
+    if(num_ms_wells > 0){
+        apply_mswells(d_x, d_y);
+    }
+}
+
+/// OpenCL-specific upload of one StandardWell matrix (C, D or B).
+/// Called from WellContributions::addMatrix(). Write offsets come from
+/// num_blocks_so_far and num_std_wells_so_far, which the base class advances
+/// only after a B matrix has been added.
+/// \param[in] type        indicate if C, D or B is sent
+/// \param[in] colIndices  column indices of the blocks in C or B, not used for D
+/// \param[in] values      array of nonzeroes
+/// \param[in] val_size    number of blocks in C or B, not used for D
+void WellContributionsOCL::APIaddMatrix(MatrixType type,
+                                        int* colIndices,
+                                        double* values,
+                                        unsigned int val_size)
+{
+    // NOTE(review): WellContributions::addMatrix() performs this same check before calling here
+    if (!allocated) {
+        OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
+    }
+
+    switch (type) {
+    case MatrixType::C:
+        // values and column indices are written at the block offset of the current well
+        events.resize(2);
+        queue->enqueueWriteBuffer(*d_Cnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
+        queue->enqueueWriteBuffer(*d_Ccols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
+        cl::WaitForEvents(events);
+        events.clear();
+        break;
+
+    case MatrixType::D:
+        // exactly one dim_wells * dim_wells block per well, indexed by well number
+        events.resize(1);
+        queue->enqueueWriteBuffer(*d_Dnnzs_ocl, CL_FALSE, sizeof(double) * num_std_wells_so_far * dim_wells * dim_wells, sizeof(double) * dim_wells * dim_wells, values, nullptr, &events[0]);
+        events[0].wait();
+        events.clear();
+        break;
+
+    case MatrixType::B:
+        events.resize(2);
+        queue->enqueueWriteBuffer(*d_Bnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
+        queue->enqueueWriteBuffer(*d_Bcols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
+        cl::WaitForEvents(events);
+        events.clear();
+
+        // record the index of the first block belonging to this well
+        val_pointers[num_std_wells_so_far] = num_blocks_so_far;
+        // on the last well, close the array with the total block count and upload it
+        if (num_std_wells_so_far == num_std_wells - 1) {
+            val_pointers[num_std_wells] = num_blocks;
+            events.resize(1);
+            queue->enqueueWriteBuffer(*d_val_pointers_ocl, CL_FALSE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers.data(), nullptr, &events[0]);
+            events[0].wait();
+            events.clear();
+        }
+        break;
+
+    default:
+        OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributionsOCL::addMatrix()");
+    }
+}
+
+/// OpenCL-specific allocation hook.
+/// Invoked by WellContributions::alloc() when num_std_wells > 0; creates the
+/// read/write buffers for the StandardWell data in the context set through
+/// setOpenCLEnv(), which therefore has to be called first.
+void WellContributionsOCL::APIalloc()
+{
+    // C and B: num_blocks blocks of dim * dim_wells doubles each
+    d_Cnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
+    // D: one dim_wells * dim_wells block per StandardWell
+    d_Dnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
+    d_Bnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
+    // one column index per block for C and for B
+    d_Ccols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
+    d_Bcols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
+    // num_std_wells + 1 entries: one start index per well plus a closing entry
+    d_val_pointers_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
+}
+
+} //namespace Opm
--- a/opm/simulators/linalg/bda/openclWellContributions.hpp
+++ b/opm/simulators/linalg/bda/openclWellContributions.hpp
@ -0,0 +1,75 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef WELLCONTRIBUTIONS_OPENCL_HEADER_INCLUDED
+#define WELLCONTRIBUTIONS_OPENCL_HEADER_INCLUDED
+
+#include <opm/simulators/linalg/bda/WellContributions.hpp>
+
+#include <opm/simulators/linalg/bda/opencl.hpp>
+#include <opm/simulators/linalg/bda/openclKernels.hpp>
+
+#include <memory>
+#include <vector>
+
+
+namespace Opm
+{
+
+/// OpenCL-specific implementation of WellContributions.
+/// Holds non-owning pointers to the OpenCL context, command queue and kernels,
+/// and owns the device buffers for the standard-well matrices.
+class WellContributionsOCL : public WellContributions
+{
+public:
+    /// Set the kernels for the standard wells.
+    /// \param[in] kernel_               kernel used when reordering is active
+    /// \param[in] kernel_no_reorder_    kernel used when no reordering is done
+    /// NOTE(review): presumably the pointers are only stored, not owned - confirm in the .cpp
+    void setKernel(Opm::Accelerator::stdwell_apply_kernel_type *kernel_,
+                   Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_);
+
+    /// Set the OpenCL context and command queue used for buffer creation and transfers.
+    /// \param[in] context_    OpenCL context
+    /// \param[in] queue_      OpenCL command queue
+    void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
+
+    /// Since the rows of the matrix are reordered, the column indices of the matrix data are incorrect.
+    /// Those indices need to be mapped via toOrder
+    /// \param[in] toOrder    array with mappings
+    /// \param[in] reorder    whether reordering is actually used or not
+    void setReordering(int* toOrder, bool reorder);
+
+    /// Apply the standard wells to the device vectors.
+    /// \param[in]    d_x          device buffer with the input vector x
+    /// \param[inout] d_y          device buffer with the vector y that is updated
+    /// \param[in]    d_toOrder    device buffer with the reordering mapping
+    void apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
+
+    /// Apply the multisegment wells.
+    /// \param[in]    d_x    device buffer with the input vector x
+    /// \param[inout] d_y    device buffer with the vector y that is updated
+    void apply_mswells(cl::Buffer d_x, cl::Buffer d_y);
+
+    /// Apply the well contributions.
+    /// NOTE(review): presumably dispatches to apply_stdwells() and apply_mswells() - confirm in the .cpp
+    void apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
+
+protected:
+    /// Allocate device memory for the StandardWells
+    void APIalloc() override;
+
+    /// Copy the data of one standard-well matrix to the device buffers.
+    /// \param[in] type          which matrix is added
+    /// \param[in] colIndices    column indices of the blocks
+    /// \param[in] values        nonzero values of the blocks
+    /// \param[in] val_size      number of blocks
+    void APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size) override;
+
+    // Non-owning pointers, set via setOpenCLEnv() / setKernel().
+    // Initialized to nullptr so they are never indeterminate before the setters are called.
+    cl::Context* context = nullptr;
+    cl::CommandQueue* queue = nullptr;
+    Opm::Accelerator::stdwell_apply_kernel_type* kernel = nullptr;
+    Opm::Accelerator::stdwell_apply_no_reorder_kernel_type* kernel_no_reorder = nullptr;
+    std::vector<cl::Event> events;    // events used to wait for enqueued transfers
+
+    // Device buffers, created in APIalloc()
+    std::unique_ptr<cl::Buffer> d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
+    std::unique_ptr<cl::Buffer> d_Ccols_ocl, d_Bcols_ocl;
+    std::unique_ptr<cl::Buffer> d_val_pointers_ocl;
+
+    bool reorder = false;
+    int *h_toOrder = nullptr;
+    // NOTE(review): presumably host-side copies of x and y - confirm usage in the .cpp
+    std::vector<double> h_x;
+    std::vector<double> h_y;
+};
+
+} //namespace Opm
+
+#endif
--- a/tests/test_cusparseSolver.cpp
+++ b/tests/test_cusparseSolver.cpp
@ -81,7 +81,7 @@ testCusparseSolver(const boost::property_tree::ptree& prm, const std::string& ma
    Dune::InverseOperatorResult result;

    Vector x(rhs.size());
-    Opm::WellContributions wellContribs("cusparse", false);
+    auto wellContribs = Opm::WellContributions::create("cusparse", false);
    std::unique_ptr<Opm::BdaBridge<Matrix, Vector, bz> > bridge;
    try {
        bridge = std::make_unique<Opm::BdaBridge<Matrix, Vector, bz> >(gpu_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder);
@ -89,7 +89,7 @@ testCusparseSolver(const boost::property_tree::ptree& prm, const std::string& ma
        BOOST_WARN_MESSAGE(true, error.what());
        throw DeviceInitException(error.what());
    }
-    bridge->solve_system(&matrix, rhs, wellContribs, result);
+    bridge->solve_system(&matrix, rhs, *wellContribs, result);
    bridge->get_result(x);

    return x;
--- a/tests/test_openclSolver.cpp
+++ b/tests/test_openclSolver.cpp
@ -80,7 +80,7 @@ testOpenclSolver(const boost::property_tree::ptree& prm, const std::string& matr
    Dune::InverseOperatorResult result;

    Vector x(rhs.size());
-    Opm::WellContributions wellContribs("opencl", false);
+    auto wellContribs = Opm::WellContributions::create("opencl", false);
    std::unique_ptr<Opm::BdaBridge<Matrix, Vector, bz> > bridge;
    try {
        bridge = std::make_unique<Opm::BdaBridge<Matrix, Vector, bz> >(gpu_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder);
@ -88,7 +88,7 @@ testOpenclSolver(const boost::property_tree::ptree& prm, const std::string& matr
        BOOST_WARN_MESSAGE(true, error.what());
        throw PlatformInitException(error.what());
    }
-    bridge->solve_system(&matrix, rhs, wellContribs, result);
+    bridge->solve_system(&matrix, rhs, *wellContribs, result);
    bridge->get_result(x);

    return x;