Merge pull request #3691 from akva2/bda_wellcontrib_refactor

Refactor BDA well contributions
2025-02-25 18:55:30 -06:00 · 2021-11-15 10:26:44 +01:00 · 2021-11-15 10:26:44 +01:00 · afc5ed751f
commit afc5ed751f
parent ec56782288 069882f3e4
14 changed files with 399 additions and 291 deletions
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@ -49,6 +49,7 @@ list (APPEND MAIN_SOURCE_FILES
  opm/simulators/linalg/FlexibleSolver6.cpp
  opm/simulators/linalg/PropertyTree.cpp
  opm/simulators/linalg/setupPropertyTree.cpp
+  opm/simulators/linalg/bda/WellContributions.cpp
  opm/simulators/utils/PartiallySupportedFlowKeywords.cpp
  opm/simulators/utils/readDeck.cpp
  opm/simulators/utils/UnsupportedFlowKeywords.cpp
@ -93,8 +94,7 @@ list (APPEND MAIN_SOURCE_FILES

 if(CUDA_FOUND)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cusparseSolverBackend.cu)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cu)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cuWellContributions.cu)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
 endif()
@ -106,6 +106,7 @@ if(OPENCL_FOUND)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclKernels.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclWellContributions.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
@ -265,6 +266,7 @@ list (APPEND PUBLIC_HEADER_FILES
  opm/simulators/linalg/bda/opencl.hpp
  opm/simulators/linalg/bda/openclKernels.hpp
  opm/simulators/linalg/bda/openclSolverBackend.hpp
+  opm/simulators/linalg/bda/openclWellContributions.hpp
  opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
  opm/simulators/linalg/bda/WellContributions.hpp
  opm/simulators/linalg/amgcpr.hh
--- a/opm/simulators/linalg/ISTLSolverEbos.hpp
+++ b/opm/simulators/linalg/ISTLSolverEbos.hpp
@ -262,18 +262,18 @@ namespace Opm
            bool use_fpga = bdaBridge->getUseFpga();
            if (use_gpu || use_fpga) {
                const std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
-                WellContributions wellContribs(accelerator_mode, useWellConn_);
-                bdaBridge->initWellContributions(wellContribs);
+                auto wellContribs = WellContributions::create(accelerator_mode, useWellConn_);
+                bdaBridge->initWellContributions(*wellContribs);

                // the WellContributions can only be applied separately with CUDA or OpenCL, not with an FPGA or amgcl
 #if HAVE_CUDA || HAVE_OPENCL
                if (!useWellConn_) {
-                    simulator_.problem().wellModel().getWellContributions(wellContribs);
+                    simulator_.problem().wellModel().getWellContributions(*wellContribs);
                }
 #endif

                // Const_cast needed since the CUDA stuff overwrites values for better matrix condition..
-                bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), *rhs_, wellContribs, result);
+                bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), *rhs_, *wellContribs, result);
                if (result.converged) {
                    // get result vector x from non-Dune backend, iff solve was successful
                    bdaBridge->get_result(x);
--- a/opm/simulators/linalg/bda/BdaBridge.cpp
+++ b/opm/simulators/linalg/bda/BdaBridge.cpp
@ -19,6 +19,9 @@

 #include <config.h>

+#include "dune/istl/bcrsmatrix.hh"
+#include <opm/simulators/linalg/matrixblock.hh>
+
 #include <opm/common/OpmLog/OpmLog.hpp>
 #include <opm/common/ErrorMacros.hpp>
 #include <opm/material/common/Unused.hpp>
@ -32,6 +35,7 @@

 #if HAVE_OPENCL
 #include <opm/simulators/linalg/bda/openclSolverBackend.hpp>
+#include <opm/simulators/linalg/bda/openclWellContributions.hpp>
 #endif

 #if HAVE_FPGA
@ -270,7 +274,7 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions([[
    if(accelerator_mode.compare("opencl") == 0){
 #if HAVE_OPENCL
        const auto openclBackend = static_cast<const Opm::Accelerator::openclSolverBackend<block_size>*>(backend.get());
-        wellContribs.setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get());
+        static_cast<WellContributionsOCL&>(wellContribs).setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get());
 #else
        OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
 #endif
--- a/opm/simulators/linalg/bda/BdaBridge.hpp
+++ b/opm/simulators/linalg/bda/BdaBridge.hpp
@ -22,17 +22,9 @@

 #include "dune/istl/solver.hh" // for struct InverseOperatorResult

-#include "dune/istl/bcrsmatrix.hh"
-#include <opm/simulators/linalg/matrixblock.hh>
-
 #include <opm/simulators/linalg/bda/BdaSolver.hpp>
 #include <opm/simulators/linalg/bda/ILUReorder.hpp>

-
-#if HAVE_FPGA
-#include <opm/simulators/linalg/bda/FPGASolverBackend.hpp>
-#endif
-
 namespace Opm
 {

--- a/opm/simulators/linalg/bda/WellContributions.cpp
+++ b/opm/simulators/linalg/bda/WellContributions.cpp
@ -25,178 +25,68 @@

 #include <opm/simulators/linalg/bda/WellContributions.hpp>

+#ifdef HAVE_OPENCL
+#include <opm/simulators/linalg/bda/openclWellContributions.hpp>
+#endif
+
+#ifdef HAVE_CUDA
+#include <opm/simulators/linalg/bda/cuWellContributions.hpp>
+#endif
+
 namespace Opm
 {

-#if HAVE_OPENCL
-using Opm::Accelerator::OpenclKernels;
-#endif
-
-WellContributions::WellContributions(std::string accelerator_mode, bool useWellConn){
+std::unique_ptr<WellContributions>
+WellContributions::create(const std::string& accelerator_mode, bool useWellConn)
+{
    if(accelerator_mode.compare("cusparse") == 0){
-        cuda_gpu = true;
+#if HAVE_CUDA
+    return std::make_unique<WellContributionsCuda>();
+#else
+    OPM_THROW(std::runtime_error, "Cannot initialize well contributions: CUDA is not enabled");
+#endif
    }
    else if(accelerator_mode.compare("opencl") == 0){
-        opencl_gpu = true;
+#if HAVE_OPENCL
+        return std::make_unique<WellContributionsOCL>();
+#else
+        OPM_THROW(std::runtime_error, "Cannot initialize well contributions: OpenCL is not enabled");
+#endif
    }
    else if(accelerator_mode.compare("fpga") == 0){
        // unused for FPGA, but must be defined to avoid error
+        return std::make_unique<WellContributions>();
    }
    else if(accelerator_mode.compare("amgcl") == 0){
        if (!useWellConn) {
            OPM_THROW(std::logic_error, "Error amgcl requires --matrix-add-well-contributions=true");
        }
+        return std::make_unique<WellContributions>();
    }
    else{
        OPM_THROW(std::logic_error, "Invalid accelerator mode");
    }
 }

-WellContributions::~WellContributions()
+void WellContributions::addMatrix([[maybe_unused]] MatrixType type,
+                                  [[maybe_unused]] int* colIndices,
+                                  [[maybe_unused]] double* values,
+                                  [[maybe_unused]] unsigned int val_size)
 {
-    multisegments.clear();
-
-#if HAVE_CUDA
-    if(cuda_gpu){
-        freeCudaMemory(); // should come before 'delete[] h_x'
-    }
+#if !HAVE_CUDA && !HAVE_OPENCL
+    OPM_THROW(std::logic_error, "Error cannot add StandardWell matrix on GPU because neither CUDA nor OpenCL were found by cmake");
 #endif

-#if HAVE_OPENCL
-    if(opencl_gpu){
-        if(num_ms_wells > 0){
-            delete[] h_x;
-            delete[] h_y;
-        }
-    }
-#endif
-}
-
-#if HAVE_OPENCL
-void WellContributions::setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_){
-    this->context = context_;
-    this->queue = queue_;
-}
-
-void WellContributions::setKernel(Opm::Accelerator::stdwell_apply_kernel_type *kernel_,
-                                  Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_){
-    this->kernel = kernel_;
-    this->kernel_no_reorder = kernel_no_reorder_;
-}
-
-void WellContributions::setReordering(int *h_toOrder_, bool reorder_)
-{
-    this->h_toOrder = h_toOrder_;
-    this->reorder = reorder_;
-}
-
-void WellContributions::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
-    if (reorder) {
-        OpenclKernels::apply_stdwells_reorder(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
-            d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
-    } else {
-        OpenclKernels::apply_stdwells_no_reorder(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
-            d_x, d_y, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
-    }
-}
-
-void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){
-    if(h_x == nullptr){
-        h_x = new double[N];
-        h_y = new double[N];
-    }
-
-    events.resize(2);
-    queue->enqueueReadBuffer(d_x, CL_FALSE, 0, sizeof(double) * N, h_x, nullptr, &events[0]);
-    queue->enqueueReadBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y, nullptr, &events[1]);
-    cl::WaitForEvents(events);
-    events.clear();
-
-    // actually apply MultisegmentWells
-    for (auto& well : multisegments) {
-        well->setReordering(h_toOrder, reorder);
-        well->apply(h_x, h_y);
-    }
-
-    // copy vector y from CPU to GPU
-    events.resize(1);
-    queue->enqueueWriteBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y, nullptr, &events[0]);
-    events[0].wait();
-    events.clear();
-}
-
-void WellContributions::apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
-    if(num_std_wells > 0){
-        apply_stdwells(d_x, d_y, d_toOrder);
-    }
-
-    if(num_ms_wells > 0){
-        apply_mswells(d_x, d_y);
-    }
-}
-#endif
-
-void WellContributions::addMatrix([[maybe_unused]] MatrixType type, [[maybe_unused]] int *colIndices, [[maybe_unused]] double *values, [[maybe_unused]] unsigned int val_size)
-{
    if (!allocated) {
        OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
    }

-#if HAVE_CUDA
-    if(cuda_gpu){
-        addMatrixGpu(type, colIndices, values, val_size);
-    }
-#endif
-
-#if HAVE_OPENCL
-    if(opencl_gpu){
-        switch (type) {
-        case MatrixType::C:
-            events.resize(2);
-            queue->enqueueWriteBuffer(*d_Cnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
-            queue->enqueueWriteBuffer(*d_Ccols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
-            cl::WaitForEvents(events);
-            events.clear();
-            break;
-
-        case MatrixType::D:
-            events.resize(1);
-            queue->enqueueWriteBuffer(*d_Dnnzs_ocl, CL_FALSE, sizeof(double) * num_std_wells_so_far * dim_wells * dim_wells, sizeof(double) * dim_wells * dim_wells, values, nullptr, &events[0]);
-            events[0].wait();
-            events.clear();
-            break;
-
-        case MatrixType::B:
-            events.resize(2);
-            queue->enqueueWriteBuffer(*d_Bnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
-            queue->enqueueWriteBuffer(*d_Bcols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
-            cl::WaitForEvents(events);
-            events.clear();
-
-            val_pointers[num_std_wells_so_far] = num_blocks_so_far;
-            if (num_std_wells_so_far == num_std_wells - 1) {
-                val_pointers[num_std_wells] = num_blocks;
-                events.resize(1);
-                queue->enqueueWriteBuffer(*d_val_pointers_ocl, CL_FALSE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers.data(), nullptr, &events[0]);
-                events[0].wait();
-                events.clear();
-            }
-            break;
-
-        default:
-            OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributions::addMatrix()");
-        }
-    }
-#endif
+    this->APIaddMatrix(type, colIndices, values, val_size);

    if(MatrixType::B == type) {
        num_blocks_so_far += val_size;
        num_std_wells_so_far++;
    }
-
-#if !HAVE_CUDA && !HAVE_OPENCL
-    OPM_THROW(std::logic_error, "Error cannot add StandardWell matrix on GPU because neither CUDA nor OpenCL were found by cmake");
-#endif
 }

 void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
@ -225,22 +115,7 @@ void WellContributions::alloc()
    if (num_std_wells > 0) {
        val_pointers.resize(num_std_wells+1);

-#if HAVE_CUDA
-        if(cuda_gpu){
-            allocStandardWells();
-        }
-#endif
-
-#if HAVE_OPENCL
-        if(opencl_gpu){
-            d_Cnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
-            d_Dnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
-            d_Bnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
-            d_Ccols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
-            d_Bcols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
-            d_val_pointers_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
-        }
-#endif
+        this->APIalloc();
        allocated = true;
    }
 }
--- a/opm/simulators/linalg/bda/WellContributions.hpp
+++ b/opm/simulators/linalg/bda/WellContributions.hpp
@ -20,15 +20,6 @@
 #ifndef WELLCONTRIBUTIONS_HEADER_INCLUDED
 #define WELLCONTRIBUTIONS_HEADER_INCLUDED

-#if HAVE_CUDA
-#include <cuda_runtime.h>
-#endif
-
-#if HAVE_OPENCL
-#include <opm/simulators/linalg/bda/opencl.hpp>
-#include <opm/simulators/linalg/bda/openclKernels.hpp>
-#endif
-
 #include <memory>
 #include <vector>

@ -61,6 +52,8 @@ namespace Opm
 class WellContributions
 {
 public:
+    static std::unique_ptr<WellContributions> create(const std::string& accelerator_mode, bool useWellConn);
+
 #if DUNE_VERSION_NEWER(DUNE_ISTL, 2, 7)
    using UMFPackIndex = SuiteSparse_long;
 #else
@ -73,9 +66,7 @@ public:
        B
    };

-private:
-    bool opencl_gpu = false;
-    bool cuda_gpu = false;
+protected:
    bool allocated = false;

    unsigned int N;                          // number of rows (not blockrows) in vectors x and y
@ -88,80 +79,9 @@ private:
    unsigned int num_std_wells_so_far = 0;   // keep track of where next data is written
    std::vector<unsigned int> val_pointers;    // val_pointers[wellID] == index of first block for this well in Ccols and Bcols

-    double *h_x = nullptr;
-    double *h_y = nullptr;
    std::vector<std::unique_ptr<MultisegmentWellContribution>> multisegments;

-#if HAVE_OPENCL
-    cl::Context *context;
-    cl::CommandQueue *queue;
-    Opm::Accelerator::stdwell_apply_kernel_type *kernel;
-    Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder;
-    std::vector<cl::Event> events;
-
-    std::unique_ptr<cl::Buffer> d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
-    std::unique_ptr<cl::Buffer> d_Ccols_ocl, d_Bcols_ocl;
-    std::unique_ptr<cl::Buffer> d_val_pointers_ocl;
-
-    bool reorder = false;
-    int *h_toOrder = nullptr;
-#endif
-
-#if HAVE_CUDA
-    cudaStream_t stream;
-
-    // data for StandardWells, could remain nullptrs if not used
-    double *d_Cnnzs = nullptr;
-    double *d_Dnnzs = nullptr;
-    double *d_Bnnzs = nullptr;
-    int *d_Ccols = nullptr;
-    int *d_Bcols = nullptr;
-    double *d_z1 = nullptr;
-    double *d_z2 = nullptr;
-    unsigned int *d_val_pointers = nullptr;
-
-    /// Allocate GPU memory for StandardWells
-    void allocStandardWells();
-
-    /// Free GPU memory allocated with cuda.
-    void freeCudaMemory();
-
-    /// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
-    /// \param[in] type        indicate if C, D or B is sent
-    /// \param[in] colIndices  columnindices of blocks in C or B, ignored for D
-    /// \param[in] values      array of nonzeroes
-    /// \param[in] val_size    number of blocks in C or B, ignored for D
-    void addMatrixGpu(MatrixType type, int *colIndices, double *values, unsigned int val_size);
-#endif
-
 public:
-#if HAVE_CUDA
-    /// Set a cudaStream to be used
-    /// \param[in] stream           the cudaStream that is used to launch the kernel in
-    void setCudaStream(cudaStream_t stream);
-
-    /// Apply all Wells in this object
-    /// performs y -= (C^T * (D^-1 * (B*x))) for all Wells
-    /// \param[in] d_x        vector x, must be on GPU
-    /// \param[inout] d_y     vector y, must be on GPU
-    void apply(double *d_x, double *d_y);
-#endif
-
-#if HAVE_OPENCL
-    void setKernel(Opm::Accelerator::stdwell_apply_kernel_type *kernel_,
-                   Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_);
-    void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
-
-    /// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
-    /// Those indices need to be mapped via toOrder
-    /// \param[in] toOrder    array with mappings
-    /// \param[in] reorder    whether reordering is actually used or not
-    void setReordering(int *toOrder, bool reorder);
-    void apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
-    void apply_mswells(cl::Buffer d_x, cl::Buffer d_y);
-    void apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
-#endif
-
    unsigned int getNumWells(){
        return num_std_wells + num_ms_wells;
    }
@ -173,13 +93,8 @@ public:
    /// Allocate memory for the StandardWells
    void alloc();

-    /// Create a new WellContributions
-    /// \param[in] accelerator_mode    string indicating which solverBackend is used
-    /// \param[in] useWellConn         true iff wellcontributions are added to the matrix
-    WellContributions(std::string accelerator_mode, bool useWellConn);
-
-    /// Destroy a WellContributions, and free memory
-    ~WellContributions();
+    /// Empty destructor.
+    virtual ~WellContributions() = default;

    /// Indicate how large the blocks of the StandardWell (C and B) are
    /// \param[in] dim         number of columns
@ -212,6 +127,12 @@ public:
                                         unsigned int DnumBlocks, double *Dvalues,
                                         UMFPackIndex *DcolPointers, UMFPackIndex *DrowIndices,
                                         std::vector<double> &Cvalues);
+protected:
+    //! \brief API specific allocation.
+    virtual void APIalloc() {}
+
+    /// Api specific upload of matrix.
+    virtual void APIaddMatrix(MatrixType, int*, double*, unsigned int) {}
 };
 } //namespace Opm

--- a/opm/simulators/linalg/bda/cuWellContributions.cu
+++ b/opm/simulators/linalg/bda/cuWellContributions.cu
@ -22,12 +22,13 @@
 #include <cstdlib>
 #include <cstring>

+#include "opm/simulators/linalg/bda/cuWellContributions.hpp"
+
 #include "opm/simulators/linalg/bda/cuda_header.hpp"
 #include <cuda_runtime.h>

 #include <opm/common/OpmLog/OpmLog.hpp>
 #include <opm/common/ErrorMacros.hpp>
-#include "opm/simulators/linalg/bda/WellContributions.hpp"

 namespace Opm
 {
@ -125,18 +126,8 @@ __global__ void apply_well_contributions(

 }

-void WellContributions::allocStandardWells()
+WellContributionsCuda::~WellContributionsCuda()
 {
-    cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
-    cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells);
-    cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
-    cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
-    cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
-    cudaMalloc((void**)&d_val_pointers, sizeof(unsigned int) * (num_std_wells + 1));
-    cudaCheckLastError("apply_gpu malloc failed");
-}
-
-void WellContributions::freeCudaMemory() {
    // delete data for StandardWell
    if (num_std_wells > 0) {
        cudaFree(d_Cnnzs);
@ -154,10 +145,20 @@ void WellContributions::freeCudaMemory() {
    }
 }

+void WellContributionsCuda::APIalloc()
+{
+    cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
+    cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells);
+    cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
+    cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
+    cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
+    cudaMalloc((void**)&d_val_pointers, sizeof(unsigned int) * (num_std_wells + 1));
+    cudaCheckLastError("apply_gpu malloc failed");
+}

 // Apply the WellContributions, similar to StandardWell::apply()
 // y -= (C^T *(D^-1*(   B*x)))
-void WellContributions::apply(double *d_x, double *d_y)
+void WellContributionsCuda::apply(double *d_x, double *d_y)
 {
    // apply MultisegmentWells

@ -194,7 +195,7 @@ void WellContributions::apply(double *d_x, double *d_y)
 }


-void WellContributions::addMatrixGpu(MatrixType type, int *colIndices, double *values, unsigned int val_size)
+void WellContributionsCuda::APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size)
 {
    switch (type) {
    case MatrixType::C:
@ -219,7 +220,7 @@ void WellContributions::addMatrixGpu(MatrixType type, int *colIndices, double *v
    cudaCheckLastError("WellContributions::addMatrix() failed");
 }

-void WellContributions::setCudaStream(cudaStream_t stream_)
+void WellContributionsCuda::setCudaStream(cudaStream_t stream_)
 {
    this->stream = stream_;
    for (auto& well : multisegments) {
--- a/opm/simulators/linalg/bda/cuWellContributions.hpp
+++ b/opm/simulators/linalg/bda/cuWellContributions.hpp
@ -0,0 +1,78 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef WELLCONTRIBUTIONS_CUDA_HEADER_INCLUDED
+#define WELLCONTRIBUTIONS_CUDA_HEADER_INCLUDED
+
+#include <opm/simulators/linalg/bda/WellContributions.hpp>
+
+#include <cuda_runtime.h>
+
+#include <memory>
+#include <vector>
+
+
+namespace Opm
+{
+
+class WellContributionsCuda : public WellContributions
+{
+public:
+    ~WellContributionsCuda() override;
+
+    /// Set a cudaStream to be used
+    /// \param[in] stream           the cudaStream that is used to launch the kernel in
+    void setCudaStream(cudaStream_t stream);
+
+    /// Apply all Wells in this object
+    /// performs y -= (C^T * (D^-1 * (B*x))) for all Wells
+    /// \param[in] d_x        vector x, must be on GPU
+    /// \param[inout] d_y     vector y, must be on GPU
+    void apply(double *d_x, double *d_y);
+
+protected:
+    /// Allocate memory for the StandardWells
+    void APIalloc() override;
+
+    /// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
+    /// \param[in] type        indicate if C, D or B is sent
+    /// \param[in] colIndices  columnindices of blocks in C or B, ignored for D
+    /// \param[in] values      array of nonzeroes
+    /// \param[in] val_size    number of blocks in C or B, ignored for D
+    void APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size) override;
+
+    cudaStream_t stream;
+
+    // data for StandardWells, could remain nullptrs if not used
+    double *d_Cnnzs = nullptr;
+    double *d_Dnnzs = nullptr;
+    double *d_Bnnzs = nullptr;
+    int *d_Ccols = nullptr;
+    int *d_Bcols = nullptr;
+    double *d_z1 = nullptr;
+    double *d_z2 = nullptr;
+    unsigned int *d_val_pointers = nullptr;
+    double* h_x = nullptr;
+    double* h_y = nullptr;
+
+};
+
+} //namespace Opm
+
+#endif
--- a/opm/simulators/linalg/bda/cusparseSolverBackend.cu
+++ b/opm/simulators/linalg/bda/cusparseSolverBackend.cu
@ -26,6 +26,7 @@
 #include <dune/common/timer.hh>

 #include <opm/simulators/linalg/bda/cusparseSolverBackend.hpp>
+#include <opm/simulators/linalg/bda/cuWellContributions.hpp>
 #include <opm/simulators/linalg/bda/BdaResult.hpp>
 #include <opm/simulators/linalg/bda/cuda_header.hpp>

@ -72,7 +73,7 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
    float it;

    if (wellContribs.getNumWells() > 0) {
-        wellContribs.setCudaStream(stream);
+        static_cast<WellContributionsCuda&>(wellContribs).setCudaStream(stream);
    }

    cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
@ -116,7 +117,7 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon

        // apply wellContributions
        if (wellContribs.getNumWells() > 0) {
-            wellContribs.apply(d_pw, d_v);
+            static_cast<WellContributionsCuda&>(wellContribs).apply(d_pw, d_v);
        }

        cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
@ -147,7 +148,7 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon

        // apply wellContributions
        if (wellContribs.getNumWells() > 0) {
-            wellContribs.apply(d_s, d_t);
+            static_cast<WellContributionsCuda&>(wellContribs).apply(d_s, d_t);
        }

        cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
--- a/opm/simulators/linalg/bda/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp
@ -26,6 +26,7 @@
 #include <dune/common/timer.hh>

 #include <opm/simulators/linalg/bda/openclSolverBackend.hpp>
+#include <opm/simulators/linalg/bda/openclWellContributions.hpp>

 #include <opm/simulators/linalg/bda/BdaResult.hpp>
 #include <opm/simulators/linalg/bda/Reorder.hpp>
@ -262,7 +263,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
        // apply wellContributions
        t_well.start();
        if(wellContribs.getNumWells() > 0){
-            wellContribs.apply(d_pw, d_v, d_toOrder);
+            static_cast<WellContributionsOCL&>(wellContribs).apply(d_pw, d_v, d_toOrder);
        }
        t_well.stop();

@ -293,7 +294,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
        // apply wellContributions
        t_well.start();
        if(wellContribs.getNumWells() > 0){
-            wellContribs.apply(d_s, d_t, d_toOrder);
+            static_cast<WellContributionsOCL&>(wellContribs).apply(d_s, d_t, d_toOrder);
        }
        t_well.stop();

@ -520,10 +521,10 @@ void openclSolverBackend<block_size>::update_system(double *vals, double *b, Wel
    mat->nnzValues = vals;
    if (opencl_ilu_reorder != ILUReorder::NONE) {
        reorderBlockedVectorByPattern<block_size>(mat->Nb, b, fromOrder, rb);
-        wellContribs.setReordering(toOrder, true);
+        static_cast<WellContributionsOCL&>(wellContribs).setReordering(toOrder, true);
    } else {
        rb = b;
-        wellContribs.setReordering(nullptr, false);
+        static_cast<WellContributionsOCL&>(wellContribs).setReordering(nullptr, false);
    }

    if (verbosity > 2) {
--- a/opm/simulators/linalg/bda/openclWellContributions.cpp
+++ b/opm/simulators/linalg/bda/openclWellContributions.cpp
@ -0,0 +1,154 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h> // CMake
+
+#include <opm/simulators/linalg/bda/openclWellContributions.hpp>
+
+#include <cstdlib>
+#include <cstring>
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+
+
+namespace Opm
+{
+
+using Accelerator::OpenclKernels;
+
+void WellContributionsOCL::setOpenCLEnv(cl::Context* context_, cl::CommandQueue* queue_) {
+    this->context = context_;
+    this->queue = queue_;
+ }
+
+void WellContributionsOCL::setKernel(Accelerator::stdwell_apply_kernel_type* kernel_,
+                                     Accelerator::stdwell_apply_no_reorder_kernel_type* kernel_no_reorder_) {
+    this->kernel = kernel_;
+    this->kernel_no_reorder = kernel_no_reorder_;
+}
+
+void WellContributionsOCL::setReordering(int* h_toOrder_, bool reorder_)
+{
+    this->h_toOrder = h_toOrder_;
+    this->reorder = reorder_;
+}
+
+void WellContributionsOCL::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
+    if (reorder) {
+        OpenclKernels::apply_stdwells_reorder(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
+            d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
+    } else {
+        OpenclKernels::apply_stdwells_no_reorder(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
+            d_x, d_y, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
+    }
+}
+
+void WellContributionsOCL::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){
+    if (h_x.empty()) {
+        h_x.resize(N);
+        h_y.resize(N);
+    }
+
+    events.resize(2);
+    queue->enqueueReadBuffer(d_x, CL_FALSE, 0, sizeof(double) * N, h_x.data(), nullptr, &events[0]);
+    queue->enqueueReadBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y.data(), nullptr, &events[1]);
+    cl::WaitForEvents(events);
+    events.clear();
+
+    // actually apply MultisegmentWells
+    for (auto& well : multisegments) {
+        well->setReordering(h_toOrder, reorder);
+        well->apply(h_x.data(), h_y.data());
+    }
+
+    // copy vector y from CPU to GPU
+    events.resize(1);
+    queue->enqueueWriteBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y.data(), nullptr, &events[0]);
+    events[0].wait();
+    events.clear();
+}
+
+void WellContributionsOCL::apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
+    if(num_std_wells > 0){
+        apply_stdwells(d_x, d_y, d_toOrder);
+    }
+
+    if(num_ms_wells > 0){
+        apply_mswells(d_x, d_y);
+    }
+}
+
+void WellContributionsOCL::APIaddMatrix(MatrixType type,
+                                        int* colIndices,
+                                        double* values,
+                                        unsigned int val_size)
+{
+    if (!allocated) {
+        OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
+    }
+
+    switch (type) {
+    case MatrixType::C:
+        events.resize(2);
+        queue->enqueueWriteBuffer(*d_Cnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
+        queue->enqueueWriteBuffer(*d_Ccols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
+        cl::WaitForEvents(events);
+        events.clear();
+        break;
+
+    case MatrixType::D:
+        events.resize(1);
+        queue->enqueueWriteBuffer(*d_Dnnzs_ocl, CL_FALSE, sizeof(double) * num_std_wells_so_far * dim_wells * dim_wells, sizeof(double) * dim_wells * dim_wells, values, nullptr, &events[0]);
+        events[0].wait();
+        events.clear();
+        break;
+
+    case MatrixType::B:
+        events.resize(2);
+        queue->enqueueWriteBuffer(*d_Bnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
+        queue->enqueueWriteBuffer(*d_Bcols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
+        cl::WaitForEvents(events);
+        events.clear();
+
+        val_pointers[num_std_wells_so_far] = num_blocks_so_far;
+        if (num_std_wells_so_far == num_std_wells - 1) {
+            val_pointers[num_std_wells] = num_blocks;
+            events.resize(1);
+            queue->enqueueWriteBuffer(*d_val_pointers_ocl, CL_FALSE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers.data(), nullptr, &events[0]);
+            events[0].wait();
+            events.clear();
+        }
+        break;
+
+    default:
+        OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributionsOCL::addMatrix()");
+    }
+}
+
+void WellContributionsOCL::APIalloc()
+{
+    d_Cnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
+    d_Dnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
+    d_Bnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
+    d_Ccols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
+    d_Bcols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
+    d_val_pointers_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
+}
+
+} //namespace Opm
--- a/opm/simulators/linalg/bda/openclWellContributions.hpp
+++ b/opm/simulators/linalg/bda/openclWellContributions.hpp
@ -0,0 +1,75 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef WELLCONTRIBUTIONS_OPENCL_HEADER_INCLUDED
+#define WELLCONTRIBUTIONS_OPENCL_HEADER_INCLUDED
+
+#include <opm/simulators/linalg/bda/WellContributions.hpp>
+
+#include <opm/simulators/linalg/bda/opencl.hpp>
+#include <opm/simulators/linalg/bda/openclKernels.hpp>
+
+#include <memory>
+#include <vector>
+
+
+namespace Opm
+{
+
+class WellContributionsOCL : public WellContributions
+{
+public:
+    void setKernel(Opm::Accelerator::stdwell_apply_kernel_type *kernel_,
+                   Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_);
+    void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
+
+    /// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
+    /// Those indices need to be mapped via toOrder
+    /// \param[in] toOrder    array with mappings
+    /// \param[in] reorder    whether reordering is actually used or not
+    void setReordering(int* toOrder, bool reorder);
+    void apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
+    void apply_mswells(cl::Buffer d_x, cl::Buffer d_y);
+    void apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
+
+protected:
+    /// Allocate memory for the StandardWells
+    void APIalloc() override;
+
+    void APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size) override;
+
+    cl::Context* context;
+    cl::CommandQueue* queue;
+    Opm::Accelerator::stdwell_apply_kernel_type* kernel;
+    Opm::Accelerator::stdwell_apply_no_reorder_kernel_type* kernel_no_reorder;
+    std::vector<cl::Event> events;
+
+    std::unique_ptr<cl::Buffer> d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
+    std::unique_ptr<cl::Buffer> d_Ccols_ocl, d_Bcols_ocl;
+    std::unique_ptr<cl::Buffer> d_val_pointers_ocl;
+
+    bool reorder = false;
+    int *h_toOrder = nullptr;
+    std::vector<double> h_x;
+    std::vector<double> h_y;
+};
+
+} //namespace Opm
+
+#endif
--- a/tests/test_cusparseSolver.cpp
+++ b/tests/test_cusparseSolver.cpp
@ -81,18 +81,22 @@ testCusparseSolver(const boost::property_tree::ptree& prm, const std::string& ma
    Dune::InverseOperatorResult result;

    Vector x(rhs.size());
-    Opm::WellContributions wellContribs("cusparse", false);
+    auto wellContribs = Opm::WellContributions::create("cusparse", false);
    std::unique_ptr<Opm::BdaBridge<Matrix, Vector, bz> > bridge;
    try {
        bridge = std::make_unique<Opm::BdaBridge<Matrix, Vector, bz> >(gpu_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder);
+
+        bridge->solve_system(&matrix, rhs, *wellContribs, result);
+        bridge->get_result(x);
+
+        return x;
    } catch (const std::logic_error& error) {
        BOOST_WARN_MESSAGE(true, error.what());
-        throw DeviceInitException(error.what());
+        if (strstr(error.what(), "Could not get device") != nullptr)
+            throw DeviceInitException(error.what());
+        else
+            throw error;
    }
-    bridge->solve_system(&matrix, rhs, wellContribs, result);
-    bridge->get_result(x);
-
-    return x;
 }

 namespace pt = boost::property_tree;
--- a/tests/test_openclSolver.cpp
+++ b/tests/test_openclSolver.cpp
@ -80,7 +80,7 @@ testOpenclSolver(const boost::property_tree::ptree& prm, const std::string& matr
    Dune::InverseOperatorResult result;

    Vector x(rhs.size());
-    Opm::WellContributions wellContribs("opencl", false);
+    auto wellContribs = Opm::WellContributions::create("opencl", false);
    std::unique_ptr<Opm::BdaBridge<Matrix, Vector, bz> > bridge;
    try {
        bridge = std::make_unique<Opm::BdaBridge<Matrix, Vector, bz> >(gpu_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder);
@ -88,7 +88,7 @@ testOpenclSolver(const boost::property_tree::ptree& prm, const std::string& matr
        BOOST_WARN_MESSAGE(true, error.what());
        throw PlatformInitException(error.what());
    }
-    bridge->solve_system(&matrix, rhs, wellContribs, result);
+    bridge->solve_system(&matrix, rhs, *wellContribs, result);
    bridge->get_result(x);

    return x;