Merge pull request #2682 from Tongdongq/openclSolver

Added openclSolver
2025-02-25 18:55:30 -06:00 · 2020-07-10 13:38:32 +02:00 · 2020-07-10 13:38:32 +02:00 · faaee51d09
commit faaee51d09
parent 2212229481 5971a7ae9e
35 changed files with 4232 additions and 1050 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -134,6 +134,22 @@ if(CUDA_FOUND)
  include_directories(${CUDA_INCLUDE_DIRS})
 endif()

+
+find_package(OpenCL)
+
+if(OpenCL_FOUND)
+  # the current OpenCL implementation relies on cl.hpp, not cl2.hpp
+  # make sure it is available, otherwise disable OpenCL
+  find_file(CL_HPP CL/cl.hpp HINTS ${OpenCL_INCLUDE_DIRS})
+  if(CL_HPP)
+    set(HAVE_OPENCL 1)
+    include_directories(${OpenCL_INCLUDE_DIRS})
+  else()
+    message(WARNING " OpenCL was found, but this version of opm-simulators relies on CL/cl.hpp, which implements OpenCL 1.0, 1.1 and 1.2.\n Deactivating OpenCL")
+    set(OpenCL_FOUND OFF)
+  endif()
+endif()
+
 # read the list of components from this file (in the project directory);
 # it should set various lists with the names of the files to include
 include (CMakeLists_files.cmake)
@ -387,3 +403,7 @@ if(CUDA_FOUND)
  target_link_libraries( opmsimulators PUBLIC ${CUDA_cublas_LIBRARY} )
 endif()

+if(OpenCL_FOUND)
+  target_link_libraries( opmsimulators ${OpenCL_LIBRARIES} )
+endif()
+
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@ -46,10 +46,21 @@ list (APPEND MAIN_SOURCE_FILES

 if(CUDA_FOUND)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cusparseSolverBackend.cu)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cu)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
 endif()
+if(OPENCL_FOUND)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BlockedMatrix.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BILU0.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Reorder.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
+endif()
 if(MPI_FOUND)
  list(APPEND MAIN_SOURCE_FILES opm/simulators/utils/ParallelEclipseState.cpp
                                opm/simulators/utils/ParallelSerialization.cpp)
@ -146,8 +157,15 @@ list (APPEND PUBLIC_HEADER_FILES
  opm/simulators/aquifers/BlackoilAquiferModel_impl.hpp
  opm/simulators/linalg/bda/BdaBridge.hpp
  opm/simulators/linalg/bda/BdaResult.hpp
+  opm/simulators/linalg/bda/BdaSolver.hpp
+  opm/simulators/linalg/bda/BILU0.hpp
+  opm/simulators/linalg/bda/BlockedMatrix.hpp
  opm/simulators/linalg/bda/cuda_header.hpp
  opm/simulators/linalg/bda/cusparseSolverBackend.hpp
+  opm/simulators/linalg/bda/Reorder.hpp
+  opm/simulators/linalg/bda/opencl.hpp
+  opm/simulators/linalg/bda/openclKernels.hpp
+  opm/simulators/linalg/bda/openclSolverBackend.hpp
  opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
  opm/simulators/linalg/bda/WellContributions.hpp
  opm/simulators/linalg/BlackoilAmg.hpp
--- a/opm-simulators-prereqs.cmake
+++ b/opm-simulators-prereqs.cmake
@ -6,6 +6,7 @@ set (opm-simulators_CONFIG_VAR
  HAVE_MPI
  HAVE_PETSC
  HAVE_CUDA
+  HAVE_OPENCL
  HAVE_SUITESPARSE_UMFPACK_H
  HAVE_DUNE_ISTL
  DUNE_ISTL_VERSION_MAJOR
--- a/opm/simulators/linalg/FlowLinearSolverParameters.hpp
+++ b/opm/simulators/linalg/FlowLinearSolverParameters.hpp
@ -67,7 +67,9 @@ NEW_PROP_TAG(CprEllSolvetype);
 NEW_PROP_TAG(CprReuseSetup);
 NEW_PROP_TAG(LinearSolverConfiguration);
 NEW_PROP_TAG(LinearSolverConfigurationJsonFile);
-NEW_PROP_TAG(UseGpu);
+NEW_PROP_TAG(GpuMode);
+NEW_PROP_TAG(BdaDeviceId);
+NEW_PROP_TAG(OpenclPlatformId);

 SET_SCALAR_PROP(FlowIstlSolverParams, LinearSolverReduction, 1e-2);
 SET_SCALAR_PROP(FlowIstlSolverParams, IluRelaxation, 0.9);
@ -94,7 +96,9 @@ SET_INT_PROP(FlowIstlSolverParams, CprEllSolvetype, 0);
 SET_INT_PROP(FlowIstlSolverParams, CprReuseSetup, 3);
 SET_STRING_PROP(FlowIstlSolverParams, LinearSolverConfiguration, "ilu0");
 SET_STRING_PROP(FlowIstlSolverParams, LinearSolverConfigurationJsonFile, "none");
-SET_BOOL_PROP(FlowIstlSolverParams, UseGpu, false);
+SET_STRING_PROP(FlowIstlSolverParams, GpuMode, "none");
+SET_INT_PROP(FlowIstlSolverParams, BdaDeviceId, 0);
+SET_INT_PROP(FlowIstlSolverParams, OpenclPlatformId, 0);



@ -167,7 +171,9 @@ namespace Opm
        bool scale_linear_system_;
        std::string linear_solver_configuration_;
        std::string linear_solver_configuration_json_file_;
-        bool use_gpu_;
+        std::string gpu_mode_;
+        int bda_device_id_;
+        int opencl_platform_id_;

        template <class TypeTag>
        void init()
@ -196,7 +202,9 @@ namespace Opm
            cpr_reuse_setup_  =  EWOMS_GET_PARAM(TypeTag, int, CprReuseSetup);
            linear_solver_configuration_ = EWOMS_GET_PARAM(TypeTag, std::string, LinearSolverConfiguration);
            linear_solver_configuration_json_file_ = EWOMS_GET_PARAM(TypeTag, std::string, LinearSolverConfigurationJsonFile);
-            use_gpu_ = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
+            gpu_mode_ = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
+            bda_device_id_ = EWOMS_GET_PARAM(TypeTag, int, BdaDeviceId);
+            opencl_platform_id_ = EWOMS_GET_PARAM(TypeTag, int, OpenclPlatformId);
        }

        template <class TypeTag>
@ -225,7 +233,9 @@ namespace Opm
            EWOMS_REGISTER_PARAM(TypeTag, int, CprReuseSetup, "Reuse Amg Setup");
            EWOMS_REGISTER_PARAM(TypeTag, std::string, LinearSolverConfiguration, "Configuration of solver valid is: ilu0 (default), cpr_quasiimpes, cpr_trueimpes or file (specified in LinearSolverConfigurationJsonFile) ");
            EWOMS_REGISTER_PARAM(TypeTag, std::string, LinearSolverConfigurationJsonFile, "Filename of JSON configuration for flexible linear solver system.");
-            EWOMS_REGISTER_PARAM(TypeTag, bool, UseGpu, "Use GPU cusparseSolver as the linear solver");
+            EWOMS_REGISTER_PARAM(TypeTag, std::string, GpuMode, "Use GPU cusparseSolver or openclSolver as the linear solver, usage: '--gpu-mode=[none|cusparse|opencl]'");
+            EWOMS_REGISTER_PARAM(TypeTag, int, BdaDeviceId, "Choose device ID for cusparseSolver or openclSolver, use 'nvidia-smi' or 'clinfo' to determine valid IDs");
+            EWOMS_REGISTER_PARAM(TypeTag, int, OpenclPlatformId, "Choose platform ID for openclSolver, use 'clinfo' to determine valid platform IDs");
        }

        FlowLinearSolverParameters() { reset(); }
@ -247,7 +257,9 @@ namespace Opm
            ilu_milu_                 = MILU_VARIANT::ILU;
            ilu_redblack_             = false;
            ilu_reorder_sphere_       = true;
-            use_gpu_                  = false;
+            gpu_mode_                 = "none";
+            bda_device_id_            = 0;
+            opencl_platform_id_       = 0;
        }
    };

--- a/opm/simulators/linalg/ISTLSolverEbos.hpp
+++ b/opm/simulators/linalg/ISTLSolverEbos.hpp
@ -50,7 +50,7 @@

 #include <opm/common/utility/platform_dependent/reenable_warnings.h>

-#if HAVE_CUDA
+#if HAVE_CUDA || HAVE_OPENCL
 #include <opm/simulators/linalg/bda/BdaBridge.hpp>
 #endif

@ -122,8 +122,9 @@ DenseMatrix transposeDenseMatrix(const DenseMatrix& M)
        enum { pressureVarIndex = Indices::pressureSwitchIdx };
        static const int numEq = Indices::numEq;

-#if HAVE_CUDA
-        std::unique_ptr<BdaBridge> bdaBridge;
+#if HAVE_CUDA || HAVE_OPENCL
+        static const unsigned int block_size = Matrix::block_type::rows;
+        std::unique_ptr<BdaBridge<Matrix, Vector, block_size>> bdaBridge;
 #endif

 #if HAVE_MPI
@ -160,20 +161,22 @@ DenseMatrix transposeDenseMatrix(const DenseMatrix& M)
                prm_ = setupPropertyTree<TypeTag>(parameters_);
            }
            const auto& gridForConn = simulator_.vanguard().grid();
-#if HAVE_CUDA
-            bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
-            if (gridForConn.comm().size() > 1 && use_gpu) {
+#if HAVE_CUDA || HAVE_OPENCL
+            std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
+            int platformID = EWOMS_GET_PARAM(TypeTag, int, OpenclPlatformId);
+            int deviceID = EWOMS_GET_PARAM(TypeTag, int, BdaDeviceId);
+            if (gridForConn.comm().size() > 1 && gpu_mode.compare("none") != 0) {
                OpmLog::warning("Warning cannot use GPU with MPI, GPU is disabled");
-                use_gpu = false;
+                gpu_mode = "none";
            }
            const int maxit = EWOMS_GET_PARAM(TypeTag, int, LinearSolverMaxIter);
            const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction);
            const int linear_solver_verbosity = parameters_.linear_solver_verbosity_;
-            bdaBridge.reset(new BdaBridge(use_gpu, linear_solver_verbosity, maxit, tolerance));
+            bdaBridge.reset(new BdaBridge<Matrix, Vector, block_size>(gpu_mode, linear_solver_verbosity, maxit, tolerance, platformID, deviceID));
 #else
-            const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
-            if (use_gpu) {
-                OPM_THROW(std::logic_error,"Error cannot use GPU solver since CUDA was not found during compilation");
+            const std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
+            if (gpu_mode.compare("none") != 0) {
+                OPM_THROW(std::logic_error,"Error cannot use GPU solver since neither CUDA nor OpenCL was not found by cmake");
            }
 #endif
            extractParallelGridInformationToISTL(simulator_.vanguard().grid(), parallelInformation_);
@ -454,7 +457,7 @@ DenseMatrix transposeDenseMatrix(const DenseMatrix& M)
 #endif
            {
                // tries to solve linear system
-#if HAVE_CUDA
+#if HAVE_CUDA || HAVE_OPENCL
                bool use_gpu = bdaBridge->getUseGpu();
                if (use_gpu) {
                    WellContributions wellContribs;
--- a/opm/simulators/linalg/MatrixBlock.hpp
+++ b/opm/simulators/linalg/MatrixBlock.hpp
@ -537,6 +537,214 @@ namespace Detail
        }
    }

+    //! perform out of place matrix inversion on C-style arrays
+    //! must have a specified block_size
+    template <int block_size>
+    struct Inverter
+    {
+        template <typename K>
+        void operator()(const K *matrix, K *inverse)
+        {
+            throw std::logic_error("Not implemented");
+        }
+    };
+
+    //! perform out of place matrix inversion on C-style arrays
+    template <>
+    struct Inverter<4>
+    {
+        template <typename K>
+        void operator()(const K *matrix, K *inverse)
+        {
+            // based on Dune::FMatrixHelp::invertMatrix
+            inverse[0] = matrix[5] * matrix[10] * matrix[15] -
+                    matrix[5] * matrix[11] * matrix[14] -
+                    matrix[9] * matrix[6] * matrix[15] +
+                    matrix[9] * matrix[7] * matrix[14] +
+                    matrix[13] * matrix[6] * matrix[11] -
+                    matrix[13] * matrix[7] * matrix[10];
+
+            inverse[4] = -matrix[4] * matrix[10] * matrix[15] +
+                    matrix[4] * matrix[11] * matrix[14] +
+                    matrix[8] * matrix[6] * matrix[15] -
+                    matrix[8] * matrix[7] * matrix[14] -
+                    matrix[12] * matrix[6] * matrix[11] +
+                    matrix[12] * matrix[7] * matrix[10];
+
+            inverse[8] = matrix[4] * matrix[9] * matrix[15] -
+                    matrix[4] * matrix[11] * matrix[13] -
+                    matrix[8] * matrix[5] * matrix[15] +
+                    matrix[8] * matrix[7] * matrix[13] +
+                    matrix[12] * matrix[5] * matrix[11] -
+                    matrix[12] * matrix[7] * matrix[9];
+
+            inverse[12] = -matrix[4] * matrix[9] * matrix[14] +
+                    matrix[4] * matrix[10] * matrix[13] +
+                    matrix[8] * matrix[5] * matrix[14] -
+                    matrix[8] * matrix[6] * matrix[13] -
+                    matrix[12] * matrix[5] * matrix[10] +
+                    matrix[12] * matrix[6] * matrix[9];
+
+            inverse[1]= -matrix[1]  * matrix[10] * matrix[15] +
+                    matrix[1] * matrix[11] * matrix[14] +
+                    matrix[9] * matrix[2] * matrix[15] -
+                    matrix[9] * matrix[3] * matrix[14] -
+                    matrix[13] * matrix[2] * matrix[11] +
+                    matrix[13] * matrix[3] * matrix[10];
+
+            inverse[5] = matrix[0] * matrix[10] * matrix[15] -
+                    matrix[0] * matrix[11] * matrix[14] -
+                    matrix[8] * matrix[2] * matrix[15] +
+                    matrix[8] * matrix[3] * matrix[14] +
+                    matrix[12] * matrix[2] * matrix[11] -
+                    matrix[12] * matrix[3] * matrix[10];
+
+            inverse[9] = -matrix[0] * matrix[9] * matrix[15] +
+                    matrix[0] * matrix[11] * matrix[13] +
+                    matrix[8] * matrix[1] * matrix[15] -
+                    matrix[8] * matrix[3] * matrix[13] -
+                    matrix[12] * matrix[1] * matrix[11] +
+                    matrix[12] * matrix[3] * matrix[9];
+
+            inverse[13] = matrix[0] * matrix[9] * matrix[14] -
+                    matrix[0] * matrix[10] * matrix[13] -
+                    matrix[8] * matrix[1] * matrix[14] +
+                    matrix[8] * matrix[2] * matrix[13] +
+                    matrix[12] * matrix[1] * matrix[10] -
+                    matrix[12] * matrix[2] * matrix[9];
+
+            inverse[2] = matrix[1] * matrix[6] * matrix[15] -
+                    matrix[1] * matrix[7] * matrix[14] -
+                    matrix[5] * matrix[2] * matrix[15] +
+                    matrix[5] * matrix[3] * matrix[14] +
+                    matrix[13] * matrix[2] * matrix[7] -
+                    matrix[13] * matrix[3] * matrix[6];
+
+            inverse[6] = -matrix[0]  * matrix[6] * matrix[15] +
+                    matrix[0] * matrix[7] * matrix[14] +
+                    matrix[4] * matrix[2] * matrix[15] -
+                    matrix[4] * matrix[3] * matrix[14] -
+                    matrix[12] * matrix[2] * matrix[7] +
+                    matrix[12] * matrix[3] * matrix[6];
+
+            inverse[10] = matrix[0] * matrix[5] * matrix[15] -
+                    matrix[0] * matrix[7] * matrix[13] -
+                    matrix[4] * matrix[1] * matrix[15] +
+                    matrix[4] * matrix[3] * matrix[13] +
+                    matrix[12] * matrix[1] * matrix[7] -
+                    matrix[12] * matrix[3] * matrix[5];
+
+            inverse[14] = -matrix[0] * matrix[5] * matrix[14] +
+                    matrix[0] * matrix[6] * matrix[13] +
+                    matrix[4] * matrix[1] * matrix[14] -
+                    matrix[4] * matrix[2] * matrix[13] -
+                    matrix[12] * matrix[1] * matrix[6] +
+                    matrix[12] * matrix[2] * matrix[5];
+
+            inverse[3] = -matrix[1] * matrix[6] * matrix[11] +
+                    matrix[1] * matrix[7] * matrix[10] +
+                    matrix[5] * matrix[2] * matrix[11] -
+                    matrix[5] * matrix[3] * matrix[10] -
+                    matrix[9] * matrix[2] * matrix[7] +
+                    matrix[9] * matrix[3] * matrix[6];
+
+            inverse[7] = matrix[0] * matrix[6] * matrix[11] -
+                    matrix[0] * matrix[7] * matrix[10] -
+                    matrix[4] * matrix[2] * matrix[11] +
+                    matrix[4] * matrix[3] * matrix[10] +
+                    matrix[8] * matrix[2] * matrix[7] -
+                    matrix[8] * matrix[3] * matrix[6];
+
+            inverse[11] = -matrix[0] * matrix[5] * matrix[11] +
+                    matrix[0] * matrix[7] * matrix[9] +
+                    matrix[4] * matrix[1] * matrix[11] -
+                    matrix[4] * matrix[3] * matrix[9] -
+                    matrix[8] * matrix[1] * matrix[7] +
+                    matrix[8] * matrix[3] * matrix[5];
+
+            inverse[15] = matrix[0] * matrix[5] * matrix[10] -
+                    matrix[0] * matrix[6] * matrix[9] -
+                    matrix[4] * matrix[1] * matrix[10] +
+                    matrix[4] * matrix[2] * matrix[9] +
+                    matrix[8] * matrix[1] * matrix[6] -
+                    matrix[8] * matrix[2] * matrix[5];
+
+            K det = matrix[0] * inverse[0] + matrix[1] * inverse[4] +
+                    matrix[2] * inverse[8] + matrix[3] * inverse[12];
+
+            // return identity for singular or nearly singular matrices.
+            if (std::abs(det) < 1e-40) {
+                for (int i = 0; i < 4; ++i){
+                    inverse[4*i + i] = 1.0;
+                }
+            }
+            K inv_det = 1.0 / det;
+
+            for (unsigned int i = 0; i < 4 * 4; ++i) {
+                inverse[i] *= inv_det;
+            }
+        }
+    };
+
+    //! perform out of place matrix inversion on C-style arrays
+    template <>
+    struct Inverter<3>
+    {
+        template <typename K>
+        void operator()(const K *matrix, K *inverse)
+        {
+            // code generated by maple, copied from Dune::DenseMatrix
+            K t4  = matrix[0] * matrix[4];
+            K t6  = matrix[0] * matrix[5];
+            K t8  = matrix[1] * matrix[3];
+            K t10 = matrix[2] * matrix[3];
+            K t12 = matrix[1] * matrix[6];
+            K t14 = matrix[2] * matrix[6];
+
+            K det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] +
+                          t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]);
+            K t17 = 1.0 / det;
+
+            inverse[0] =  (matrix[4] * matrix[8] - matrix[5] * matrix[7]) * t17;
+            inverse[1] = -(matrix[1] * matrix[8] - matrix[2] * matrix[7]) * t17;
+            inverse[2] =  (matrix[1] * matrix[5] - matrix[2] * matrix[4]) * t17;
+            inverse[3] = -(matrix[3] * matrix[8] - matrix[5] * matrix[6]) * t17;
+            inverse[4] =  (matrix[0] * matrix[8] - t14) * t17;
+            inverse[5] = -(t6 - t10) * t17;
+            inverse[6] =  (matrix[3] * matrix[7] - matrix[4] * matrix[6]) * t17;
+            inverse[7] = -(matrix[0] * matrix[7] - t12) * t17;
+            inverse[8] =  (t4 - t8) * t17;
+        }
+    };
+
+    //! perform out of place matrix inversion on C-style arrays
+    template <>
+    struct Inverter<2>
+    {
+        template <typename K>
+        void operator()(const K *matrix, K *inverse)
+        {
+            // code based on Dune::DenseMatrix
+            K detinv = matrix[0] * matrix[3] - matrix[1] * matrix[2];
+            detinv = 1 / detinv;
+            inverse[0] =  matrix[3] * detinv;
+            inverse[1] = -matrix[1] * detinv;
+            inverse[2] = -matrix[2] * detinv;
+            inverse[3] =  matrix[0] * detinv;
+        }
+    };
+
+    //! perform out of place matrix inversion on C-style arrays
+    template <>
+    struct Inverter<1>
+    {
+        template <typename K>
+        void operator()(const K *matrix, K *inverse)
+        {
+            inverse[0] = 1.0 / matrix[0];
+        }
+    };
+
 } // namespace Detail
 } // namespace Opm

--- a/opm/simulators/linalg/bda/BILU0.cpp
+++ b/opm/simulators/linalg/bda/BILU0.cpp
@ -0,0 +1,343 @@
+/*
+  Copyright 2019 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+#include <opm/simulators/linalg/MatrixBlock.hpp>
+#include <dune/common/timer.hh>
+
+#include <opm/simulators/linalg/bda/BdaSolver.hpp>
+#include <opm/simulators/linalg/bda/BILU0.hpp>
+#include <opm/simulators/linalg/bda/Reorder.hpp>
+
+
+namespace bda
+{
+
+    using Opm::OpmLog;
+    using Dune::Timer;
+
+    template <unsigned int block_size>
+    BILU0<block_size>::BILU0(bool level_scheduling_, bool graph_coloring_, int verbosity_) :
+        verbosity(verbosity_), level_scheduling(level_scheduling_), graph_coloring(graph_coloring_)
+    {
+        if (level_scheduling == graph_coloring) {
+            OPM_THROW(std::logic_error, "Error, either level_scheduling or graph_coloring must be true, not both\n");
+        }
+    }
+
+    template <unsigned int block_size>
+    BILU0<block_size>::~BILU0()
+    {
+        delete[] invDiagVals;
+        delete[] diagIndex;
+        delete[] toOrder;
+        delete[] fromOrder;
+    }
+
+    template <unsigned int block_size>
+    bool BILU0<block_size>::init(BlockedMatrix<block_size> *mat)
+    {
+        const unsigned int bs = block_size;
+
+        this->N = mat->Nb * block_size;
+        this->Nb = mat->Nb;
+        this->nnz = mat->nnzbs * block_size * block_size;
+        this->nnzbs = mat->nnzbs;
+
+        toOrder = new int[Nb];
+        fromOrder = new int[Nb];
+
+        int *CSCRowIndices = new int[nnzbs];
+        int *CSCColPointers = new int[Nb + 1];
+
+        Timer t_convert;
+        csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb);
+        if(verbosity >= 3){
+            std::ostringstream out;
+            out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
+            OpmLog::info(out.str());
+        }
+
+        Timer t_analysis;
+        rmat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
+        LUmat = std::make_unique<BlockedMatrix<block_size> >(*rmat);
+        if (level_scheduling) {
+            findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, &numColors, toOrder, fromOrder, rowsPerColor);
+        } else if (graph_coloring) {
+            findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder, fromOrder, rowsPerColor);
+        }
+        if(verbosity >= 3){
+            std::ostringstream out;
+            out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors";
+            OpmLog::info(out.str());
+        }
+
+        delete[] CSCRowIndices;
+        delete[] CSCColPointers;
+
+        diagIndex = new int[mat->Nb];
+        invDiagVals = new double[mat->Nb * bs * bs];
+
+        Lmat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
+        Umat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
+
+        LUmat->nnzValues = new double[mat->nnzbs * bs * bs];
+
+        s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
+        s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Umat->nnzbs);
+        s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
+        s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Umat->nnzbs);
+        s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
+        s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Umat->Nb + 1));
+        s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
+        s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1));
+
+        queue->enqueueWriteBuffer(s.Lvals, CL_TRUE, 0, Lmat->nnzbs * sizeof(double) * bs * bs, Lmat->nnzValues);
+        queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, Umat->nnzbs * sizeof(double) * bs * bs, Umat->nnzValues);
+        queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices);
+        queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, Umat->nnzbs * sizeof(int), Umat->colIndices);
+        queue->enqueueWriteBuffer(s.Lrows, CL_TRUE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers);
+        queue->enqueueWriteBuffer(s.Urows, CL_TRUE, 0, (Umat->Nb + 1) * sizeof(int), Umat->rowPointers);
+        queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals);
+
+        int *rowsPerColorPrefix = new int[numColors + 1];
+        int prefix = 0;
+        for (int i = 0; i < numColors + 1; ++i) {
+            rowsPerColorPrefix[i] = prefix;
+            prefix += rowsPerColor[i];
+        }
+        queue->enqueueWriteBuffer(s.rowsPerColor, CL_TRUE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix);
+        delete[] rowsPerColorPrefix;
+
+        return true;
+    } // end init()
+
+
+    template <unsigned int block_size>
+    bool BILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
+    {
+        const unsigned int bs = block_size;
+        Timer t_reorder;
+        reorderBlockedMatrixByPattern<block_size>(mat, toOrder, fromOrder, rmat.get());
+
+        if (verbosity >= 3){
+            std::ostringstream out;
+            out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
+            OpmLog::info(out.str());
+        }
+
+        // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
+        Timer t_copy;
+        memcpy(LUmat->nnzValues, rmat->nnzValues, sizeof(double) * bs * bs * rmat->nnzbs);
+
+        if (verbosity >= 3){
+            std::ostringstream out;
+            out << "BILU0 memcpy: " << t_copy.stop() << " s";
+            OpmLog::info(out.str());
+        }
+
+        int i, j, ij, ik, jk;
+        int iRowStart, iRowEnd, jRowEnd;
+        double pivot[bs * bs];
+
+        int LSize = 0;
+        Opm::Detail::Inverter<bs> inverter;   // reuse inverter to invert blocks
+
+        Timer t_decomposition;
+
+        // go through all rows
+        for (i = 0; i < LUmat->Nb; i++) {
+            iRowStart = LUmat->rowPointers[i];
+            iRowEnd = LUmat->rowPointers[i + 1];
+
+            // go through all elements of the row
+            for (ij = iRowStart; ij < iRowEnd; ij++) {
+                j = LUmat->colIndices[ij];
+                // if the element is the diagonal, store the index and go to next row
+                if (j == i) {
+                    diagIndex[i] = ij;
+                    break;
+                }
+                // if an element beyond the diagonal is reach, no diagonal was found
+                // throw an error now. TODO: perform reordering earlier to prevent this
+                if (j > i) {
+                    std::ostringstream out;
+                    out << "BILU0 Error could not find diagonal value in row: " << i;
+                    OpmLog::error(out.str());
+                    return false;
+                }
+
+                LSize++;
+                // calculate the pivot of this row
+                blockMult<bs>(LUmat->nnzValues + ij * bs * bs, invDiagVals + j * bs * bs, &pivot[0]);
+
+                memcpy(LUmat->nnzValues + ij * bs * bs, &pivot[0], sizeof(double) * block_size * block_size);
+
+                jRowEnd = LUmat->rowPointers[j + 1];
+                jk = diagIndex[j] + 1;
+                ik = ij + 1;
+                // substract that row scaled by the pivot from this row.
+                while (ik < iRowEnd && jk < jRowEnd) {
+                    if (LUmat->colIndices[ik] == LUmat->colIndices[jk]) {
+                        blockMultSub<bs>(LUmat->nnzValues + ik * bs * bs, pivot, LUmat->nnzValues + jk * bs * bs);
+                        ik++;
+                        jk++;
+                    } else {
+                        if (LUmat->colIndices[ik] < LUmat->colIndices[jk])
+                        { ik++; }
+                        else
+                        { jk++; }
+                    }
+                }
+            }
+            // store the inverse in the diagonal!
+            inverter(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs);
+
+            memcpy(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, sizeof(double) * bs * bs);
+        }
+
+        Lmat->rowPointers[0] = 0;
+        Umat->rowPointers[0] = 0;
+
+        // Split the LU matrix into two by comparing column indices to diagonal indices
+        for (i = 0; i < LUmat->Nb; i++) {
+            int offsetL = Lmat->rowPointers[i];
+            int rowSize = diagIndex[i] - LUmat->rowPointers[i];
+            int offsetLU = LUmat->rowPointers[i];
+            memcpy(Lmat->nnzValues + offsetL * bs * bs, LUmat->nnzValues + offsetLU * bs * bs, sizeof(double) * bs * bs * rowSize);
+            memcpy(Lmat->colIndices + offsetL, LUmat->colIndices + offsetLU, sizeof(int) * rowSize);
+            offsetL += rowSize;
+            Lmat->rowPointers[i + 1] = offsetL;
+        }
+        // Reverse the order or the (blocked) rows for the U matrix,
+        // because the rows are accessed in reverse order when applying the ILU0
+        int URowIndex = 0;
+        for (i = LUmat->Nb - 1; i >= 0; i--) {
+            int offsetU = Umat->rowPointers[URowIndex];
+            int rowSize = LUmat->rowPointers[i + 1] - diagIndex[i] - 1;
+            int offsetLU = diagIndex[i] + 1;
+            memcpy(Umat->nnzValues + offsetU * bs * bs, LUmat->nnzValues + offsetLU * bs * bs, sizeof(double) * bs * bs * rowSize);
+            memcpy(Umat->colIndices + offsetU, LUmat->colIndices + offsetLU, sizeof(int) * rowSize);
+            offsetU += rowSize;
+            Umat->rowPointers[URowIndex + 1] = offsetU;
+            URowIndex++;
+        }
+        if (verbosity >= 3) {
+            std::ostringstream out;
+            out << "BILU0 decomposition: " << t_decomposition.stop() << " s";
+            OpmLog::info(out.str());
+        }
+
+        Timer t_copyToGpu;
+        if (pattern_uploaded == false) {
+            queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices);
+            queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, Umat->nnzbs * sizeof(int), Umat->colIndices);
+            queue->enqueueWriteBuffer(s.Lrows, CL_TRUE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers);
+            queue->enqueueWriteBuffer(s.Urows, CL_TRUE, 0, (Umat->Nb + 1) * sizeof(int), Umat->rowPointers);
+            pattern_uploaded = true;
+        }
+        queue->enqueueWriteBuffer(s.Lvals, CL_TRUE, 0, Lmat->nnzbs * sizeof(double) * bs * bs, Lmat->nnzValues);
+        queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, Umat->nnzbs * sizeof(double) * bs * bs, Umat->nnzValues);
+        queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, Nb * sizeof(double) * bs * bs, invDiagVals);
+        if (verbosity >= 3) {
+            std::ostringstream out;
+            out << "BILU0 copy to GPU: " << t_copyToGpu.stop() << " s";
+            OpmLog::info(out.str());
+        }
+
+        return true;
+    } // end create_preconditioner()
+
+    // kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
+    // however, if individual kernel calls are timed, waiting for events is needed
+    // behavior on other GPUs is untested
+    template <unsigned int block_size>
+    void BILU0<block_size>::apply(cl::Buffer& x, cl::Buffer& y)
+    {
+        cl::Event event;
+        Timer t_apply;
+
+        for(int color = 0; color < numColors; ++color){
+            event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, (unsigned int)Nb, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
+            // event.wait();
+        }
+        for(int color = numColors-1; color >= 0; --color){
+            event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, (unsigned int)Nb, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
+            // event.wait();
+        }
+
+        if (verbosity >= 3) {
+            event.wait();
+            std::ostringstream out;
+            out << "BILU0 apply: " << t_apply.stop() << " s";
+            OpmLog::info(out.str());
+        }
+    }
+
+
+    template <unsigned int block_size>
+    void BILU0<block_size>::setOpenCLContext(cl::Context *context_){
+        this->context = context_;
+    }
+    template <unsigned int block_size>
+    void BILU0<block_size>::setOpenCLQueue(cl::CommandQueue *queue_){
+        this->queue = queue_;
+    }
+    template <unsigned int block_size>
+    void BILU0<block_size>::setKernelParameters(const unsigned int work_group_size_, const unsigned int total_work_items_, const unsigned int lmem_per_work_group_){
+        this->work_group_size = work_group_size_;
+        this->total_work_items = total_work_items_;
+        this->lmem_per_work_group = lmem_per_work_group_;
+    }
+    template <unsigned int block_size>
+    void BILU0<block_size>::setKernels(
+        cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1_,
+        cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2_
+    ){
+        this->ILU_apply1 = ILU_apply1_;
+        this->ILU_apply2 = ILU_apply2_;
+    }
+
+
+#define INSTANTIATE_BDA_FUNCTIONS(n)                                                     \
+template BILU0<n>::BILU0(bool, bool, int);                                               \
+template BILU0<n>::~BILU0();                                                             \
+template bool BILU0<n>::init(BlockedMatrix<n>*);                                         \
+template bool BILU0<n>::create_preconditioner(BlockedMatrix<n>*);                        \
+template void BILU0<n>::apply(cl::Buffer& x, cl::Buffer& y);                             \
+template void BILU0<n>::setOpenCLContext(cl::Context*);                                  \
+template void BILU0<n>::setOpenCLQueue(cl::CommandQueue*);                               \
+template void BILU0<n>::setKernelParameters(unsigned int, unsigned int, unsigned int);   \
+template void BILU0<n>::setKernels(                                                      \
+    cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
+    cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *  \
+    );
+
+INSTANTIATE_BDA_FUNCTIONS(1);
+INSTANTIATE_BDA_FUNCTIONS(2);
+INSTANTIATE_BDA_FUNCTIONS(3);
+INSTANTIATE_BDA_FUNCTIONS(4);
+
+#undef INSTANTIATE_BDA_FUNCTIONS
+
+} // end namespace bda
+
+
--- a/opm/simulators/linalg/bda/BILU0.hpp
+++ b/opm/simulators/linalg/bda/BILU0.hpp
@ -0,0 +1,112 @@
+/*
+  Copyright 2019 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef BILU0_HPP
+#define BILU0_HPP
+
+#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
+
+#include <opm/simulators/linalg/bda/opencl.hpp>
+
+namespace bda
+{
+
+    /// This class implementa a Blocked ILU0 preconditioner
+    /// The decomposition is done on CPU, and reorders the rows of the matrix
+    template <unsigned int block_size>
+    class BILU0
+    {
+
+    private:
+        int N;       // number of rows of the matrix
+        int Nb;      // number of blockrows of the matrix
+        int nnz;     // number of nonzeroes of the matrix (scalar)
+        int nnzbs;   // number of blocks of the matrix
+        std::unique_ptr<BlockedMatrix<block_size> > Lmat = nullptr, Umat = nullptr, LUmat = nullptr;
+        std::shared_ptr<BlockedMatrix<block_size> > rmat = nullptr; // only used with PAR_SIM
+        double *invDiagVals = nullptr;
+        int *diagIndex = nullptr;
+        std::vector<int> rowsPerColor;  // color i contains rowsPerColor[i] rows, which are processed in parallel
+        int *toOrder = nullptr, *fromOrder = nullptr;
+        int numColors;
+        int verbosity;
+
+        bool level_scheduling, graph_coloring;
+
+        typedef struct {
+            cl::Buffer Lvals, Uvals, invDiagVals;
+            cl::Buffer Lcols, Lrows;
+            cl::Buffer Ucols, Urows;
+            cl::Buffer rowsPerColor;
+        } GPU_storage;
+
+        cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1;
+        cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2;
+        GPU_storage s;
+        cl::Context *context;
+        cl::CommandQueue *queue;
+        int work_group_size = 0;
+        int total_work_items = 0;
+        int lmem_per_work_group = 0;
+        bool pattern_uploaded = false;
+
+    public:
+
+        BILU0(bool level_scheduling, bool graph_coloring, int verbosity);
+
+        ~BILU0();
+
+        // analysis
+        bool init(BlockedMatrix<block_size> *mat);
+
+        // ilu_decomposition
+        bool create_preconditioner(BlockedMatrix<block_size> *mat);
+
+        // apply preconditioner, y = prec(x)
+        void apply(cl::Buffer& x, cl::Buffer& y);
+
+        void setOpenCLContext(cl::Context *context);
+        void setOpenCLQueue(cl::CommandQueue *queue);
+        void setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group);
+        void setKernels(
+            cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1,
+            cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2
+            );
+
+        int* getToOrder()
+        {
+            return toOrder;
+        }
+
+        int* getFromOrder()
+        {
+            return fromOrder;
+        }
+
+        BlockedMatrix<block_size>* getRMat()
+        {
+            return rmat.get();
+        }
+
+    };
+
+} // end namespace bda
+
+#endif
+
--- a/opm/simulators/linalg/bda/BdaBridge.cpp
+++ b/opm/simulators/linalg/bda/BdaBridge.cpp
@ -35,11 +35,31 @@ typedef Dune::InverseOperatorResult InverseOperatorResult;
 namespace Opm
 {

-BdaBridge::BdaBridge(bool use_gpu_, int linear_solver_verbosity, int maxit, double tolerance)
-    : use_gpu(use_gpu_)
+    using bda::BdaResult;
+    using bda::BdaSolver;
+    using bda::SolverStatus;
+
+template <class BridgeMatrix, class BridgeVector, int block_size>
+BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string gpu_mode, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID)
 {
-    if (use_gpu) {
-        backend.reset(new cusparseSolverBackend(linear_solver_verbosity, maxit, tolerance));
+    if (gpu_mode.compare("cusparse") == 0) {
+#if HAVE_CUDA
+        use_gpu = true;
+        backend.reset(new bda::cusparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, deviceID));
+#else
+        OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake");
+#endif
+    } else if (gpu_mode.compare("opencl") == 0) {
+#if HAVE_OPENCL
+        use_gpu = true;
+        backend.reset(new bda::openclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID));
+#else
+        OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
+#endif
+    } else if (gpu_mode.compare("none") == 0) {
+        use_gpu = false;
+    } else {
+        OPM_THROW(std::logic_error, "Error unknown value for parameter 'GpuMode', should be passed like '--gpu-mode=[none|cusparse|opencl]");
    }
 }

@ -112,8 +132,8 @@ void getSparsityPattern(BridgeMatrix& mat, std::vector<int> &h_rows, std::vector
 } // end getSparsityPattern()


-template <class BridgeMatrix, class BridgeVector>
-void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_UNUSED, WellContributions& wellContribs OPM_UNUSED, InverseOperatorResult &res OPM_UNUSED)
+template <class BridgeMatrix, class BridgeVector, int block_size>
+void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_UNUSED, WellContributions& wellContribs OPM_UNUSED, InverseOperatorResult &res OPM_UNUSED)
 {

    if (use_gpu) {
@ -159,21 +179,20 @@ void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_U
        /////////////////////////
        // actually solve

-        typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus;
        // assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, cusparseSolver is expected to perform undefined behaviour
-        cusparseSolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), wellContribs, result);
+        SolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), wellContribs, result);
        switch(status) {
-        case cusparseSolverStatus::CUSPARSE_SOLVER_SUCCESS:
-            //OpmLog::info("cusparseSolver converged");
+        case SolverStatus::BDA_SOLVER_SUCCESS:
+            //OpmLog::info("BdaSolver converged");
            break;
-        case cusparseSolverStatus::CUSPARSE_SOLVER_ANALYSIS_FAILED:
-            OpmLog::warning("cusparseSolver could not analyse level information of matrix, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
+        case SolverStatus::BDA_SOLVER_ANALYSIS_FAILED:
+            OpmLog::warning("BdaSolver could not analyse level information of matrix, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
            break;
-        case cusparseSolverStatus::CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED:
-            OpmLog::warning("cusparseSolver could not create preconditioner, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
+        case SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED:
+            OpmLog::warning("BdaSolver could not create preconditioner, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
            break;
        default:
-            OpmLog::warning("cusparseSolver returned unknown status code");
+            OpmLog::warning("BdaSolver returned unknown status code");
        }

        res.iterations = result.iterations;
@ -187,21 +206,30 @@ void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_U
 }


-template <class BridgeVector>
-void BdaBridge::get_result(BridgeVector &x OPM_UNUSED) {
+template <class BridgeMatrix, class BridgeVector, int block_size>
+void BdaBridge<BridgeMatrix, BridgeVector, block_size>::get_result(BridgeVector &x OPM_UNUSED) {
    if (use_gpu) {
-        backend->post_process(static_cast<double*>(&(x[0][0])));
+        backend->get_result(static_cast<double*>(&(x[0][0])));
    }
 }

-#define INSTANTIATE_BDA_FUNCTIONS(n)                                                                  \
-template void BdaBridge::solve_system                                                                 \
-(Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >*, \
-    Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >&, \
-    WellContributions&, InverseOperatorResult&);                                                      \
-                                                                                                      \
-template void BdaBridge::get_result                                                                   \
-(Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >&);   \
+#define INSTANTIATE_BDA_FUNCTIONS(n)                                                                                                \
+template BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >,              \
+Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >,                                    \
+n>::BdaBridge                                                                                                                       \
+(std::string gpu_mode_, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID);  \
+                                                                                                                                    \
+template void BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >,         \
+Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >,                                    \
+n>::solve_system                                                                                                                    \
+(Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >*,                               \
+    Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >&,                               \
+    WellContributions&, InverseOperatorResult&);                                                                                    \
+                                                                                                                                    \
+template void BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >,         \
+Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >,                                    \
+n>::get_result                                                                                                                      \
+(Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >&);                                 \

 INSTANTIATE_BDA_FUNCTIONS(1);
 INSTANTIATE_BDA_FUNCTIONS(2);
@ -210,6 +238,6 @@ INSTANTIATE_BDA_FUNCTIONS(4);

 #undef INSTANTIATE_BDA_FUNCTIONS

-}
+} // namespace Opm


--- a/opm/simulators/linalg/bda/BdaBridge.hpp
+++ b/opm/simulators/linalg/bda/BdaBridge.hpp
@ -20,12 +20,6 @@
 #ifndef BDABRIDGE_HEADER_INCLUDED
 #define BDABRIDGE_HEADER_INCLUDED

-#include <config.h>
-
-#if ! HAVE_CUDA
-  #error "This file should only be included if CUDA is found"
-#endif
-
 #include "dune/istl/solver.hh" // for struct InverseOperatorResult

 #include "dune/istl/bcrsmatrix.hh"
@ -33,28 +27,36 @@

 #include <opm/simulators/linalg/bda/WellContributions.hpp>

+#if HAVE_CUDA
 #include <opm/simulators/linalg/bda/cusparseSolverBackend.hpp>
+#endif
+
+#if HAVE_OPENCL
+#include <opm/simulators/linalg/bda/openclSolverBackend.hpp>
+#endif

 namespace Opm
 {

 typedef Dune::InverseOperatorResult InverseOperatorResult;

-/// BdaBridge acts as interface between opm-simulators with the cusparseSolver
-/// if CUDA was not found during CMake, function bodies of this class are empty
+/// BdaBridge acts as interface between opm-simulators with the BdaSolvers
+template <class BridgeMatrix, class BridgeVector, int block_size>
 class BdaBridge
 {
 private:
-    std::unique_ptr<cusparseSolverBackend> backend;
-    bool use_gpu;
+    std::unique_ptr<bda::BdaSolver<block_size> > backend;
+    bool use_gpu = false;

 public:
    /// Construct a BdaBridge
-    /// \param[in] use_gpu                    true iff the cusparseSolver is used, is passed via command-line: '--use-gpu=[true|false]'
-    /// \param[in] linear_solver_verbosity    verbosity of cusparseSolver
-    /// \param[in] maxit                      maximum number of iterations for cusparseSolver
-    /// \param[in] tolerance                  required relative tolerance for cusparseSolver
-    BdaBridge(bool use_gpu, int linear_solver_verbosity, int maxit, double tolerance);
+    /// \param[in] gpu_mode                   to select if a gpu solver is used, is passed via command-line: '--gpu-mode=[none|cusparse|opencl]'
+    /// \param[in] linear_solver_verbosity    verbosity of BdaSolver
+    /// \param[in] maxit                      maximum number of iterations for BdaSolver
+    /// \param[in] tolerance                  required relative tolerance for BdaSolver
+    /// \param[in] platformID                 the OpenCL platform ID to be used
+    /// \param[in] deviceID                   the device ID to be used by the cusparse- and openclSolvers, too high values could cause runtime errors
+    BdaBridge(std::string gpu_mode, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID);


    /// Solve linear system, A*x = b
@ -63,12 +65,10 @@ public:
    /// \param[in] b            vector b, should be of type Dune::BlockVector
    /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
    /// \param[inout] result    summary of solver result
-    template <class BridgeMatrix, class BridgeVector>
    void solve_system(BridgeMatrix *mat, BridgeVector &b, WellContributions& wellContribs, InverseOperatorResult &result);

    /// Get the resulting x vector
    /// \param[inout] x    vector x, should be of type Dune::BlockVector
-    template <class BridgeVector>
    void get_result(BridgeVector &x);

    /// Return whether the BdaBridge will use the GPU or not
--- a/opm/simulators/linalg/bda/BdaResult.hpp
+++ b/opm/simulators/linalg/bda/BdaResult.hpp
@ -20,7 +20,7 @@
 #ifndef BDARESULT_HEADER_INCLUDED
 #define BDARESULT_HEADER_INCLUDED

-namespace Opm
+namespace bda
 {

 /// This class is based on InverseOperatorResult struct from dune/istl/solver.hh
--- a/opm/simulators/linalg/bda/BdaSolver.hpp
+++ b/opm/simulators/linalg/bda/BdaSolver.hpp
@ -0,0 +1,93 @@
+/*
+  Copyright 2019 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef OPM_BDASOLVER_BACKEND_HEADER_INCLUDED
+#define OPM_BDASOLVER_BACKEND_HEADER_INCLUDED
+
+
+#include <opm/simulators/linalg/bda/BdaResult.hpp>
+#include <opm/simulators/linalg/bda/WellContributions.hpp>
+
+namespace bda
+{
+
+    using Opm::WellContributions;
+
+    enum class SolverStatus {
+        BDA_SOLVER_SUCCESS,
+        BDA_SOLVER_ANALYSIS_FAILED,
+        BDA_SOLVER_CREATE_PRECONDITIONER_FAILED,
+        BDA_SOLVER_UNKNOWN_ERROR
+    };
+
+    /// This class serves to simplify choosing between different backend solvers, such as cusparseSolver and openclSolver
+    /// This class is abstract, no instantiations can of it can be made, only of its children
+    template <unsigned int block_size>
+    class BdaSolver
+    {
+
+    protected:
+
+        // verbosity
+        // 0: print nothing during solves, only when initializing
+        // 1: print number of iterations and final norm
+        // 2: also print norm each iteration
+        // 3: also print timings of different backend functions
+
+        int verbosity = 0;
+
+        int maxit = 200;
+        double tolerance = 1e-2;
+
+        int N;           // number of rows
+        int Nb;          // number of blocked rows (Nb*block_size == N)
+        int nnz;         // number of nonzeroes (scalars)
+        int nnzb;        // number of nonzero blocks (nnzb*block_size*block_size == nnz)
+
+        unsigned int platformID = 0; // ID of OpenCL platform to be used, only used by openclSolver now
+        unsigned int deviceID = 0;   // ID of the device to be used
+
+        bool initialized = false;
+
+    public:
+
+        /// Construct a BdaSolver, can be cusparseSolver or openclSolver
+        /// \param[in] linear_solver_verbosity    verbosity of solver
+        /// \param[in] maxit                      maximum number of iterations for solver
+        /// \param[in] tolerance                  required relative tolerance for solver
+        /// \param[in] platformID                 the OpenCL platform to be used, only used in openclSolver
+        /// \param[in] deviceID                   the device to be used
+        BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), deviceID(deviceID_) {};
+        BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), platformID(platformID_), deviceID(deviceID_) {};
+
+        /// Define virtual destructor, so that the derivedclass destructor will be called
+        virtual ~BdaSolver() {};
+
+        /// Define as pure virtual functions, so derivedclass must implement them
+        virtual SolverStatus solve_system(int N, int nnz, int dim,
+            double *vals, int *rows, int *cols,
+            double *b, WellContributions& wellContribs, BdaResult &res) = 0;
+
+        virtual void get_result(double *x) = 0;
+
+    }; // end class BdaSolver
+
+} // end namespace bda
+
+#endif
--- a/opm/simulators/linalg/bda/BlockedMatrix.cpp
+++ b/opm/simulators/linalg/bda/BlockedMatrix.cpp
@ -0,0 +1,109 @@
+/*
+  Copyright 2019 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <cstring>
+#include <cmath>
+
+#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
+
+using bda::BlockedMatrix;
+
+namespace bda
+{
+
+/*Sort a row of matrix elements from a blocked CSR-format.*/
+
+template <unsigned int block_size>
+void sortBlockedRow(int *colIndices, double *data, int left, int right) {
+    const unsigned int bs = block_size;
+    int l = left;
+    int r = right;
+    int middle = colIndices[(l + r) >> 1];
+    double lDatum[bs * bs];
+    do {
+        while (colIndices[l] < middle)
+            l++;
+        while (colIndices[r] > middle)
+            r--;
+        if (l <= r) {
+            int lColIndex = colIndices[l];
+            colIndices[l] = colIndices[r];
+            colIndices[r] = lColIndex;
+            memcpy(lDatum, data + l * bs * bs, sizeof(double) * bs * bs);
+            memcpy(data + l * bs * bs, data + r * bs * bs, sizeof(double) * bs * bs);
+            memcpy(data + r * bs * bs, lDatum, sizeof(double) * bs * bs);
+
+            l++;
+            r--;
+        }
+    } while (l < r);
+
+    if (left < r)
+        sortBlockedRow<bs>(colIndices, data, left, r);
+
+    if (right > l)
+        sortBlockedRow<bs>(colIndices, data, l, right);
+}
+
+
+// LUMat->nnzValues[ik] = LUMat->nnzValues[ik] - (pivot * LUMat->nnzValues[jk]) in ilu decomposition
+// a = a - (b * c)
+template <unsigned int block_size>
+void blockMultSub(double *a, double *b, double *c)
+{
+    for (unsigned int row = 0; row < block_size; row++) {
+        for (unsigned int col = 0; col < block_size; col++) {
+            double temp = 0.0;
+            for (unsigned int k = 0; k < block_size; k++) {
+                temp += b[block_size * row + k] * c[block_size * k + col];
+            }
+            a[block_size * row + col] -= temp;
+        }
+    }
+}
+
+/*Perform a 3x3 matrix-matrix multiplicationj on two blocks*/
+
+template <unsigned int block_size>
+void blockMult(double *mat1, double *mat2, double *resMat) {
+    for (unsigned int row = 0; row < block_size; row++) {
+        for (unsigned int col = 0; col < block_size; col++) {
+            double temp = 0;
+            for (unsigned int k = 0; k < block_size; k++) {
+                temp += mat1[block_size * row + k] * mat2[block_size * k + col];
+            }
+            resMat[block_size * row + col] = temp;
+        }
+    }
+}
+
+
+#define INSTANTIATE_BDA_FUNCTIONS(n)                                 \
+template void sortBlockedRow<n>(int *, double *, int, int);          \
+template void blockMultSub<n>(double *, double *, double *);         \
+template void blockMult<n>(double *, double *, double *);            \
+
+INSTANTIATE_BDA_FUNCTIONS(1);
+INSTANTIATE_BDA_FUNCTIONS(2);
+INSTANTIATE_BDA_FUNCTIONS(3);
+INSTANTIATE_BDA_FUNCTIONS(4);
+
+#undef INSTANTIATE_BDA_FUNCTIONS
+
+} // end namespace bda
--- a/opm/simulators/linalg/bda/BlockedMatrix.hpp
+++ b/opm/simulators/linalg/bda/BlockedMatrix.hpp
@ -0,0 +1,114 @@
+/*
+  Copyright 2019 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef BLOCKED_MATRIX_HPP
+#define BLOCKED_MATRIX_HPP
+
+namespace bda
+{
+
+/// This struct resembles a blocked csr matrix, like Dune::BCRSMatrix.
+/// The data is stored in contiguous memory, such that they can be copied to a device in one transfer.
+template<int BS>
+struct BlockedMatrix{
+    /// Allocate BlockedMatrix and data arrays with given sizes
+    /// \param[in] Nb               number of blockrows
+    /// \param[in] nnzbs            number of nonzero blocks
+    BlockedMatrix(int Nb_, int nnzbs_)
+    : nnzValues(new double[nnzbs_*BS*BS]),
+      colIndices(new int[nnzbs_*BS*BS]),
+      rowPointers(new int[Nb_+1]),
+      Nb(Nb_),
+      nnzbs(nnzbs_),
+      deleteNnzs(true),
+      deleteSparsity(true)
+    {}
+    /// Allocate BlockedMatrix, but copy sparsity pattern instead of allocating new memory
+    /// \param[in] M              matrix to be copied
+    BlockedMatrix(const BlockedMatrix& M)
+    : nnzValues(new double[M.nnzbs*BS*BS]),
+      colIndices(M.colIndices),
+      rowPointers(M.rowPointers),
+      Nb(M.Nb),
+      nnzbs(M.nnzbs),
+      deleteNnzs(true),
+      deleteSparsity(false)
+    {}
+    /// Allocate BlockedMatrix, but let data arrays point to existing arrays
+    /// \param[in] Nb             number of blockrows
+    /// \param[in] nnzbs          number of nonzero blocks
+    /// \param[in] nnzValues      array of nonzero values, contains nnzb*BS*BS scalars
+    /// \param[in] colIndices     array of column indices, contains nnzb entries
+    /// \param[in] rowPointers    array of row pointers, contains Nb+1 entries
+    BlockedMatrix(int Nb_, int nnzbs_, double *nnzValues_, int *colIndices_, int *rowPointers_)
+    : nnzValues(nnzValues_),
+      colIndices(colIndices_),
+      rowPointers(rowPointers_),
+      Nb(Nb_),
+      nnzbs(nnzbs_),
+      deleteNnzs(false),
+      deleteSparsity(false)
+    {}
+
+    ~BlockedMatrix(){
+        if (deleteNnzs) {
+            delete[] nnzValues;
+        }
+        if (deleteSparsity) {
+            delete[] colIndices;
+            delete[] rowPointers;
+        }
+    }
+
+    double *nnzValues;
+    int *colIndices;
+    int *rowPointers;
+    int Nb;
+    int nnzbs;
+    bool deleteNnzs;
+    bool deleteSparsity;
+};
+
+
+/// Sort a row of matrix elements from a blocked CSR-format
+/// \param[inout] colIndices     
+/// \param[inout] data           
+/// \param[in] left              lower index of data of row
+/// \param[in] right             upper index of data of row
+template <unsigned int block_size>
+void sortBlockedRow(int *colIndices, double *data, int left, int right);
+
+/// Multiply and subtract blocks
+/// a = a - (b * c)
+/// \param[inout] a              block to be subtracted from
+/// \param[in] b                 input block
+/// \param[in] c                 input block
+template <unsigned int block_size>
+void blockMultSub(double *a, double *b, double *c);
+
+/// Perform a 3x3 matrix-matrix multiplication on two blocks
+/// \param[in] mat1              input block 1
+/// \param[in] mat2              input block 2
+/// \param[inout] resMat         output block
+template <unsigned int block_size>
+void blockMult(double *mat1, double *mat2, double *resMat);
+
+} // end namespace bda
+
+#endif
--- a/opm/simulators/linalg/bda/MultisegmentWellContribution.cpp
+++ b/opm/simulators/linalg/bda/MultisegmentWellContribution.cpp
@ -18,7 +18,6 @@
 */


-#include <cstdlib>
 #include <config.h> // CMake

 #if HAVE_UMFPACK
@ -30,94 +29,110 @@
 namespace Opm
 {

-        MultisegmentWellContribution::MultisegmentWellContribution(unsigned int dim_, unsigned int dim_wells_,
-            unsigned int Nb_, unsigned int Mb_,
-            unsigned int BnumBlocks_, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
-            unsigned int DnumBlocks_, double *Dvalues, int *DcolPointers, int *DrowIndices,
-            std::vector<double> &Cvalues)
-        :
-            dim(dim_),                // size of blockvectors in vectors x and y, equal to MultisegmentWell::numEq
-            dim_wells(dim_wells_),    // size of blocks in C, B and D, equal to MultisegmentWell::numWellEq
-            N(Nb_*dim),               // number of rows in vectors x and y, N == dim*Nb
-            Nb(Nb_),                  // number of blockrows in x and y
-            M(Mb_*dim_wells),         // number of rows, M == dim_wells*Mb
-            Mb(Mb_),                  // number of blockrows in C, D and B
-            DnumBlocks(DnumBlocks_),  // number of blocks in D
-            BnumBlocks(BnumBlocks_),  // number of blocks in C and B
-            // copy data for matrix D into vectors to prevent it going out of scope
-            Dvals(Dvalues, Dvalues + DnumBlocks*dim_wells*dim_wells),
-            Dcols(DcolPointers, DcolPointers + M + 1),
-            Drows(DrowIndices, DrowIndices + DnumBlocks*dim_wells*dim_wells)
-        {
-            Cvals = std::move(Cvalues);
-            Bvals = std::move(Bvalues);
-            Bcols = std::move(BcolIndices);
-            Brows = std::move(BrowPointers);
+MultisegmentWellContribution::MultisegmentWellContribution(unsigned int dim_, unsigned int dim_wells_,
+        unsigned int Nb_, unsigned int Mb_,
+        unsigned int BnumBlocks_, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
+        unsigned int DnumBlocks_, double *Dvalues, int *DcolPointers, int *DrowIndices,
+        std::vector<double> &Cvalues)
+    :
+    dim(dim_),                // size of blockvectors in vectors x and y, equal to MultisegmentWell::numEq
+    dim_wells(dim_wells_),    // size of blocks in C, B and D, equal to MultisegmentWell::numWellEq
+    N(Nb_ * dim),             // number of rows in vectors x and y, N == dim*Nb
+    Nb(Nb_),                  // number of blockrows in x and y
+    M(Mb_ * dim_wells),       // number of rows, M == dim_wells*Mb
+    Mb(Mb_),                  // number of blockrows in C, D and B
+    DnumBlocks(DnumBlocks_),  // number of blocks in D
+    BnumBlocks(BnumBlocks_),  // number of blocks in C and B
+    // copy data for matrix D into vectors to prevent it going out of scope
+    Dvals(Dvalues, Dvalues + DnumBlocks * dim_wells * dim_wells),
+    Dcols(DcolPointers, DcolPointers + M + 1),
+    Drows(DrowIndices, DrowIndices + DnumBlocks * dim_wells * dim_wells)
+{
+    Cvals = std::move(Cvalues);
+    Bvals = std::move(Bvalues);
+    Bcols = std::move(BcolIndices);
+    Brows = std::move(BrowPointers);

-            z1.resize(Mb * dim_wells);
-            z2.resize(Mb * dim_wells);
+    z1.resize(Mb * dim_wells);
+    z2.resize(Mb * dim_wells);

-            umfpack_di_symbolic(M, M, Dcols.data(), Drows.data(), Dvals.data(), &UMFPACK_Symbolic, nullptr, nullptr);
-            umfpack_di_numeric(Dcols.data(), Drows.data(), Dvals.data(), UMFPACK_Symbolic, &UMFPACK_Numeric, nullptr, nullptr);
-        }
+    umfpack_di_symbolic(M, M, Dcols.data(), Drows.data(), Dvals.data(), &UMFPACK_Symbolic, nullptr, nullptr);
+    umfpack_di_numeric(Dcols.data(), Drows.data(), Dvals.data(), UMFPACK_Symbolic, &UMFPACK_Numeric, nullptr, nullptr);
+}

-        MultisegmentWellContribution::~MultisegmentWellContribution()
-        {
-            umfpack_di_free_symbolic(&UMFPACK_Symbolic);
-            umfpack_di_free_numeric(&UMFPACK_Numeric);
-        }
+MultisegmentWellContribution::~MultisegmentWellContribution()
+{
+    umfpack_di_free_symbolic(&UMFPACK_Symbolic);
+    umfpack_di_free_numeric(&UMFPACK_Numeric);
+}


-        // Apply the MultisegmentWellContribution, similar to MultisegmentWell::apply()
-        // h_x and h_y reside on host
-        // y -= (C^T * (D^-1 * (B * x)))
-        void MultisegmentWellContribution::apply(double *h_x, double *h_y)
-        {
-            // reset z1 and z2
-            std::fill(z1.begin(), z1.end(), 0.0);
-            std::fill(z2.begin(), z2.end(), 0.0);
+// Apply the MultisegmentWellContribution, similar to MultisegmentWell::apply()
+// h_x and h_y reside on host
+// y -= (C^T * (D^-1 * (B * x)))
+void MultisegmentWellContribution::apply(double *h_x, double *h_y)
+{
+    // reset z1 and z2
+    std::fill(z1.begin(), z1.end(), 0.0);
+    std::fill(z2.begin(), z2.end(), 0.0);

-            // z1 = B * x
-            for (unsigned int row = 0; row < Mb; ++row) {
-                // for every block in the row
-                for (unsigned int blockID = Brows[row]; blockID < Brows[row+1]; ++blockID) {
-                    unsigned int colIdx = Bcols[blockID];
-                    for (unsigned int j = 0; j < dim_wells; ++j) {
-                        double temp = 0.0;
-                        for (unsigned int k = 0; k < dim; ++k) {
-                            temp += Bvals[blockID * dim * dim_wells + j * dim + k] * h_x[colIdx * dim + k];
-                        }
-                        z1[row * dim_wells + j] += temp;
-                    }
-                }
-            }
-
-            // z2 = D^-1 * (B * x)
-            // umfpack
-            umfpack_di_solve(UMFPACK_A, Dcols.data(), Drows.data(), Dvals.data(), z2.data(), z1.data(), UMFPACK_Numeric, nullptr, nullptr);
-
-            // y -= (C^T * z2)
-            // y -= (C^T * (D^-1 * (B * x)))
-            for (unsigned int row = 0; row < Mb; ++row) {
-                // for every block in the row
-                for (unsigned int blockID = Brows[row]; blockID < Brows[row+1]; ++blockID) {
-                    unsigned int colIdx = Bcols[blockID];
-                    for (unsigned int j = 0; j < dim; ++j) {
-                        double temp = 0.0;
-                        for (unsigned int k = 0; k < dim_wells; ++k) {
-                            temp += Cvals[blockID * dim * dim_wells + j + k * dim] * z2[row * dim_wells + k];
-                        }
-                        h_y[colIdx * dim + j] -= temp;
-                    }
+    // z1 = B * x
+    for (unsigned int row = 0; row < Mb; ++row) {
+        // for every block in the row
+        for (unsigned int blockID = Brows[row]; blockID < Brows[row + 1]; ++blockID) {
+            unsigned int colIdx = getColIdx(Bcols[blockID]);
+            for (unsigned int j = 0; j < dim_wells; ++j) {
+                double temp = 0.0;
+                for (unsigned int k = 0; k < dim; ++k) {
+                    temp += Bvals[blockID * dim * dim_wells + j * dim + k] * h_x[colIdx * dim + k];
                }
+                z1[row * dim_wells + j] += temp;
            }
        }
+    }

-        void MultisegmentWellContribution::setCudaStream(cudaStream_t stream_)
-        {
-            stream = stream_;
+    // z2 = D^-1 * (B * x)
+    // umfpack
+    umfpack_di_solve(UMFPACK_A, Dcols.data(), Drows.data(), Dvals.data(), z2.data(), z1.data(), UMFPACK_Numeric, nullptr, nullptr);
+
+    // y -= (C^T * z2)
+    // y -= (C^T * (D^-1 * (B * x)))
+    for (unsigned int row = 0; row < Mb; ++row) {
+        // for every block in the row
+        for (unsigned int blockID = Brows[row]; blockID < Brows[row + 1]; ++blockID) {
+            unsigned int colIdx = getColIdx(Bcols[blockID]);
+            for (unsigned int j = 0; j < dim; ++j) {
+                double temp = 0.0;
+                for (unsigned int k = 0; k < dim_wells; ++k) {
+                    temp += Cvals[blockID * dim * dim_wells + j + k * dim] * z2[row * dim_wells + k];
+                }
+                h_y[colIdx * dim + j] -= temp;
+            }
        }
+    }
+}

+#if HAVE_CUDA
+void MultisegmentWellContribution::setCudaStream(cudaStream_t stream_)
+{
+    stream = stream_;
+}
+#endif
+
+unsigned int MultisegmentWellContribution::getColIdx(unsigned int idx)
+{
+    if (reorder) {
+        return toOrder[idx];
+    } else {
+        return idx;
+    }
+}
+
+void MultisegmentWellContribution::setReordering(int *toOrder_, bool reorder_)
+{
+    this->toOrder = toOrder_;
+    this->reorder = reorder_;
+}

 } //namespace Opm

--- a/opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
+++ b/opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
@ -20,89 +20,103 @@
 #ifndef MULTISEGMENTWELLCONTRIBUTION_HEADER_INCLUDED
 #define MULTISEGMENTWELLCONTRIBUTION_HEADER_INCLUDED

-#include <config.h>
-
 #include <vector>

+#if HAVE_CUDA
 #include <cuda_runtime.h>
+#endif

 namespace Opm
 {

-    /// This class serves to duplicate the functionality of the MultisegmentWell
-    /// A MultisegmentWell uses C, D and B and performs y -= (C^T * (D^-1 * (B*x)))
-    /// B and C are matrices, with M rows and N columns, where N is the size of the matrix. They contain blocks of MultisegmentWell::numEq by MultisegmentWell::numWellEq.
-    /// D is a MxM matrix, the square blocks have size MultisegmentWell::numWellEq. 
-    /// B*x and D*B*x are a vector with M*numWellEq doubles
-    /// C*D*B*x is a vector with N*numEq doubles.
-    
-    class MultisegmentWellContribution
-    {
+/// This class serves to duplicate the functionality of the MultisegmentWell
+/// A MultisegmentWell uses C, D and B and performs y -= (C^T * (D^-1 * (B*x)))
+/// B and C are matrices, with M rows and N columns, where N is the size of the matrix. They contain blocks of MultisegmentWell::numEq by MultisegmentWell::numWellEq.
+/// D is a MxM matrix, the square blocks have size MultisegmentWell::numWellEq.
+/// B*x and D*B*x are a vector with M*numWellEq doubles
+/// C*D*B*x is a vector with N*numEq doubles.

-    private:
-        unsigned int dim;                        // size of blockvectors in vectors x and y, equal to MultisegmentWell::numEq
-        unsigned int dim_wells;                  // size of blocks in C, B and D, equal to MultisegmentWell::numWellEq
-        unsigned int N;                          // number of rows in vectors x and y, N == dim*Nb
-        unsigned int Nb;                         // number of blockrows in x and y
-        unsigned int M;                          // number of rows, M == dim_wells*Mb
-        unsigned int Mb;                         // number of blockrows in C, D and B
+class MultisegmentWellContribution
+{

-        cudaStream_t stream; // not actually used yet, will be when MultisegmentWellContribution are applied on GPU
+private:
+    unsigned int dim;                        // size of blockvectors in vectors x and y, equal to MultisegmentWell::numEq
+    unsigned int dim_wells;                  // size of blocks in C, B and D, equal to MultisegmentWell::numWellEq
+    unsigned int N;                          // number of rows in vectors x and y, N == dim*Nb
+    unsigned int Nb;                         // number of blockrows in x and y
+    unsigned int M;                          // number of rows, M == dim_wells*Mb
+    unsigned int Mb;                         // number of blockrows in C, D and B

-        // C and B are stored in BCRS format, D is stored in CSC format (Dune::UMFPack)
-        // Sparsity pattern for C is not stored, since it is the same as B
-        unsigned int DnumBlocks;             // number of blocks in D
-        unsigned int BnumBlocks;             // number of blocks in C and B
-        std::vector<double> Cvals;
-        std::vector<double> Dvals;
-        std::vector<double> Bvals;
-        std::vector<int> Dcols;              // Columnpointers, contains M+1 entries
-        std::vector<unsigned int> Bcols;
-        std::vector<int> Drows;              // Rowindicies, contains DnumBlocks*dim*dim_wells entries
-        std::vector<unsigned int> Brows;
-        std::vector<double> z1;          // z1 = B * x
-        std::vector<double> z2;          // z2 = D^-1 * B * x
-        void *UMFPACK_Symbolic, *UMFPACK_Numeric;
+#if HAVE_CUDA
+    cudaStream_t stream; // not actually used yet, will be when MultisegmentWellContribution are applied on GPU
+#endif

+    // C and B are stored in BCRS format, D is stored in CSC format (Dune::UMFPack)
+    // Sparsity pattern for C is not stored, since it is the same as B
+    unsigned int DnumBlocks;             // number of blocks in D
+    unsigned int BnumBlocks;             // number of blocks in C and B
+    std::vector<double> Cvals;
+    std::vector<double> Dvals;
+    std::vector<double> Bvals;
+    std::vector<int> Dcols;              // Columnpointers, contains M+1 entries
+    std::vector<unsigned int> Bcols;
+    std::vector<int> Drows;              // Rowindicies, contains DnumBlocks*dim*dim_wells entries
+    std::vector<unsigned int> Brows;
+    std::vector<double> z1;          // z1 = B * x
+    std::vector<double> z2;          // z2 = D^-1 * B * x
+    void *UMFPACK_Symbolic, *UMFPACK_Numeric;

-    public:
+    int *toOrder = nullptr;
+    bool reorder = false;

-        /// Set a cudaStream to be used
-        /// \param[in] stream           the cudaStream that is used
-        void setCudaStream(cudaStream_t stream);
+    /// Translate the columnIndex if needed
+    /// Some preconditioners reorder the rows of the matrix, this means the columnIndices of the wellcontributions need to be reordered as well
+    unsigned int getColIdx(unsigned int idx);

-        /// Create a new MultisegmentWellContribution
-        /// Matrices C and B are passed in Blocked CSR, matrix D in CSC
-        /// The variables representing C, B and D will go out of scope when MultisegmentWell::addWellContribution() ends
-        /// \param[in] dim              size of blocks in blockvectors x and y, equal to MultisegmentWell::numEq
-        /// \param[in] dim_wells        size of blocks of C, B and D, equal to MultisegmentWell::numWellEq
-        /// \param[in] Nb               number of blocks in vectors x and y
-        /// \param[in] Mb               number of blockrows in C, B and D
-        /// \param[in] BnumBlocks       number of blocks in C and B
-        /// \param[in] Bvalues          nonzero values of matrix B
-        /// \param[in] BcolIndices      columnindices of blocks of matrix B
-        /// \param[in] BrowPointers     rowpointers of matrix B
-        /// \param[in] DnumBlocks       number of blocks in D
-        /// \param[in] Dvalues          nonzero values of matrix D
-        /// \param[in] DcolPointers     columnpointers of matrix D
-        /// \param[in] DrowIndices      rowindices of matrix D
-        /// \param[in] Cvalues          nonzero values of matrix C
-        MultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
-            unsigned int Nb, unsigned int Mb,
-            unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
-            unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
-            std::vector<double> &Cvalues);
+public:

-        /// Destroy a MultisegmentWellContribution, and free memory
-        ~MultisegmentWellContribution();
+#if HAVE_CUDA
+    /// Set a cudaStream to be used
+    /// \param[in] stream           the cudaStream that is used
+    void setCudaStream(cudaStream_t stream);
+#endif

-        /// Apply the MultisegmentWellContribution on CPU
-        /// performs y -= (C^T * (D^-1 * (B*x))) for MultisegmentWell
-        /// \param[in] h_x          vector x, must be on CPU
-        /// \param[inout] h_y       vector y, must be on CPU
-        void apply(double *h_x, double *h_y);
+    /// Create a new MultisegmentWellContribution
+    /// Matrices C and B are passed in Blocked CSR, matrix D in CSC
+    /// The variables representing C, B and D will go out of scope when MultisegmentWell::addWellContribution() ends
+    /// \param[in] dim              size of blocks in blockvectors x and y, equal to MultisegmentWell::numEq
+    /// \param[in] dim_wells        size of blocks of C, B and D, equal to MultisegmentWell::numWellEq
+    /// \param[in] Nb               number of blocks in vectors x and y
+    /// \param[in] Mb               number of blockrows in C, B and D
+    /// \param[in] BnumBlocks       number of blocks in C and B
+    /// \param[in] Bvalues          nonzero values of matrix B
+    /// \param[in] BcolIndices      columnindices of blocks of matrix B
+    /// \param[in] BrowPointers     rowpointers of matrix B
+    /// \param[in] DnumBlocks       number of blocks in D
+    /// \param[in] Dvalues          nonzero values of matrix D
+    /// \param[in] DcolPointers     columnpointers of matrix D
+    /// \param[in] DrowIndices      rowindices of matrix D
+    /// \param[in] Cvalues          nonzero values of matrix C
+    MultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
+                                 unsigned int Nb, unsigned int Mb,
+                                 unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
+                                 unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
+                                 std::vector<double> &Cvalues);

-    };
+    /// Destroy a MultisegmentWellContribution, and free memory
+    ~MultisegmentWellContribution();
+
+    /// Apply the MultisegmentWellContribution on CPU
+    /// performs y -= (C^T * (D^-1 * (B*x))) for MultisegmentWell
+    /// \param[in] h_x          vector x, must be on CPU
+    /// \param[inout] h_y       vector y, must be on CPU
+    void apply(double *h_x, double *h_y);
+
+    /// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
+    /// Those indices need to be mapped via toOrder
+    /// \param[in] toOrder    array with mappings
+    void setReordering(int *toOrder, bool reorder);
+};

 } //namespace Opm

--- a/opm/simulators/linalg/bda/Reorder.cpp
+++ b/opm/simulators/linalg/bda/Reorder.cpp
@ -0,0 +1,374 @@
+/*
+  Copyright 2019 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <vector>
+#include <cstring>
+#include <algorithm> // for fill()
+#include <random>
+#include <limits>
+#include <sstream>
+
+#include <opm/common/ErrorMacros.hpp>
+
+#include <opm/simulators/linalg/bda/Reorder.hpp>
+#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
+
+namespace bda
+{
+
+
+/* Give every node in the matrix (of which only the sparsity pattern in the
+ * form of row pointers and column indices arrays are in the input), a color
+ * in the colors array. Also return the amount of colors in the return integer. 
+ * This graph-coloring algorithm is based on the Jones-Plassmann algorithm, proposed in:
+ * "A Parallel Graph Coloring Heuristic" by M.T. Jones and P.E. Plassmann in SIAM Journal of Scientific Computing 14 (1993) */
+
+template <unsigned int block_size>
+int colorBlockedNodes(int rows, const int *CSRRowPointers, const int *CSRColIndices, const int *CSCColPointers, const int *CSCRowIndices, std::vector<int>& colors, int maxRowsPerColor, int maxColsPerColor)
+{
+    int left, c;
+    const int max_tries = 100;            // since coloring is random, it is possible that a coloring fails. In that case, try again.
+    std::vector<int> randoms;
+    randoms.resize(rows);
+
+    std::vector<bool> visitedColumns;
+    visitedColumns.resize(rows);
+    std::fill(visitedColumns.begin(), visitedColumns.end(), false);
+
+    unsigned int colsInColor;
+    unsigned int additionalColsInRow;
+
+    for (unsigned int t = 0; t < max_tries; t++) {
+        // (re)initialize data for coloring process
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_int_distribution<> uniform(0, std::numeric_limits<int>::max());
+        {
+            for(int i = 0; i < rows; ++i){
+                randoms[i] = uniform(gen);
+            }
+        }
+        std::fill(colors.begin(), colors.end(), -1);
+
+        // actually perform coloring
+        for (c = 0; c < MAX_COLORS; c++) {
+            unsigned int rowsInColor = 0;
+            colsInColor = 0;
+            for (int i = 0; i < rows; i++)
+            {
+                bool iMax = true; // true iff you have max random
+
+                // ignore nodes colored earlier
+                if ((colors[i] != -1))
+                    continue;
+
+                int ir = randoms[i];
+
+                // look at all nodex that node i is connected to
+                for (int k = CSRRowPointers[i]; k < CSRRowPointers[i + 1]; k++) {
+                    // ignore nodes colored earlier (and yourself)
+                    int j = CSRColIndices[k];
+                    int jc = colors[j];
+                    if (((jc != -1) && (jc != c)) || (i == j)) {
+                        continue;
+                    }
+                    // node i is not in the current color if one of its neighbours shares this color, 
+                    if (jc == c) {
+                        iMax = false;
+                        break;
+                    }
+                    // or if one of its uncolored neighbours has a higher random value 
+                    int jr = randoms[j];
+                    if (ir <= jr) {
+                        iMax = false;
+                        break;
+                    }
+                }
+                // look at all nodes that have a connection to node i
+                for (int k = CSCColPointers[i]; k < CSCColPointers[i + 1]; k++) {
+                    // ignore nodes colored earlier (and yourself)
+                    int j = CSCRowIndices[k];
+                    int jc = colors[j];
+                    if (((jc != -1) && (jc != c)) || (i == j)) {
+                        continue;
+                    }
+                    // node i is not in the current color if one of its neighbours shares this color, 
+                    if (jc == c) {
+                        iMax = false;
+                        break;
+                    }
+                    // or if one of its uncolored neighbours has a higher random value 
+                    int jr = randoms[j];
+                    if (ir <= jr) {
+                        iMax = false;
+                        break;
+                    }
+                }
+
+                // assign color if you have the maximum random number
+                if (iMax) {
+                    additionalColsInRow = 0;
+                    for (int k = CSRRowPointers[i]; k < CSRRowPointers[i + 1]; k++) {
+                        int j = CSRColIndices[k];
+                        if (!visitedColumns[j]) {
+                            visitedColumns[j] = true;
+                            additionalColsInRow += block_size;
+                        }
+                    }
+                    if ((colsInColor + additionalColsInRow) > static_cast<unsigned int>(maxColsPerColor)) {
+                        break;
+                    }
+                    colsInColor += additionalColsInRow;
+                    colors[i] = c;
+                    rowsInColor += block_size;
+                    if ((rowsInColor + block_size - 1) >= static_cast<unsigned int>(maxRowsPerColor)) {
+                        break;
+                    }
+                }
+
+            }
+            // Check if graph coloring is done.
+            left = 0;
+            for (int k = 0; k < rows; k++) {
+                if (colors[k] == -1) {
+                    left++;
+                }
+            }
+            if (left == 0) {
+                return c + 1;
+            }
+        }
+    }
+
+    std::ostringstream oss;
+    oss << "Error could not find a graph coloring with " << c << " colors after " << max_tries << " tries.\nNumber of colorless nodes: " << left;
+    OPM_THROW(std::logic_error, oss.str());
+    return -1;
+}
+
+
+/* Reorder a matrix by a specified input order.
+ * Both a to order array, which contains for every node from the old matrix where it will move in the new matrix,
+ * and the from order, which contains for every node in the new matrix where it came from in the old matrix.*/
+
+template <unsigned int block_size>
+void reorderBlockedMatrixByPattern(BlockedMatrix<block_size> *mat, int *toOrder, int *fromOrder, BlockedMatrix<block_size> *rmat) {
+    const unsigned int bs = block_size;
+    int rIndex = 0;
+    int i, k;
+    unsigned int j;
+
+    rmat->rowPointers[0] = 0;
+    for (i = 0; i < mat->Nb; i++) {
+        int thisRow = fromOrder[i];
+        // put thisRow from the old matrix into row i of the new matrix
+        rmat->rowPointers[i + 1] = rmat->rowPointers[i] + mat->rowPointers[thisRow + 1] - mat->rowPointers[thisRow];
+        for (k = mat->rowPointers[thisRow]; k < mat->rowPointers[thisRow + 1]; k++) {
+            for (j = 0; j < bs * bs; j++){
+                rmat->nnzValues[rIndex * bs * bs + j] = mat->nnzValues[k * bs * bs + j];
+            }
+            rmat->colIndices[rIndex] = mat->colIndices[k];
+            rIndex++;
+        }
+    }
+    // re-assign column indices according to the new positions of the nodes referenced by the column indices
+    for (i = 0; i < mat->nnzbs; i++) {
+        rmat->colIndices[i] = toOrder[rmat->colIndices[i]];
+    }
+    // re-sort the column indices of every row.
+    for (i = 0; i < mat->Nb; i++) {
+        sortBlockedRow<bs>(rmat->colIndices, rmat->nnzValues, rmat->rowPointers[i], rmat->rowPointers[i + 1] - 1);
+    }
+}
+
+/* Reorder a matrix according to the colors that every node of the matrix has received*/
+
+void colorsToReordering(int Nb, std::vector<int>& colors, int numColors, int *toOrder, int *fromOrder, std::vector<int>& rowsPerColor) {
+    int reordered = 0;
+    int i, c;
+
+    // Find reordering patterns
+    for (c = 0; c < numColors; c++) {
+        for (i = 0; i < Nb; i++) {
+            if (colors[i] == c) {
+                rowsPerColor[c]++;
+                toOrder[i] = reordered;
+                fromOrder[reordered] = i;
+                reordered++;
+            }
+        }
+    }
+}
+
+// Reorder a vector according to a reordering pattern
+
+template <unsigned int block_size>
+void reorderBlockedVectorByPattern(int Nb, double *vector, int *fromOrder, double *rVector) {
+    for (int i = 0; i < Nb; i++) {
+        for (unsigned int j = 0; j < block_size; j++) {
+            rVector[block_size * i + j] = vector[block_size * fromOrder[i] + j];
+        }
+    }
+}
+
+
+/* Check is operations on a node in the matrix can be started
+ * A node can only be started if all nodes that it depends on during sequential execution have already completed.*/
+
+bool canBeStarted(const int rowIndex, const int *rowPointers, const int *colIndices, const std::vector<bool>& doneRows) {
+    bool canStart = !doneRows[rowIndex];
+    int i, thisDependency;
+    if (canStart) {
+        for (i = rowPointers[rowIndex]; i < rowPointers[rowIndex + 1]; i++) {
+            thisDependency = colIndices[i];
+            // Only dependencies on rows that should execute before the current one are relevant
+            if (thisDependency >= rowIndex)
+                break;
+            // Check if dependency has been resolved
+            if (!doneRows[thisDependency]) {
+                return false;
+            }
+        }
+    }
+    return canStart;
+}
+
+/*
+ * The level scheduling of a non-symmetric, blocked matrix requires access to a CSC encoding and a CSR encoding of the sparsity pattern of the input matrix.
+ * This function is based on a standard level scheduling algorithm, like the one described in:
+ * "Iterative methods for Sparse Linear Systems" by Yousef Saad in section 11.6.3 
+ */
+
+void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowIndices, int *CSCColPointers, int Nb, int *numColors, int *toOrder, int* fromOrder, std::vector<int>& rowsPerColor) {
+    int activeRowIndex = 0, colorEnd, nextActiveRowIndex = 0;
+    int thisRow;
+    std::vector<bool> doneRows(Nb, false);
+    rowsPerColor.reserve(Nb);
+
+    std::vector <int> rowsToStart;
+
+    // find starting rows: rows that are independent from all rows that come before them.
+    for (thisRow = 0; thisRow < Nb; thisRow++) {
+        if (canBeStarted(thisRow, CSCColPointers, CSCRowIndices, doneRows)) {
+            fromOrder[nextActiveRowIndex] = thisRow;
+            toOrder[thisRow] = nextActiveRowIndex;
+            nextActiveRowIndex++;
+        }
+    }
+    // 'do' compute on all active rows
+    for (colorEnd = 0; colorEnd < nextActiveRowIndex; colorEnd++) {
+        doneRows[fromOrder[colorEnd]] = true;
+    }
+
+    rowsPerColor.emplace_back(nextActiveRowIndex - activeRowIndex);
+
+    while (colorEnd < Nb) {
+        // Go over all rows active from the last color, and check which of their neighbours can be activated this color
+        for (; activeRowIndex < colorEnd; activeRowIndex++) {
+            thisRow = fromOrder[activeRowIndex];
+
+            for (int i = CSCColPointers[thisRow]; i < CSCColPointers[thisRow + 1]; i++) {
+                int thatRow = CSCRowIndices[i];
+
+                if (canBeStarted(thatRow, CSRRowPointers, CSRColIndices, doneRows)) {
+                    rowsToStart.emplace_back(thatRow);
+                }
+            }
+        }
+        // 'do' compute on all active rows
+        for (unsigned int i = 0; i < rowsToStart.size(); i++) {
+            thisRow = rowsToStart[i];
+            if (!doneRows[thisRow]) {
+                doneRows[thisRow] = true;
+                fromOrder[nextActiveRowIndex] = thisRow;
+                toOrder[thisRow] = nextActiveRowIndex;
+                nextActiveRowIndex++;
+            }
+        }
+        colorEnd = nextActiveRowIndex;
+        rowsPerColor.emplace_back(nextActiveRowIndex - activeRowIndex);
+    }
+
+    *numColors = rowsPerColor.size();
+}
+
+/* Perform the complete graph coloring algorithm on a matrix. Return an array with the amount of nodes per color.*/
+
+template <unsigned int block_size>
+void findGraphColoring(const int *CSRColIndices, const int *CSRRowPointers, const int *CSCRowIndices, const int *CSCColPointers, int Nb, int maxRowsPerColor, int maxColsPerColor, int *numColors, int *toOrder, int *fromOrder, std::vector<int>& rowsPerColor) {
+    std::vector<int> rowColor;
+    rowColor.resize(Nb);
+
+    *numColors = colorBlockedNodes<block_size>(Nb, CSRRowPointers, CSRColIndices, CSCColPointers, CSCRowIndices, rowColor, maxRowsPerColor, maxColsPerColor);
+
+    rowsPerColor.resize(*numColors);
+    colorsToReordering(Nb, rowColor, *numColors, toOrder, fromOrder, rowsPerColor);
+}
+
+// based on the scipy package from python, scipy/sparse/sparsetools/csr.h on github
+void csrPatternToCsc(int *CSRColIndices, int *CSRRowPointers, int *CSCRowIndices, int *CSCColPointers, int Nb) {
+
+    int nnz = CSRRowPointers[Nb];
+
+    // compute number of nnzs per column
+    std::fill(CSCColPointers, CSCColPointers + Nb, 0);
+
+    for (int n = 0; n < nnz; ++n) {
+        CSCColPointers[CSRColIndices[n]]++;
+    }
+
+    // cumsum the nnz per col to get CSCColPointers
+    for (int col = 0, cumsum = 0; col < Nb; ++col) {
+        int temp = CSCColPointers[col];
+        CSCColPointers[col] = cumsum;
+        cumsum += temp;
+    }
+    CSCColPointers[Nb] = nnz;
+
+    for (int row = 0; row < Nb; ++row) {
+        for (int j = CSRRowPointers[row]; j < CSRRowPointers[row + 1]; ++j) {
+            int col = CSRColIndices[j];
+            int dest = CSCColPointers[col];
+            CSCRowIndices[dest] = row;
+            CSCColPointers[col]++;
+        }
+    }
+
+    for (int col = 0, last = 0; col <= Nb; ++col) {
+        int temp = CSCColPointers[col];
+        CSCColPointers[col] = last;
+        last = temp;
+    }
+}
+
+
+#define INSTANTIATE_BDA_FUNCTIONS(n)                                                                                                            \
+template int colorBlockedNodes<n>(int, const int *, const int *, const int *, const int *, std::vector<int>&, int, int);                        \
+template void reorderBlockedMatrixByPattern<n>(BlockedMatrix<n> *, int *, int *, BlockedMatrix<n> *);                                           \
+template void reorderBlockedVectorByPattern<n>(int, double*, int*, double*);                                                                    \
+template void findGraphColoring<n>(const int *, const int *, const int *, const int *, int, int, int, int *, int *, int *, std::vector<int>&);  \
+
+INSTANTIATE_BDA_FUNCTIONS(1);
+INSTANTIATE_BDA_FUNCTIONS(2);
+INSTANTIATE_BDA_FUNCTIONS(3);
+INSTANTIATE_BDA_FUNCTIONS(4);
+
+#undef INSTANTIATE_BDA_FUNCTIONS
+
+} //namespace bda
--- a/opm/simulators/linalg/bda/Reorder.hpp
+++ b/opm/simulators/linalg/bda/Reorder.hpp
@ -0,0 +1,124 @@
+/*
+  Copyright 2019 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef REORDER_HPP
+#define REORDER_HPP
+
+#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
+
+namespace bda
+{
+
+#define MAX_COLORS 256
+
+/// Give every node in the matrix a color so that no neighbouring nodes share a color
+/// The color array must be allocated already
+/// This function with throw an error if no coloring can be found within the given restrictions
+/// This function does graph coloring based on random numbers
+/// \param[in] rows            number of rows in the matrix
+/// \param[in] CSRRowPointers  array of row pointers of the sparsity pattern stored in the CSR format
+/// \param[in] CSRColIndices   array of column indices of the sparsity pattern stored in the CSR format
+/// \param[in] CSCColPointers  array of column pointers of the sparsity pattern stored in the CSC format
+/// \param[in] CSCRowIndices   array of row indices of the sparsity pattern stored in the CSC format
+/// \param[inout] colors       output array containing the number of the color that each row is assigned to
+/// \param[in] maxRowsPerColor the maximum number of rows that are allowed in one color (so: the maximum number of nodes per color)
+/// \param[in] maxColsPerColor the maximum number of columns that the rows in a color are allowed to share (so: the maximum number of nodes that the nodes in one color may be connected to)
+/// \return                    the number of colors needed for the coloring
+template <unsigned int block_size>
+int colorBlockedNodes(int rows, const int *CSRRowPointers, const int *CSRColIndices, const int *CSCColPointers, const int *CSCRowIndices, std::vector<int>& colors, int maxRowsPerColor, int maxColsPerColor);
+
+/// Reorder the rows of the matrix according to the mapping in toOrder and fromOrder
+/// rMat must be allocated already
+/// \param[in] mat           matrix to be reordered
+/// \param[in] toOrder       reorder pattern that lists for each index in the original order, to which index in the new order it should be moved
+/// \param[in] fromOrder     reorder pattern that lists for each index in the new order, from which index in the original order it was moved
+/// \param[inout] rMat       reordered Matrix 
+template <unsigned int block_size>
+void reorderBlockedMatrixByPattern(BlockedMatrix<block_size> *mat, int *toOrder, int *fromOrder, BlockedMatrix<block_size> *rmat);
+
+/// Compute reorder mapping from the color that each node has received
+/// The toOrder, fromOrder and iters arrays must be allocated already
+/// \param[in] Nb              number of blocks in the vector
+/// \param[in] colors          array containing the number of the color that each row is assigned to
+/// \param[in] numColors       the total number of colors into which all rows have been divided
+/// \param[inout] toOrder      reorder pattern that lists for each index in the original order, to which index in the new order it should be moved
+/// \param[inout] fromOrder    reorder pattern that lists for each index in the new order, from which index in the original order it was moved
+/// \param[inout] rowsPerColor array containing for each color the number of rows that it contains
+void colorsToReordering(int Nb, std::vector<int>& colors, int numColors, int *toOrder, int *fromOrder, std::vector<int>& rowsPerColor);
+
+/// Reorder a vector according to the mapping in fromOrder
+/// The rVector array must be allocated already
+/// \param[in] Nb            number of blocks in the vector
+/// \param[in] vector        vector to be reordered
+/// \param[in] toOrder       reorder pattern that lists for each index in the original order, to which index in the new order it should be moved
+/// \param[in] fromOrder     reorder pattern that lists for each index in the new order, from which index in the original order it was moved
+/// \param[inout] rVector    reordered vector
+template <unsigned int block_size>
+void reorderBlockedVectorByPattern(int Nb, double *vector, int *fromOrder, double *rVector);
+
+/// Determine whether all rows that a certain row depends on are done already
+/// \param[in] rowIndex      index of the row that needs to be checked for
+/// \param[in] rowPointers   row pointers of the matrix that the row is in
+/// \param[in] colIndices    column indices of the matrix that the row is in
+/// \param[in] doneRows      array that for each row lists whether it is done or not
+/// \return                  true iff all dependencies are done and if the result itself was not done yet
+bool canBeStarted(const int rowIndex, const  int *rowPointers, const  int *colIndices, const std::vector<bool>& doneRows);
+
+/// Find a level scheduling reordering for an input matrix
+/// The toOrder and fromOrder   arrays must be allocated already
+/// \param[in] CSRColIndices    column indices array, obtained from storing the input matrix in the CSR format
+/// \param[in] CSRRowPointers   row pointers array, obtained from storing the input matrix in the CSR format
+/// \param[in] CSCRowIndices    row indices array, obtained from storing the input matrix in the CSC format
+/// \param[in] CSCColPointers   column pointers array, obtained from storing the input matrix in the CSC format
+/// \param[in] Nb               number of blockrows in the matrix
+/// \param[out] numColors       a pointer to the number of colors needed for the level scheduling
+/// \param[inout] toOrder       the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved
+/// \param[inout] fromOrder     the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved
+/// \param[inout] rowsPerColor  for each used color, the number of rows assigned to that color
+void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowIndices, int *CSCColPointers, int Nb, int *numColors, int *toOrder, int* fromOrder, std::vector<int>& rowsPerColor);
+
+/// Find a graph coloring reordering for an input matrix
+/// The toOrder and fromOrder arrays must be allocated already
+/// \param[in] CSRColIndices    column indices of the input sparsity pattern stored in the CSR format
+/// \param[in] CSRRowPointers   row pointers of the input sparsity pattern stored in the CSR format
+/// \param[in] CSCRowIndices    row indices of the input sparsity pattern stored in the CSC format
+/// \param[in] CSCColPointers   column pointers of the input sparsity pattern stored in the CSC format
+/// \param[in] Nb               number of blockrows in the matrix
+/// \param[in] maxRowsPerColor  the maximum number of rows that are allowed in one color (so: the maximum number of nodes per color)
+/// \param[in] maxColsPerColor  the maximum number of columns that the rows in a color are allowed to share (so: the maximum number of nodes that the nodes in one color may be connected to)
+/// \param[out] numColors       the number of colors used in the found graph coloring
+/// \param[inout] toOrder       the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved
+/// \param[inout] fromOrder     the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved
+/// \param[inout] rowsPerColor  for each used color, the number of rows assigned to that color
+template <unsigned int block_size>
+void findGraphColoring(const int *CSRColIndices, const int *CSRRowPointers, const int *CSCRowIndices, const int *CSCColPointers, int Nb, int maxRowsPerColor, int maxColsPerColor, int *numColors, int *toOrder, int *fromOrder, std::vector<int>& rowsPerColor);
+
+/// Convert a sparsity pattern stored in the CSR format to the CSC format
+/// CSCRowIndices and CSCColPointers arrays must be allocated already
+/// Based on the csr_tocsc() function from the scipy package from python, https://github.com/scipy/scipy/blob/master/scipy/sparse/sparsetools/csr.h
+/// \param[in] CSRColIndices     column indices of the CSR representation of the pattern
+/// \param[in] CSRRowPointers    row pointers of the CSR representation of the pattern
+/// \param[inout] CSCRowIndices  row indices of the result CSC representation of the pattern
+/// \param[inout] CSCColPointers column pointers of the result CSC representation of the pattern
+/// \param[in] Nb                number of blockrows in the matrix
+void csrPatternToCsc(int *CSRColIndices, int *CSRRowPointers, int *CSCRowIndices, int *CSCColPointers, int Nb);
+
+}
+
+#endif
--- a/opm/simulators/linalg/bda/WellContributions.cpp
+++ b/opm/simulators/linalg/bda/WellContributions.cpp
@ -0,0 +1,150 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#include <config.h> // CMake
+#include <cstdlib>
+#include <cstring>
+
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+#include "opm/simulators/linalg/bda/WellContributions.hpp"
+
+namespace Opm
+{
+
+
+void WellContributions::alloc()
+{
+    if (num_std_wells > 0) {
+#if HAVE_CUDA
+        allocStandardWells();
+#else
+        OPM_THROW(std::logic_error, "Error cannot allocate on GPU for StandardWells because CUDA was not found by cmake");
+#endif
+    }
+}
+
+WellContributions::~WellContributions()
+{
+    if (h_x) {
+        delete[] h_x;
+        delete[] h_y;
+    }
+
+    // delete MultisegmentWellContributions
+    for (auto ms : multisegments) {
+        delete ms;
+    }
+    multisegments.clear();
+
+#if HAVE_CUDA
+    freeStandardWells();
+#endif
+}
+
+
+#if HAVE_OPENCL
+void WellContributions::apply(cl::Buffer& d_x, cl::Buffer& d_y) {
+
+    // apply MultisegmentWells
+    if (num_ms_wells > 0) {
+        // allocate pinned memory on host if not yet done
+        if (h_x == nullptr) {
+            h_x = new double[N];
+            h_y = new double[N];
+        }
+
+        // copy vectors x and y from GPU to CPU
+        queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, h_x);
+        queue->enqueueReadBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y);
+
+        // actually apply MultisegmentWells
+        for (MultisegmentWellContribution *well : multisegments) {
+            well->apply(h_x, h_y);
+        }
+
+        // copy vector y from CPU to GPU
+        queue->enqueueWriteBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y);
+    }
+
+
+    // apply StandardWells
+    if (num_std_wells > 0) {
+        OPM_THROW(std::logic_error, "Error StandardWells are not supported by openclSolver");
+    }
+}
+#endif
+
+void WellContributions::addMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size)
+{
+#if HAVE_CUDA
+        addMatrixGpu(type, colIndices, values, val_size);
+#else
+        OPM_THROW(std::logic_error, "Error cannot add StandardWell matrix on GPU because CUDA was not found by cmake");
+#endif
+}
+
+
+void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
+{
+    dim = dim_;
+    dim_wells = dim_wells_;
+}
+
+void WellContributions::addNumBlocks(unsigned int nnz)
+{
+    if (allocated) {
+        OPM_THROW(std::logic_error, "Error cannot add more sizes after allocated in WellContributions");
+    }
+    num_blocks += nnz;
+    num_std_wells++;
+}
+
+void WellContributions::addMultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
+        unsigned int Nb, unsigned int Mb,
+        unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
+        unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
+        std::vector<double> &Cvalues)
+{
+    this->N = Nb * dim;
+    MultisegmentWellContribution *well = new MultisegmentWellContribution(dim, dim_wells, Nb, Mb, BnumBlocks, Bvalues, BcolIndices, BrowPointers, DnumBlocks, Dvalues, DcolPointers, DrowIndices, Cvalues);
+    multisegments.emplace_back(well);
+    ++num_ms_wells;
+}
+
+
+void WellContributions::setReordering(int *toOrder_, bool reorder_)
+{
+    this->toOrder = toOrder_;
+    this->reorder = reorder_;
+    for (auto& ms : multisegments) {
+        ms->setReordering(toOrder_, reorder_);
+    }
+}
+
+#if HAVE_OPENCL
+void WellContributions::setOpenCLQueue(cl::CommandQueue *queue_)
+{
+    this->queue = queue_;
+}
+#endif
+
+} //namespace Opm
+
--- a/opm/simulators/linalg/bda/WellContributions.cu
+++ b/opm/simulators/linalg/bda/WellContributions.cu
@ -18,14 +18,12 @@
 */


+#include <config.h> // CMake
 #include <cstdlib>
 #include <cstring>
-#include <config.h> // CMake

-#ifdef __NVCC__
 #include "opm/simulators/linalg/bda/cuda_header.hpp"
 #include <cuda_runtime.h>
-#endif

 #include <opm/common/OpmLog/OpmLog.hpp>
 #include <opm/common/ErrorMacros.hpp>
@ -34,248 +32,208 @@
 namespace Opm
 {

-    // apply WellContributions using y -= C^T * (D^-1 * (B * x))
-    __global__ void apply_well_contributions(
-        const double * __restrict__ Cnnzs,
-        const double * __restrict__ Dnnzs,
-        const double * __restrict__ Bnnzs,
-        const int * __restrict__ Ccols,
-        const int * __restrict__ Bcols,
-        const double * __restrict__ x,
-        double * __restrict__ y,
-        const int dim,
-        const int dim_wells,
-        const unsigned int * __restrict__ val_pointers
-        )
-    {
-        const int idx_b = blockIdx.x;
-        const int idx_t = threadIdx.x;
-        const unsigned int val_size = val_pointers[idx_b+1] - val_pointers[idx_b];
+// apply WellContributions using y -= C^T * (D^-1 * (B * x))
+__global__ void apply_well_contributions(
+    const double * __restrict__ Cnnzs,
+    const double * __restrict__ Dnnzs,
+    const double * __restrict__ Bnnzs,
+    const int * __restrict__ Ccols,
+    const int * __restrict__ Bcols,
+    const double * __restrict__ x,
+    double * __restrict__ y,
+    const int dim,
+    const int dim_wells,
+    const unsigned int * __restrict__ val_pointers
+)
+{
+    const int idx_b = blockIdx.x;
+    const int idx_t = threadIdx.x;
+    const unsigned int val_size = val_pointers[idx_b + 1] - val_pointers[idx_b];

-        const int vals_per_block = dim * dim_wells;        // 12
-        const int num_active_threads = (32/vals_per_block)*vals_per_block; // 24
-        const int num_blocks_per_warp = 32/vals_per_block; // 2
-        const int lane = idx_t % 32;
-        const int c = lane % dim;                           // col in block
-        const int r = (lane / dim) % dim_wells;             // row in block
+    const int vals_per_block = dim * dim_wells;        // 12
+    const int num_active_threads = (32 / vals_per_block) * vals_per_block; // 24
+    const int num_blocks_per_warp = 32 / vals_per_block; // 2
+    const int lane = idx_t % 32;
+    const int c = lane % dim;                           // col in block
+    const int r = (lane / dim) % dim_wells;             // row in block

-        extern __shared__ double smem[];
-        double * __restrict__ z1 = smem;
-        double * __restrict__ z2 = z1 + dim_wells;
-
-        if(idx_t < dim_wells){
-            z1[idx_t] = 0.0;
-        }
-
-        __syncthreads();
-
-        // z1 = B * x
-        if(idx_t < num_active_threads){
-            // multiply all blocks with x
-            double temp = 0.0;
-            int b = idx_t / vals_per_block + val_pointers[idx_b];       // block id, val_size indicates number of blocks
-            while(b < val_size + val_pointers[idx_b]){
-                int colIdx = Bcols[b];
-                temp += Bnnzs[b * dim * dim_wells + r * dim + c] * x[colIdx * dim + c];
-                b += num_blocks_per_warp;
-            }
-
-            // merge all blocks into 1 dim*dim_wells block
-            // since NORNE has only 2 parallel blocks, do not use a loop
-            temp += __shfl_down_sync(0x00ffffff, temp, dim * dim_wells);
-
-            b = idx_t / vals_per_block + val_pointers[idx_b];
-
-            // merge all (dim) columns of 1 block, results in a single 1*dim_wells vector, which is used to multiply with invD
-            if(idx_t < vals_per_block){
-                // should be a loop as well, now only works for dim == 3
-                if(c == 0 || c == 2){temp += __shfl_down_sync(0x00000B6D, temp, 2);}  // add col 2 to col 0
-                if(c == 0 || c == 1){temp += __shfl_down_sync(0x000006DB, temp, 1);}  // add col 1 to col 0
-            }
-
-            // write 1*dim_wells vector to gmem, could be replaced with shfl broadcast to remove z1 altogether
-            if(c == 0 && idx_t < vals_per_block){
-                z1[r] = temp;
-            }
-        }
-
-        __syncthreads();
-
-        // z2 = D^-1 * B * x = D^-1 * z1
-        if(idx_t < dim_wells){
-            double temp = 0.0;
-            for(int c = 0; c < dim_wells; ++c){
-                temp += Dnnzs[idx_b * dim_wells * dim_wells + idx_t * dim_wells + c] * z1[c];
-            }
-            z2[idx_t] = temp;
-        }
-
-        __syncthreads();
-
-        // y -= C^T * D^-1 * B * x
-        // use dim * val_size threads, each block is assigned 'dim' threads
-        if(idx_t < dim * val_size){
-            double temp = 0.0;
-            int b = idx_t / dim + val_pointers[idx_b];
-            int cc = idx_t % dim;
-            int colIdx = Ccols[b];
-            for(unsigned int c = 0; c < dim_wells; ++c){
-                temp += Cnnzs[b * dim * dim_wells + c * dim + cc] * z2[c];
-            }
-            y[colIdx * dim + cc] -= temp;
-        }
+    extern __shared__ double smem[];
+    double * __restrict__ z1 = smem;
+    double * __restrict__ z2 = z1 + dim_wells;

+    if (idx_t < dim_wells) {
+        z1[idx_t] = 0.0;
    }

-    void WellContributions::alloc()
-    {
-        if (num_std_wells > 0) {
-            cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
-            cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells);
-            cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
-            cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
-            cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
-            val_pointers = new unsigned int[num_std_wells + 1];
-            cudaMalloc((void**)&d_val_pointers, sizeof(int) * (num_std_wells + 1));
-            cudaCheckLastError("apply_gpu malloc failed");
-            allocated = true;
+    __syncthreads();
+
+    // z1 = B * x
+    if (idx_t < num_active_threads) {
+        // multiply all blocks with x
+        double temp = 0.0;
+        int b = idx_t / vals_per_block + val_pointers[idx_b];       // block id, val_size indicates number of blocks
+        while (b < val_size + val_pointers[idx_b]) {
+            int colIdx = Bcols[b];
+            temp += Bnnzs[b * dim * dim_wells + r * dim + c] * x[colIdx * dim + c];
+            b += num_blocks_per_warp;
+        }
+
+        // merge all blocks into 1 dim*dim_wells block
+        // since NORNE has only 2 parallel blocks, do not use a loop
+        temp += __shfl_down_sync(0x00ffffff, temp, dim * dim_wells);
+
+        b = idx_t / vals_per_block + val_pointers[idx_b];
+
+        // merge all (dim) columns of 1 block, results in a single 1*dim_wells vector, which is used to multiply with invD
+        if (idx_t < vals_per_block) {
+            // should be a loop as well, now only works for dim == 3
+            if (c == 0 || c == 2) {temp += __shfl_down_sync(0x00000B6D, temp, 2);} // add col 2 to col 0
+            if (c == 0 || c == 1) {temp += __shfl_down_sync(0x000006DB, temp, 1);} // add col 1 to col 0
+        }
+
+        // write 1*dim_wells vector to gmem, could be replaced with shfl broadcast to remove z1 altogether
+        if (c == 0 && idx_t < vals_per_block) {
+            z1[r] = temp;
        }
    }

-    WellContributions::~WellContributions()
-    {
-        // free pinned memory for MultisegmentWellContributions
-        if (h_x) {
-            cudaFreeHost(h_x);
-            cudaFreeHost(h_y);
-        }
+    __syncthreads();

-        // delete MultisegmentWellContributions
-        for (auto ms : multisegments) {
-            delete ms;
-        }
-        multisegments.clear();
-
-        // delete data for StandardWell
-        if (num_std_wells > 0) {
-            cudaFree(d_Cnnzs);
-            cudaFree(d_Dnnzs);
-            cudaFree(d_Bnnzs);
-            cudaFree(d_Ccols);
-            cudaFree(d_Bcols);
-            delete[] val_pointers;
-            cudaFree(d_val_pointers);
+    // z2 = D^-1 * B * x = D^-1 * z1
+    if (idx_t < dim_wells) {
+        double temp = 0.0;
+        for (int c = 0; c < dim_wells; ++c) {
+            temp += Dnnzs[idx_b * dim_wells * dim_wells + idx_t * dim_wells + c] * z1[c];
        }
+        z2[idx_t] = temp;
    }

+    __syncthreads();

-    // Apply the WellContributions, similar to StandardWell::apply()
-    // y -= (C^T *(D^-1*(   B*x)))
-    void WellContributions::apply(double *d_x, double *d_y)
-    {
-        // apply MultisegmentWells
+    // y -= C^T * D^-1 * B * x
+    // use dim * val_size threads, each block is assigned 'dim' threads
+    if (idx_t < dim * val_size) {
+        double temp = 0.0;
+        int b = idx_t / dim + val_pointers[idx_b];
+        int cc = idx_t % dim;
+        int colIdx = Ccols[b];
+        for (unsigned int c = 0; c < dim_wells; ++c) {
+            temp += Cnnzs[b * dim * dim_wells + c * dim + cc] * z2[c];
+        }
+        y[colIdx * dim + cc] -= temp;
+    }

-        // make sure the stream is empty if timing measurements are done
+}
+
+void WellContributions::allocStandardWells()
+{
+    if (num_std_wells > 0) {
+        cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
+        cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells);
+        cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
+        cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
+        cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
+        val_pointers = new unsigned int[num_std_wells + 1];
+        cudaMalloc((void**)&d_val_pointers, sizeof(int) * (num_std_wells + 1));
+        cudaCheckLastError("apply_gpu malloc failed");
+        allocated = true;
+    }
+}
+
+void WellContributions::freeStandardWells() {
+    // delete data for StandardWell
+    if (num_std_wells > 0) {
+        cudaFree(d_Cnnzs);
+        cudaFree(d_Dnnzs);
+        cudaFree(d_Bnnzs);
+        cudaFree(d_Ccols);
+        cudaFree(d_Bcols);
+        delete[] val_pointers;
+        cudaFree(d_val_pointers);
+    }
+}
+
+
+// Apply the WellContributions, similar to StandardWell::apply()
+// y -= (C^T *(D^-1*(   B*x)))
+void WellContributions::apply(double *d_x, double *d_y)
+{
+    // apply MultisegmentWells
+
+    // make sure the stream is empty if timing measurements are done
+    cudaStreamSynchronize(stream);
+
+    if (num_ms_wells > 0) {
+        // allocate pinned memory on host if not yet done
+        if (h_x == nullptr) {
+            cudaMallocHost(&h_x, sizeof(double) * N);
+            cudaMallocHost(&h_y, sizeof(double) * N);
+            host_mem_cuda = true;
+        }
+
+        // copy vectors x and y from GPU to CPU
+        cudaMemcpyAsync(h_x, d_x, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
+        cudaMemcpyAsync(h_y, d_y, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
        cudaStreamSynchronize(stream);

-        if (num_ms_wells > 0) {
-            // allocate pinned memory on host if not yet done
-            if (h_x == nullptr) {
-                cudaMallocHost(&h_x, sizeof(double) * N);
-                cudaMallocHost(&h_y, sizeof(double) * N);
-            }
-
-
-            // copy vectors x and y from GPU to CPU
-            cudaMemcpyAsync(h_x, d_x, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
-            cudaMemcpyAsync(h_y, d_y, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
-            cudaStreamSynchronize(stream);
-
-            // actually apply MultisegmentWells
-            for (MultisegmentWellContribution *well : multisegments) {
-                well->apply(h_x, h_y);
-            }
-
-            // copy vector y from CPU to GPU
-            cudaMemcpyAsync(d_y, h_y, sizeof(double) * N, cudaMemcpyHostToDevice, stream);
-            cudaStreamSynchronize(stream);
+        // actually apply MultisegmentWells
+        for (MultisegmentWellContribution *well : multisegments) {
+            well->apply(h_x, h_y);
        }

-        // apply StandardWells
-        if (num_std_wells > 0) {
-            int smem_size = 2 * sizeof(double) * dim_wells;
-            apply_well_contributions<<<num_std_wells, 32, smem_size, stream>>>(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_x, d_y, dim, dim_wells, d_val_pointers);
-        }
+        // copy vector y from CPU to GPU
+        cudaMemcpyAsync(d_y, h_y, sizeof(double) * N, cudaMemcpyHostToDevice, stream);
+        cudaStreamSynchronize(stream);
    }

+    // apply StandardWells
+    if (num_std_wells > 0) {
+        int smem_size = 2 * sizeof(double) * dim_wells;
+        apply_well_contributions <<< num_std_wells, 32, smem_size, stream>>>(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_x, d_y, dim, dim_wells, d_val_pointers);
+    }
+}

-    void WellContributions::addMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size)
-    {
-        if (!allocated) {
-            OPM_THROW(std::logic_error,"Error cannot add wellcontribution before allocating memory in WellContributions");
-        }

-        switch (type) {
-        case MatrixType::C:
-            cudaMemcpy(d_Cnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
-            cudaMemcpy(d_Ccols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
-            break;
-        case MatrixType::D:
-            cudaMemcpy(d_Dnnzs + num_std_wells_so_far * dim_wells * dim_wells, values, sizeof(double) * dim_wells * dim_wells, cudaMemcpyHostToDevice);
-            break;
-        case MatrixType::B:
-            cudaMemcpy(d_Bnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
-            cudaMemcpy(d_Bcols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
-            val_pointers[num_std_wells_so_far] = num_blocks_so_far;
-            if(num_std_wells_so_far == num_std_wells - 1){
-                val_pointers[num_std_wells] = num_blocks;
-            }
-            cudaMemcpy(d_val_pointers, val_pointers, sizeof(int) * (num_std_wells+1), cudaMemcpyHostToDevice);
-            break;
-        default:
-            OPM_THROW(std::logic_error,"Error unsupported matrix ID for WellContributions::addMatrix()");
-        }
-        cudaCheckLastError("WellContributions::addMatrix() failed");
-        if (MatrixType::B == type) {
-            num_blocks_so_far += val_size;
-            num_std_wells_so_far++;
-        }
+void WellContributions::addMatrixGpu(MatrixType type, int *colIndices, double *values, unsigned int val_size)
+{
+    if (!allocated) {
+        OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
    }

-    void WellContributions::setCudaStream(cudaStream_t stream_)
-    {
-        this->stream = stream_;
-        for(MultisegmentWellContribution *well : multisegments){
-            well->setCudaStream(stream_);
+    switch (type) {
+    case MatrixType::C:
+        cudaMemcpy(d_Cnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
+        cudaMemcpy(d_Ccols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
+        break;
+    case MatrixType::D:
+        cudaMemcpy(d_Dnnzs + num_std_wells_so_far * dim_wells * dim_wells, values, sizeof(double) * dim_wells * dim_wells, cudaMemcpyHostToDevice);
+        break;
+    case MatrixType::B:
+        cudaMemcpy(d_Bnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
+        cudaMemcpy(d_Bcols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
+        val_pointers[num_std_wells_so_far] = num_blocks_so_far;
+        if (num_std_wells_so_far == num_std_wells - 1) {
+            val_pointers[num_std_wells] = num_blocks;
        }
+        cudaMemcpy(d_val_pointers, val_pointers, sizeof(int) * (num_std_wells + 1), cudaMemcpyHostToDevice);
+        break;
+    default:
+        OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributions::addMatrix()");
    }
+    cudaCheckLastError("WellContributions::addMatrix() failed");
+    if (MatrixType::B == type) {
+        num_blocks_so_far += val_size;
+        num_std_wells_so_far++;
+    }
+}

-    void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
-    {
-        dim = dim_;
-        dim_wells = dim_wells_;
-    }
-
-    void WellContributions::addNumBlocks(unsigned int nnz)
-    {
-        if (allocated) {
-            OPM_THROW(std::logic_error,"Error cannot add more sizes after allocated in WellContributions");
-        }
-        num_blocks += nnz;
-        num_std_wells++;
-    }
-
-    void WellContributions::addMultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
-        unsigned int Nb, unsigned int Mb,
-        unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
-        unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
-        std::vector<double> &Cvalues)
-    {
-        this->N = Nb * dim;
-        MultisegmentWellContribution *well = new MultisegmentWellContribution(dim, dim_wells, Nb, Mb, BnumBlocks, Bvalues, BcolIndices, BrowPointers, DnumBlocks, Dvalues, DcolPointers, DrowIndices, Cvalues);
-        multisegments.emplace_back(well);
-        ++num_ms_wells;
+void WellContributions::setCudaStream(cudaStream_t stream_)
+{
+    this->stream = stream_;
+    for (MultisegmentWellContribution *well : multisegments) {
+        well->setCudaStream(stream_);
    }
+}

 } //namespace Opm

--- a/opm/simulators/linalg/bda/WellContributions.hpp
+++ b/opm/simulators/linalg/bda/WellContributions.hpp
@ -20,139 +20,189 @@
 #ifndef WELLCONTRIBUTIONS_HEADER_INCLUDED
 #define WELLCONTRIBUTIONS_HEADER_INCLUDED

-#include <config.h>
+#if HAVE_CUDA
+#include <cuda_runtime.h>
+#endif

-#if ! HAVE_CUDA
-  #error "This file should only be included if CUDA is found"
+#if HAVE_OPENCL
+#include <opm/simulators/linalg/bda/opencl.hpp>
 #endif

 #include <vector>

 #include <opm/simulators/linalg/bda/MultisegmentWellContribution.hpp>

-#include <cuda_runtime.h>

 namespace Opm
 {

-    /// This class serves to eliminate the need to include the WellContributions into the matrix (with --matrix-add-well-contributions=true) for the cusparseSolver
-    /// If the --matrix-add-well-contributions commandline parameter is true, this class should not be used
-    /// So far, StandardWell and MultisegmentWell are supported
-    /// A single instance (or pointer) of this class is passed to the cusparseSolver.
-    /// For StandardWell, this class contains all the data and handles the computation. For MultisegmentWell, the vector 'multisegments' contains all the data. For more information, check the MultisegmentWellContribution class.
+/// This class serves to eliminate the need to include the WellContributions into the matrix (with --matrix-add-well-contributions=true) for the cusparseSolver
+/// If the --matrix-add-well-contributions commandline parameter is true, this class should not be used
+/// So far, StandardWell and MultisegmentWell are supported
+/// StandardWells are only supported for cusparseSolver (CUDA), MultisegmentWells are supported for both cusparseSolver and openclSolver
+/// A single instance (or pointer) of this class is passed to the BdaSolver.
+/// For StandardWell, this class contains all the data and handles the computation. For MultisegmentWell, the vector 'multisegments' contains all the data. For more information, check the MultisegmentWellContribution class.

-    /// A StandardWell uses C, D and B and performs y -= (C^T * (D^-1 * (B*x)))
-    /// B and C are vectors, disguised as matrices and contain blocks of StandardWell::numEq by StandardWell::numStaticWellEq
-    /// D is a block, disguised as matrix, the square block has size StandardWell::numStaticWellEq. D is actually stored as D^-1
-    /// B*x and D*B*x are a vector with numStaticWellEq doubles
-    /// C*D*B*x is a blocked matrix with a symmetric sparsity pattern, contains square blocks with size numEq. For every columnindex i, j in StandardWell::duneB_, there is a block on (i, j) in C*D*B*x.
-    ///
-    /// This class is used in 3 phases:
-    /// - get total size of all wellcontributions that must be stored here
-    /// - allocate memory
-    /// - copy data of wellcontributions
-    class WellContributions
-    {
+/// A StandardWell uses C, D and B and performs y -= (C^T * (D^-1 * (B*x)))
+/// B and C are vectors, disguised as matrices and contain blocks of StandardWell::numEq by StandardWell::numStaticWellEq
+/// D is a block, disguised as matrix, the square block has size StandardWell::numStaticWellEq. D is actually stored as D^-1
+/// B*x and D*B*x are a vector with numStaticWellEq doubles
+/// C*D*B*x is a blocked matrix with a symmetric sparsity pattern, contains square blocks with size numEq. For every columnindex i, j in StandardWell::duneB_, there is a block on (i, j) in C*D*B*x.
+///
+/// This class is used in 3 phases:
+/// - get total size of all wellcontributions that must be stored here
+/// - allocate memory
+/// - copy data of wellcontributions
+class WellContributions
+{

-    private:
-        unsigned int num_blocks = 0;             // total number of blocks in all wells
-        unsigned int dim;                        // number of columns in blocks in B and C, equal to StandardWell::numEq
-        unsigned int dim_wells;                  // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq
-        unsigned int num_std_wells = 0;          // number of StandardWells in this object
-        unsigned int num_ms_wells = 0;           // number of MultisegmentWells in this object, must equal multisegments.size()
-        unsigned int num_blocks_so_far = 0;      // keep track of where next data is written
-        unsigned int num_std_wells_so_far = 0;   // keep track of where next data is written
-        unsigned int *val_pointers = nullptr;    // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
-        unsigned int N;                          // number of rows (not blockrows) in vectors x and y
-        bool allocated = false;
-        std::vector<MultisegmentWellContribution*> multisegments;
-        cudaStream_t stream;
+public:

-        // data for StandardWells, could remain nullptrs if not used
-        double *d_Cnnzs = nullptr;
-        double *d_Dnnzs = nullptr;
-        double *d_Bnnzs = nullptr;
-        int *d_Ccols = nullptr;
-        int *d_Bcols = nullptr;
-        double *d_z1 = nullptr;
-        double *d_z2 = nullptr;
-        unsigned int *d_val_pointers = nullptr;
-
-        double *h_x = nullptr, *h_y = nullptr;  // CUDA pinned memory for GPU memcpy
-
-
-    public:
-
-        /// StandardWell has C, D and B matrices that need to be copied
-        enum class MatrixType {
-            C,
-            D,
-            B
-        };
-
-        /// Set a cudaStream to be used
-        /// \param[in] stream           the cudaStream that is used to launch the kernel in
-        void setCudaStream(cudaStream_t stream);
-
-        /// Create a new WellContributions, implementation is empty
-        WellContributions(){};
-
-        /// Destroy a WellContributions, and free memory
-        ~WellContributions();
-
-        /// Apply all Wells in this object
-        /// performs y -= (C^T * (D^-1 * (B*x))) for all Wells
-        /// \param[in] d_x        vector x, must be on GPU
-        /// \param[inout] d_y     vector y, must be on GPU
-        void apply(double *d_x, double *d_y);
-
-        /// Allocate memory for the StandardWells
-        void alloc();
-
-        /// Indicate how large the blocks of the StandardWell (C and B) are
-        /// \param[in] dim         number of columns
-        /// \param[in] dim_wells   number of rows
-        void setBlockSize(unsigned int dim, unsigned int dim_wells);
-
-        /// Indicate how large the next StandardWell is, this function cannot be called after alloc() is called
-        /// \param[in] numBlocks   number of blocks in C and B of next StandardWell
-        void addNumBlocks(unsigned int numBlocks);
-
-        /// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
-        /// \param[in] type        indicate if C, D or B is sent
-        /// \param[in] colIndices  columnindices of blocks in C or B, ignored for D
-        /// \param[in] values      array of nonzeroes
-        /// \param[in] val_size    number of blocks in C or B, ignored for D
-        void addMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size);
-
-        /// Add a MultisegmentWellContribution, actually creates an object on heap that is destroyed in the destructor
-        /// Matrices C and B are passed in Blocked CSR, matrix D in CSC
-        /// \param[in] dim              size of blocks in vectors x and y, equal to MultisegmentWell::numEq
-        /// \param[in] dim_wells        size of blocks of C, B and D, equal to MultisegmentWell::numWellEq
-        /// \param[in] Nb               number of blocks in vectors x and y
-        /// \param[in] Mb               number of blockrows in C, B and D
-        /// \param[in] BnumBlocks       number of blocks in C and B
-        /// \param[in] Bvalues          nonzero values of matrix B
-        /// \param[in] BcolIndices      columnindices of blocks of matrix B
-        /// \param[in] BrowPointers     rowpointers of matrix B
-        /// \param[in] DnumBlocks       number of blocks in D
-        /// \param[in] Dvalues          nonzero values of matrix D
-        /// \param[in] DcolPointers     columnpointers of matrix D
-        /// \param[in] DrowIndices      rowindices of matrix D
-        /// \param[in] Cvalues          nonzero values of matrix C
-        void addMultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
-            unsigned int Nb, unsigned int Mb,
-            unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
-            unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
-            std::vector<double> &Cvalues);
-
-        /// Return the number of wells added to this object
-        /// \return the number of wells added to this object
-        unsigned int getNumWells(){
-            return num_std_wells + num_ms_wells;
-        }
+    /// StandardWell has C, D and B matrices that need to be copied
+    enum class MatrixType {
+        C,
+        D,
+        B
    };

+private:
+    unsigned int num_blocks = 0;             // total number of blocks in all wells
+    unsigned int dim;                        // number of columns in blocks in B and C, equal to StandardWell::numEq
+    unsigned int dim_wells;                  // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq
+    unsigned int num_std_wells = 0;          // number of StandardWells in this object
+    unsigned int num_ms_wells = 0;           // number of MultisegmentWells in this object, must equal multisegments.size()
+    unsigned int num_blocks_so_far = 0;      // keep track of where next data is written
+    unsigned int num_std_wells_so_far = 0;   // keep track of where next data is written
+    unsigned int *val_pointers = nullptr;    // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
+    unsigned int N;                          // number of rows (not blockrows) in vectors x and y
+    bool allocated = false;
+    std::vector<MultisegmentWellContribution*> multisegments;
+
+#if HAVE_CUDA
+    cudaStream_t stream;
+#endif
+
+#if HAVE_OPENCL
+    cl::CommandQueue *queue = nullptr;
+#endif
+
+#if HAVE_CUDA
+    // data for StandardWells, could remain nullptrs if not used
+    // StandardWells are only supported for cusparseSolver now
+    double *d_Cnnzs = nullptr;
+    double *d_Dnnzs = nullptr;
+    double *d_Bnnzs = nullptr;
+    int *d_Ccols = nullptr;
+    int *d_Bcols = nullptr;
+    double *d_z1 = nullptr;
+    double *d_z2 = nullptr;
+    unsigned int *d_val_pointers = nullptr;
+#endif
+
+    double *h_x = nullptr, *h_y = nullptr;  // CUDA pinned memory for GPU memcpy
+    bool host_mem_cuda = false;             // true iff h_x and h_y are allocated by cudaMallocHost(), so they need to be freed using cudaFreeHost()
+
+    int *toOrder = nullptr;
+    bool reorder = false;
+
+    /// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
+    /// \param[in] type        indicate if C, D or B is sent
+    /// \param[in] colIndices  columnindices of blocks in C or B, ignored for D
+    /// \param[in] values      array of nonzeroes
+    /// \param[in] val_size    number of blocks in C or B, ignored for D
+    void addMatrixGpu(MatrixType type, int *colIndices, double *values, unsigned int val_size);
+
+    /// Allocate GPU memory for StandardWells
+    void allocStandardWells();
+
+    /// Free GPU memory from StandardWells
+    void freeStandardWells();
+
+public:
+
+    /// Set a cudaStream to be used
+    /// \param[in] stream           the cudaStream that is used to launch the kernel in
+#if HAVE_CUDA
+    void setCudaStream(cudaStream_t stream);
+#endif
+
+    /// Create a new WellContributions, implementation is empty
+    WellContributions() {};
+
+    /// Destroy a WellContributions, and free memory
+    ~WellContributions();
+
+    /// Apply all Wells in this object
+    /// performs y -= (C^T * (D^-1 * (B*x))) for all Wells
+    /// \param[in] d_x        vector x, must be on GPU
+    /// \param[inout] d_y     vector y, must be on GPU
+#if HAVE_CUDA
+    void apply(double *d_x, double *d_y);
+#endif
+#if HAVE_OPENCL
+    void apply(cl::Buffer& x, cl::Buffer& y);
+#endif
+
+    /// Allocate memory for the StandardWells
+    void alloc();
+
+    /// Indicate how large the blocks of the StandardWell (C and B) are
+    /// \param[in] dim         number of columns
+    /// \param[in] dim_wells   number of rows
+    void setBlockSize(unsigned int dim, unsigned int dim_wells);
+
+    /// Indicate how large the next StandardWell is, this function cannot be called after alloc() is called
+    /// \param[in] numBlocks   number of blocks in C and B of next StandardWell
+    void addNumBlocks(unsigned int numBlocks);
+
+    /// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
+    /// \param[in] type        indicate if C, D or B is sent
+    /// \param[in] colIndices  columnindices of blocks in C or B, ignored for D
+    /// \param[in] values      array of nonzeroes
+    /// \param[in] val_size    number of blocks in C or B, ignored for D
+    void addMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size);
+
+    /// Add a MultisegmentWellContribution, actually creates an object on heap that is destroyed in the destructor
+    /// Matrices C and B are passed in Blocked CSR, matrix D in CSC
+    /// \param[in] dim              size of blocks in vectors x and y, equal to MultisegmentWell::numEq
+    /// \param[in] dim_wells        size of blocks of C, B and D, equal to MultisegmentWell::numWellEq
+    /// \param[in] Nb               number of blocks in vectors x and y
+    /// \param[in] Mb               number of blockrows in C, B and D
+    /// \param[in] BnumBlocks       number of blocks in C and B
+    /// \param[in] Bvalues          nonzero values of matrix B
+    /// \param[in] BcolIndices      columnindices of blocks of matrix B
+    /// \param[in] BrowPointers     rowpointers of matrix B
+    /// \param[in] DnumBlocks       number of blocks in D
+    /// \param[in] Dvalues          nonzero values of matrix D
+    /// \param[in] DcolPointers     columnpointers of matrix D
+    /// \param[in] DrowIndices      rowindices of matrix D
+    /// \param[in] Cvalues          nonzero values of matrix C
+    void addMultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
+                                         unsigned int Nb, unsigned int Mb,
+                                         unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
+                                         unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
+                                         std::vector<double> &Cvalues);
+
+    /// Return the number of wells added to this object
+    /// \return the number of wells added to this object
+    unsigned int getNumWells() {
+        return num_std_wells + num_ms_wells;
+    }
+
+
+    /// If the rows of the matrix are reordered, the columnindices of the matrixdata are incorrect
+    /// Those indices need to be mapped via toOrder
+    /// \param[in] toOrder    array with mappings
+    /// \param[in] reorder    whether the columnindices need to be reordered or not
+    void setReordering(int *toOrder, bool reorder);
+
+#if HAVE_OPENCL
+    /// This object copies some cl::Buffers, it requires a CommandQueue to do that
+    /// \param[in] queue      the opencl commandqueue to be used
+    void setOpenCLQueue(cl::CommandQueue *queue);
+#endif
+};
+
 } //namespace Opm

 #endif
--- a/opm/simulators/linalg/bda/cuda_header.hpp
+++ b/opm/simulators/linalg/bda/cuda_header.hpp
@ -21,7 +21,10 @@
 #define CUDA_HEADER_HEADER_INCLUDED

 #include <cuda_runtime.h>
-#include <iostream>
+#include <sstream>
+
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>

 /// Runtime error checking of CUDA functions
 /// Usage:
@ -33,9 +36,10 @@
 inline void __cudaCheckError(const char *file, const int line, const char *msg){
    cudaError err = cudaGetLastError();
    if (cudaSuccess != err){
-        std::cerr << "cudaCheckError() failed at " << file << ":" << line << ": " << cudaGetErrorString(err) << std::endl;
-        std::cerr << "BDA error message: " << msg << std::endl;
-        exit(1);
+        std::ostringstream out;
+        out << cudaGetErrorString(err) << "\n";
+        out << "BDA error message: " << msg << "\n";
+        OPM_THROW(std::logic_error, out.str());
    }
 }

--- a/opm/simulators/linalg/bda/cusparseSolverBackend.cu
+++ b/opm/simulators/linalg/bda/cusparseSolverBackend.cu
@ -17,18 +17,13 @@
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef __NVCC__
-    #error "Cannot compile for cusparse: NVIDIA compiler not found"
-#endif
+#include <config.h>

-#include <cstdio>
-#include <cstdlib>
 #include <cuda_runtime.h>
-#include <iostream>
-#include <sys/time.h>
 #include <sstream>

 #include <opm/common/OpmLog/OpmLog.hpp>
+#include <dune/common/timer.hh>

 #include <opm/simulators/linalg/bda/cusparseSolverBackend.hpp>
 #include <opm/simulators/linalg/bda/BdaResult.hpp>
@ -42,490 +37,478 @@
 // otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
 #define COPY_ROW_BY_ROW 0

-namespace Opm
+namespace bda
 {

-    const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
-    const cusparseOperation_t operation  = CUSPARSE_OPERATION_NON_TRANSPOSE;
-    const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;
+using Opm::OpmLog;
+using Dune::Timer;

-    double second(void) {
-        struct timeval tv;
-        gettimeofday(&tv, nullptr);
-        return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
+const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
+const cusparseOperation_t operation  = CUSPARSE_OPERATION_NON_TRANSPOSE;
+const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;
+
+
+template <unsigned int block_size>
+cusparseSolverBackend<block_size>::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int deviceID_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, deviceID_) {}
+
+template <unsigned int block_size>
+cusparseSolverBackend<block_size>::~cusparseSolverBackend() {
+    finalize();
+}
+
+template <unsigned int block_size>
+void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
+    Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
+    int n = N;
+    double rho = 1.0, rhop;
+    double alpha, nalpha, beta;
+    double omega, nomega, tmp1, tmp2;
+    double norm, norm_0;
+    double zero = 0.0;
+    double one  = 1.0;
+    double mone = -1.0;
+    float it;
+
+    if (wellContribs.getNumWells() > 0) {
+        wellContribs.setCudaStream(stream);
    }

-    cusparseSolverBackend::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_) : verbosity(verbosity_), maxit(maxit_), tolerance(tolerance_), minit(0) {
+    cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
+
+    cublasDscal(cublasHandle, n, &mone, d_r, 1);
+    cublasDaxpy(cublasHandle, n, &one, d_b, 1, d_r, 1);
+    cublasDcopy(cublasHandle, n, d_r, 1, d_rw, 1);
+    cublasDcopy(cublasHandle, n, d_r, 1, d_p, 1);
+    cublasDnrm2(cublasHandle, n, d_r, 1, &norm_0);
+
+    if (verbosity > 1) {
+        std::ostringstream out;
+        out << std::scientific << "cusparseSolver initial norm: " << norm_0;
+        OpmLog::info(out.str());
    }

-    cusparseSolverBackend::~cusparseSolverBackend() {
-        finalize();
-    }
+    for (it = 0.5; it < maxit; it += 0.5) {
+        rhop = rho;
+        cublasDdot(cublasHandle, n, d_rw, 1, d_r, 1, &rho);

-    void cusparseSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
-        double t_total1, t_total2;
-        int n = N;
-        double rho = 1.0, rhop;
-        double alpha, nalpha, beta;
-        double omega, nomega, tmp1, tmp2;
-        double norm, norm_0;
-        double zero = 0.0;
-        double one  = 1.0;
-        double mone = -1.0;
-        float it;
-
-        t_total1 = second();
-
-        if(wellContribs.getNumWells() > 0){
-            wellContribs.setCudaStream(stream);
+        if (it > 1) {
+            beta = (rho / rhop) * (alpha / omega);
+            nomega = -omega;
+            cublasDaxpy(cublasHandle, n, &nomega, d_v, 1, d_p, 1);
+            cublasDscal(cublasHandle, n, &beta, d_p, 1);
+            cublasDaxpy(cublasHandle, n, &one, d_r, 1, d_p, 1);
        }

-        cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
+        // apply ilu0
+        cusparseDbsrsv2_solve(cusparseHandle, order, \
+                              operation, Nb, nnzb, &one, \
+                              descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_p, d_t, policy, d_buffer);
+        cusparseDbsrsv2_solve(cusparseHandle, order, \
+                              operation, Nb, nnzb, &one, \
+                              descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_pw, policy, d_buffer);

-        cublasDscal(cublasHandle, n, &mone, d_r, 1);
-        cublasDaxpy(cublasHandle, n, &one, d_b, 1, d_r, 1);
-        cublasDcopy(cublasHandle, n, d_r, 1, d_rw, 1);
-        cublasDcopy(cublasHandle, n, d_r, 1, d_p, 1); 
-        cublasDnrm2(cublasHandle, n, d_r, 1, &norm_0);
+        // spmv
+        cusparseDbsrmv(cusparseHandle, order, \
+                       operation, Nb, Nb, nnzb, \
+                       &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_pw, &zero, d_v);
+
+        // apply wellContributions
+        if (wellContribs.getNumWells() > 0) {
+            wellContribs.apply(d_pw, d_v);
+        }
+
+        cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
+        alpha = rho / tmp1;
+        nalpha = -alpha;
+        cublasDaxpy(cublasHandle, n, &nalpha, d_v, 1, d_r, 1);
+        cublasDaxpy(cublasHandle, n, &alpha, d_pw, 1, d_x, 1);
+        cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
+
+        if (norm < tolerance * norm_0) {
+            break;
+        }
+
+        it += 0.5;
+
+        // apply ilu0
+        cusparseDbsrsv2_solve(cusparseHandle, order, \
+                              operation, Nb, nnzb, &one, \
+                              descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_r, d_t, policy, d_buffer);
+        cusparseDbsrsv2_solve(cusparseHandle, order, \
+                              operation, Nb, nnzb, &one, \
+                              descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_s, policy, d_buffer);
+
+        // spmv
+        cusparseDbsrmv(cusparseHandle, order, \
+                       operation, Nb, Nb, nnzb, &one, descr_M, \
+                       d_bVals, d_bRows, d_bCols, block_size, d_s, &zero, d_t);
+
+        // apply wellContributions
+        if (wellContribs.getNumWells() > 0) {
+            wellContribs.apply(d_s, d_t);
+        }
+
+        cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
+        cublasDdot(cublasHandle, n, d_t, 1, d_t, 1, &tmp2);
+        omega = tmp1 / tmp2;
+        nomega = -omega;
+        cublasDaxpy(cublasHandle, n, &omega, d_s, 1, d_x, 1);
+        cublasDaxpy(cublasHandle, n, &nomega, d_t, 1, d_r, 1);
+
+        cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
+
+
+        if (norm < tolerance * norm_0) {
+            break;
+        }

        if (verbosity > 1) {
            std::ostringstream out;
-            out << std::scientific << "cusparseSolver initial norm: " << norm_0;
-            OpmLog::info(out.str());
-        }
-
-        for (it = 0.5; it < maxit; it+=0.5) {
-            rhop = rho;
-            cublasDdot(cublasHandle, n, d_rw, 1, d_r, 1, &rho);
-
-            if (it > 1) {
-                beta = (rho/rhop) * (alpha/omega);
-                nomega = -omega;
-                cublasDaxpy(cublasHandle, n, &nomega, d_v, 1, d_p, 1);
-                cublasDscal(cublasHandle, n, &beta, d_p, 1);
-                cublasDaxpy(cublasHandle, n, &one, d_r, 1, d_p, 1);
-            }
-
-            // apply ilu0
-            cusparseDbsrsv2_solve(cusparseHandle, order, \
-                operation, Nb, nnzb, &one, \
-                descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_p, d_t, policy, d_buffer);
-            cusparseDbsrsv2_solve(cusparseHandle, order, \
-                operation, Nb, nnzb, &one, \
-                descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_pw, policy, d_buffer);
-
-            // spmv
-            cusparseDbsrmv(cusparseHandle, order, \
-                operation, Nb, Nb, nnzb, \
-                &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_pw, &zero, d_v);
-
-            // apply wellContributions
-            if(wellContribs.getNumWells() > 0){
-                wellContribs.apply(d_pw, d_v);
-            }
-
-            cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
-            alpha = rho / tmp1;
-            nalpha = -alpha;
-            cublasDaxpy(cublasHandle, n, &nalpha, d_v, 1, d_r, 1);
-            cublasDaxpy(cublasHandle, n, &alpha, d_pw, 1, d_x, 1);
-            cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
-
-            if (norm < tolerance * norm_0 && it > minit) {
-                break;
-            }
-
-            it += 0.5;
-
-            // apply ilu0
-            cusparseDbsrsv2_solve(cusparseHandle, order, \
-                operation, Nb, nnzb, &one, \
-                descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_r, d_t, policy, d_buffer);
-            cusparseDbsrsv2_solve(cusparseHandle, order, \
-                operation, Nb, nnzb, &one, \
-                descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_s, policy, d_buffer);
-
-            // spmv
-            cusparseDbsrmv(cusparseHandle, order, \
-                operation, Nb, Nb, nnzb, &one, descr_M, \
-                d_bVals, d_bRows, d_bCols, block_size, d_s, &zero, d_t);
-
-            // apply wellContributions
-            if(wellContribs.getNumWells() > 0){
-                wellContribs.apply(d_s, d_t);
-            }
-
-            cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
-            cublasDdot(cublasHandle, n, d_t, 1, d_t, 1, &tmp2);
-            omega = tmp1 / tmp2;
-            nomega = -omega;
-            cublasDaxpy(cublasHandle, n, &omega, d_s, 1, d_x, 1);
-            cublasDaxpy(cublasHandle, n, &nomega, d_t, 1, d_r, 1);
-
-            cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
-
-
-            if (norm < tolerance * norm_0 && it > minit) {
-                break;
-            }
-
-            if (verbosity > 1) {
-                std::ostringstream out;
-                out << "it: " << it << std::scientific << ", norm: " << norm;
-                OpmLog::info(out.str());
-            }
-        }
-
-        t_total2 = second();
-
-        res.iterations = std::min(it, (float)maxit);
-        res.reduction = norm/norm_0;
-        res.conv_rate  = static_cast<double>(pow(res.reduction,1.0/it));
-        res.elapsed = t_total2 - t_total1;
-        res.converged = (it != (maxit + 0.5));
-
-        if (verbosity > 0) {
-            std::ostringstream out;
-            out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
-                   ", time per iteration: " << res.elapsed/it << ", iterations: " << it;
+            out << "it: " << it << std::scientific << ", norm: " << norm;
            OpmLog::info(out.str());
        }
    }

+    res.iterations = std::min(it, (float)maxit);
+    res.reduction = norm / norm_0;
+    res.conv_rate  = static_cast<double>(pow(res.reduction, 1.0 / it));
+    res.elapsed = t_total.stop();
+    res.converged = (it != (maxit + 0.5));

-    void cusparseSolverBackend::initialize(int N, int nnz, int dim) {
-        this->N = N;
-        this->nnz = nnz;
-        this->block_size = dim;
-        this->nnzb = nnz/block_size/block_size;
-        Nb = (N + dim - 1) / dim;
+    if (verbosity > 0) {
        std::ostringstream out;
-        out << "Initializing GPU, matrix size: " << N << " blocks, nnz: " << nnzb << " blocks";
+        out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
+            ", time per iteration: " << res.elapsed / it << ", iterations: " << it;
        OpmLog::info(out.str());
-        out.str("");
-        out.clear();
-        out << "Minit: " << minit << ", maxit: " << maxit << std::scientific << ", tolerance: " << tolerance;
-        OpmLog::info(out.str());
-
-        int deviceID = 0;
-        cudaSetDevice(deviceID);
-        cudaCheckLastError("Could not get device");
-        struct cudaDeviceProp props;
-        cudaGetDeviceProperties(&props, deviceID);
-        cudaCheckLastError("Could not get device properties");
-        out.str("");
-        out.clear();
-        out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor;
-        OpmLog::info(out.str());
-
-        cudaStreamCreate(&stream);
-        cudaCheckLastError("Could not create stream");
-
-        cublasCreate(&cublasHandle);
-        cudaCheckLastError("Could not create cublasHandle");
-
-        cusparseCreate(&cusparseHandle);
-        cudaCheckLastError("Could not create cusparseHandle");
-
-        cudaMalloc((void**)&d_x, sizeof(double) * N);
-        cudaMalloc((void**)&d_b, sizeof(double) * N);
-        cudaMalloc((void**)&d_r, sizeof(double) * N);
-        cudaMalloc((void**)&d_rw,sizeof(double) * N);
-        cudaMalloc((void**)&d_p, sizeof(double) * N);
-        cudaMalloc((void**)&d_pw,sizeof(double) * N);
-        cudaMalloc((void**)&d_s, sizeof(double) * N);
-        cudaMalloc((void**)&d_t, sizeof(double) * N);
-        cudaMalloc((void**)&d_v, sizeof(double) * N);
-        cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
-        cudaMalloc((void**)&d_bCols, sizeof(double) * nnz);
-        cudaMalloc((void**)&d_bRows, sizeof(double) * (Nb+1));
-        cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
-        cudaCheckLastError("Could not allocate enough memory on GPU");
-
-        cublasSetStream(cublasHandle, stream);
-        cudaCheckLastError("Could not set stream to cublas");
-        cusparseSetStream(cusparseHandle, stream);
-        cudaCheckLastError("Could not set stream to cusparse");
-
-#if COPY_ROW_BY_ROW
-        cudaMallocHost((void**)&vals_contiguous, sizeof(double) * nnz);
-        cudaCheckLastError("Could not allocate pinned memory");
-#endif
-
-        initialized = true;
-    } // end initialize()
-
-    void cusparseSolverBackend::finalize() {
-        if (initialized) {
-            cudaFree(d_x);
-            cudaFree(d_b);
-            cudaFree(d_r);
-            cudaFree(d_rw);
-            cudaFree(d_p);
-            cudaFree(d_pw);
-            cudaFree(d_s);
-            cudaFree(d_t);
-            cudaFree(d_v);
-            cudaFree(d_mVals);
-            cudaFree(d_bVals);
-            cudaFree(d_bCols);
-            cudaFree(d_bRows);
-            cudaFree(d_buffer);
-            cusparseDestroyBsrilu02Info(info_M);
-            cusparseDestroyBsrsv2Info(info_L);
-            cusparseDestroyBsrsv2Info(info_U);
-            cusparseDestroyMatDescr(descr_B);
-            cusparseDestroyMatDescr(descr_M);
-            cusparseDestroyMatDescr(descr_L);
-            cusparseDestroyMatDescr(descr_U);
-            cusparseDestroy(cusparseHandle);
-            cublasDestroy(cublasHandle);
-#if COPY_ROW_BY_ROW
-            cudaFreeHost(vals_contiguous);
-#endif
-            cudaStreamDestroy(stream);
-        }
-    } // end finalize()
-
-
-    void cusparseSolverBackend::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b) {
-
-        double t1, t2;
-        if (verbosity > 2) {
-            t1 = second();
-        }
-
-#if COPY_ROW_BY_ROW
-        int sum = 0;
-        for(int i = 0; i < Nb; ++i){
-            int size_row = rows[i+1] - rows[i];
-            memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
-            sum += size_row * block_size * block_size;
-        }
-        cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
-#else
-        cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
-#endif
-
-        cudaMemcpyAsync(d_bCols, cols, nnz * sizeof(int), cudaMemcpyHostToDevice, stream);
-        cudaMemcpyAsync(d_bRows, rows, (Nb+1) * sizeof(int), cudaMemcpyHostToDevice, stream);
-        cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
-        cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
-
-        if (verbosity > 2) {
-            cudaStreamSynchronize(stream);
-            t2 = second();
-            std::ostringstream out;
-            out << "cusparseSolver::copy_system_to_gpu(): " << t2-t1 << " s";
-            OpmLog::info(out.str());
-        }
-    } // end copy_system_to_gpu()
-
-
-    // don't copy rowpointers and colindices, they stay the same
-    void cusparseSolverBackend::update_system_on_gpu(double *vals, int *rows, double *b) {
-
-        double t1, t2;
-        if (verbosity > 2) {
-            t1 = second();
-        }
-
-#if COPY_ROW_BY_ROW
-        int sum = 0;
-        for(int i = 0; i < Nb; ++i){
-            int size_row = rows[i+1] - rows[i];
-            memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
-            sum += size_row * block_size * block_size;
-        }
-        cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
-#else
-        cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
-#endif
-
-        cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
-        cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
-
-        if (verbosity > 2) {
-            cudaStreamSynchronize(stream);
-            t2 = second();
-            std::ostringstream out;
-            out << "cusparseSolver::update_system_on_gpu(): " << t2-t1 << " s";
-            OpmLog::info(out.str());
-        }
-    } // end update_system_on_gpu()
-
-
-    void cusparseSolverBackend::reset_prec_on_gpu() {
-        cudaMemcpyAsync(d_mVals, d_bVals, nnz  * sizeof(double), cudaMemcpyDeviceToDevice, stream);
    }
-
-
-    bool cusparseSolverBackend::analyse_matrix() {
-
-        int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
-        double t1, t2;
-
-        if (verbosity > 2) {
-            t1 = second();
-        }
-
-        cusparseCreateMatDescr(&descr_B);
-        cusparseCreateMatDescr(&descr_M);
-        cusparseSetMatType(descr_B, CUSPARSE_MATRIX_TYPE_GENERAL);
-        cusparseSetMatType(descr_M, CUSPARSE_MATRIX_TYPE_GENERAL);
-        const cusparseIndexBase_t base_type = CUSPARSE_INDEX_BASE_ZERO;     // matrices from Flow are base0
-
-        cusparseSetMatIndexBase(descr_B, base_type);
-        cusparseSetMatIndexBase(descr_M, base_type);
-
-        cusparseCreateMatDescr(&descr_L);
-        cusparseSetMatIndexBase(descr_L, base_type);
-        cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL);
-        cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER);
-        cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_UNIT);
-
-        cusparseCreateMatDescr(&descr_U);
-        cusparseSetMatIndexBase(descr_U, base_type);
-        cusparseSetMatType(descr_U, CUSPARSE_MATRIX_TYPE_GENERAL);
-        cusparseSetMatFillMode(descr_U, CUSPARSE_FILL_MODE_UPPER);
-        cusparseSetMatDiagType(descr_U, CUSPARSE_DIAG_TYPE_NON_UNIT);
-        cudaCheckLastError("Could not initialize matrix descriptions");
-
-        cusparseCreateBsrilu02Info(&info_M);
-        cusparseCreateBsrsv2Info(&info_L);
-        cusparseCreateBsrsv2Info(&info_U);
-        cudaCheckLastError("Could not create analysis info");
-
-        cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzb,
-            descr_M, d_bVals, d_bRows, d_bCols, block_size, info_M, &d_bufferSize_M);
-        cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
-            descr_L, d_bVals, d_bRows, d_bCols, block_size, info_L, &d_bufferSize_L);
-        cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
-            descr_U, d_bVals, d_bRows, d_bCols, block_size, info_U, &d_bufferSize_U);
-        cudaCheckLastError();
-        d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
-        
-        cudaMalloc((void**)&d_buffer, d_bufferSize);
-
-        // analysis of ilu LU decomposition
-        cusparseDbsrilu02_analysis(cusparseHandle, order, \
-            Nb, nnzb, descr_B, d_bVals, d_bRows, d_bCols, \
-            block_size, info_M, policy, d_buffer);
-
-        int structural_zero;
-        cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
-        if (CUSPARSE_STATUS_ZERO_PIVOT == status) {
-            return false;
-        }
-
-        // analysis of ilu apply
-        cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
-            Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, \
-            block_size, info_L, policy, d_buffer);
-
-        cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
-            Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, \
-            block_size, info_U, policy, d_buffer);
-        cudaCheckLastError("Could not analyse level information");
-
-        if (verbosity > 2) {
-            cudaStreamSynchronize(stream);
-            t2 = second();
-            std::ostringstream out;
-            out << "cusparseSolver::analyse_matrix(): " << t2-t1 << " s";
-            OpmLog::info(out.str());
-        }
-
-        analysis_done = true;
-
-        return true;
-    } // end analyse_matrix()
-
-    bool cusparseSolverBackend::create_preconditioner() {
-
-        double t1, t2;
-        if (verbosity > 2) {
-            t1 = second();
-        }
-
-        d_mCols = d_bCols;
-        d_mRows = d_bRows;
-        cusparseDbsrilu02(cusparseHandle, order, \
-            Nb, nnzb, descr_M, d_mVals, d_mRows, d_mCols, \
-            block_size, info_M, policy, d_buffer);
-        cudaCheckLastError("Could not perform ilu decomposition");
-
-        int structural_zero;
-        // cusparseXbsrilu02_zeroPivot() calls cudaDeviceSynchronize()
-        cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
-        if (CUSPARSE_STATUS_ZERO_PIVOT == status) {
-            return false;
-        }
-
-        if (verbosity > 2) {
-            cudaStreamSynchronize(stream);
-            t2 = second();
-            std::ostringstream out;
-            out << "cusparseSolver::create_preconditioner(): " << t2-t1 << " s";
-            OpmLog::info(out.str());
-        }
-        return true;
-    } // end create_preconditioner()
-
-
-    void cusparseSolverBackend::solve_system(WellContributions& wellContribs, BdaResult &res) {
-        // actually solve
-        gpu_pbicgstab(wellContribs, res);
-        cudaStreamSynchronize(stream);
-        cudaCheckLastError("Something went wrong during the GPU solve");
-    } // end solve_system()
-
-
-    // copy result to host memory
-    // caller must be sure that x is a valid array
-    void cusparseSolverBackend::post_process(double *x) {
-
-        double t1, t2;
-        if (verbosity > 2) {
-            t1 = second();
-        }
-
-        cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream);
-        cudaStreamSynchronize(stream);
-
-        if (verbosity > 2) {
-            t2 = second();
-            std::ostringstream out;
-            out << "cusparseSolver::post_process(): " << t2-t1 << " s";
-            OpmLog::info(out.str());
-        }
-    } // end post_process()
-
-
-    typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus;
-
-    cusparseSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) { 
-        if (initialized == false) {
-            initialize(N, nnz, dim);
-            copy_system_to_gpu(vals, rows, cols, b);
-        }else{
-            update_system_on_gpu(vals, rows, b);
-        }
-        if (analysis_done == false) {
-            if (!analyse_matrix()) {
-                return cusparseSolverStatus::CUSPARSE_SOLVER_ANALYSIS_FAILED;
-            }
-        }
-        reset_prec_on_gpu();
-        if (create_preconditioner()) {
-            solve_system(wellContribs, res);
-        }else{
-            return cusparseSolverStatus::CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED;
-        }
-        return cusparseSolverStatus::CUSPARSE_SOLVER_SUCCESS;
-    }
-
-
 }


+template <unsigned int block_size>
+void cusparseSolverBackend<block_size>::initialize(int N, int nnz, int dim) {
+    this->N = N;
+    this->nnz = nnz;
+    this->nnzb = nnz / block_size / block_size;
+    Nb = (N + dim - 1) / dim;
+    std::ostringstream out;
+    out << "Initializing GPU, matrix size: " << N << " blocks, nnz: " << nnzb << " blocks";
+    OpmLog::info(out.str());
+    out.str("");
+    out.clear();
+    out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance;
+    OpmLog::info(out.str());
+
+    cudaSetDevice(deviceID);
+    cudaCheckLastError("Could not get device");
+    struct cudaDeviceProp props;
+    cudaGetDeviceProperties(&props, deviceID);
+    cudaCheckLastError("Could not get device properties");
+    out.str("");
+    out.clear();
+    out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor;
+    OpmLog::info(out.str());
+
+    cudaStreamCreate(&stream);
+    cudaCheckLastError("Could not create stream");
+
+    cublasCreate(&cublasHandle);
+    cudaCheckLastError("Could not create cublasHandle");
+
+    cusparseCreate(&cusparseHandle);
+    cudaCheckLastError("Could not create cusparseHandle");
+
+    cudaMalloc((void**)&d_x, sizeof(double) * N);
+    cudaMalloc((void**)&d_b, sizeof(double) * N);
+    cudaMalloc((void**)&d_r, sizeof(double) * N);
+    cudaMalloc((void**)&d_rw, sizeof(double) * N);
+    cudaMalloc((void**)&d_p, sizeof(double) * N);
+    cudaMalloc((void**)&d_pw, sizeof(double) * N);
+    cudaMalloc((void**)&d_s, sizeof(double) * N);
+    cudaMalloc((void**)&d_t, sizeof(double) * N);
+    cudaMalloc((void**)&d_v, sizeof(double) * N);
+    cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
+    cudaMalloc((void**)&d_bCols, sizeof(double) * nnz);
+    cudaMalloc((void**)&d_bRows, sizeof(double) * (Nb + 1));
+    cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
+    cudaCheckLastError("Could not allocate enough memory on GPU");
+
+    cublasSetStream(cublasHandle, stream);
+    cudaCheckLastError("Could not set stream to cublas");
+    cusparseSetStream(cusparseHandle, stream);
+    cudaCheckLastError("Could not set stream to cusparse");
+
+#if COPY_ROW_BY_ROW
+    cudaMallocHost((void**)&vals_contiguous, sizeof(double) * nnz);
+    cudaCheckLastError("Could not allocate pinned memory");
+#endif
+
+    initialized = true;
+} // end initialize()
+
+template <unsigned int block_size>
+void cusparseSolverBackend<block_size>::finalize() {
+    if (initialized) {
+        cudaFree(d_x);
+        cudaFree(d_b);
+        cudaFree(d_r);
+        cudaFree(d_rw);
+        cudaFree(d_p);
+        cudaFree(d_pw);
+        cudaFree(d_s);
+        cudaFree(d_t);
+        cudaFree(d_v);
+        cudaFree(d_mVals);
+        cudaFree(d_bVals);
+        cudaFree(d_bCols);
+        cudaFree(d_bRows);
+        cudaFree(d_buffer);
+        cusparseDestroyBsrilu02Info(info_M);
+        cusparseDestroyBsrsv2Info(info_L);
+        cusparseDestroyBsrsv2Info(info_U);
+        cusparseDestroyMatDescr(descr_B);
+        cusparseDestroyMatDescr(descr_M);
+        cusparseDestroyMatDescr(descr_L);
+        cusparseDestroyMatDescr(descr_U);
+        cusparseDestroy(cusparseHandle);
+        cublasDestroy(cublasHandle);
+#if COPY_ROW_BY_ROW
+        cudaFreeHost(vals_contiguous);
+#endif
+        cudaStreamDestroy(stream);
+    }
+} // end finalize()
+
+
+template <unsigned int block_size>
+void cusparseSolverBackend<block_size>::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b) {
+    Timer t;
+
+#if COPY_ROW_BY_ROW
+    int sum = 0;
+    for (int i = 0; i < Nb; ++i) {
+        int size_row = rows[i + 1] - rows[i];
+        memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
+        sum += size_row * block_size * block_size;
+    }
+    cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+#else
+    cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+#endif
+
+    cudaMemcpyAsync(d_bCols, cols, nnz * sizeof(int), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bRows, rows, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
+
+    if (verbosity > 2) {
+        cudaStreamSynchronize(stream);
+        std::ostringstream out;
+        out << "cusparseSolver::copy_system_to_gpu(): " << t.stop() << " s";
+        OpmLog::info(out.str());
+    }
+} // end copy_system_to_gpu()
+
+
+// don't copy rowpointers and colindices, they stay the same
+template <unsigned int block_size>
+void cusparseSolverBackend<block_size>::update_system_on_gpu(double *vals, int *rows, double *b) {
+    Timer t;
+
+#if COPY_ROW_BY_ROW
+    int sum = 0;
+    for (int i = 0; i < Nb; ++i) {
+        int size_row = rows[i + 1] - rows[i];
+        memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
+        sum += size_row * block_size * block_size;
+    }
+    cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+#else
+    cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+#endif
+
+    cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
+
+    if (verbosity > 2) {
+        cudaStreamSynchronize(stream);
+        std::ostringstream out;
+        out << "cusparseSolver::update_system_on_gpu(): " << t.stop() << " s";
+        OpmLog::info(out.str());
+    }
+} // end update_system_on_gpu()
+
+
+template <unsigned int block_size>
+void cusparseSolverBackend<block_size>::reset_prec_on_gpu() {
+    cudaMemcpyAsync(d_mVals, d_bVals, nnz  * sizeof(double), cudaMemcpyDeviceToDevice, stream);
+}
+
+
+template <unsigned int block_size>
+bool cusparseSolverBackend<block_size>::analyse_matrix() {
+
+    int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
+    Timer t;
+
+    cusparseCreateMatDescr(&descr_B);
+    cusparseCreateMatDescr(&descr_M);
+    cusparseSetMatType(descr_B, CUSPARSE_MATRIX_TYPE_GENERAL);
+    cusparseSetMatType(descr_M, CUSPARSE_MATRIX_TYPE_GENERAL);
+    const cusparseIndexBase_t base_type = CUSPARSE_INDEX_BASE_ZERO;     // matrices from Flow are base0
+
+    cusparseSetMatIndexBase(descr_B, base_type);
+    cusparseSetMatIndexBase(descr_M, base_type);
+
+    cusparseCreateMatDescr(&descr_L);
+    cusparseSetMatIndexBase(descr_L, base_type);
+    cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL);
+    cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER);
+    cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_UNIT);
+
+    cusparseCreateMatDescr(&descr_U);
+    cusparseSetMatIndexBase(descr_U, base_type);
+    cusparseSetMatType(descr_U, CUSPARSE_MATRIX_TYPE_GENERAL);
+    cusparseSetMatFillMode(descr_U, CUSPARSE_FILL_MODE_UPPER);
+    cusparseSetMatDiagType(descr_U, CUSPARSE_DIAG_TYPE_NON_UNIT);
+    cudaCheckLastError("Could not initialize matrix descriptions");
+
+    cusparseCreateBsrilu02Info(&info_M);
+    cusparseCreateBsrsv2Info(&info_L);
+    cusparseCreateBsrsv2Info(&info_U);
+    cudaCheckLastError("Could not create analysis info");
+
+    cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzb,
+                                 descr_M, d_bVals, d_bRows, d_bCols, block_size, info_M, &d_bufferSize_M);
+    cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
+                               descr_L, d_bVals, d_bRows, d_bCols, block_size, info_L, &d_bufferSize_L);
+    cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
+                               descr_U, d_bVals, d_bRows, d_bCols, block_size, info_U, &d_bufferSize_U);
+    cudaCheckLastError();
+    d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
+
+    cudaMalloc((void**)&d_buffer, d_bufferSize);
+
+    // analysis of ilu LU decomposition
+    cusparseDbsrilu02_analysis(cusparseHandle, order, \
+                               Nb, nnzb, descr_B, d_bVals, d_bRows, d_bCols, \
+                               block_size, info_M, policy, d_buffer);
+
+    int structural_zero;
+    cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
+    if (CUSPARSE_STATUS_ZERO_PIVOT == status) {
+        return false;
+    }
+
+    // analysis of ilu apply
+    cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
+                             Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, \
+                             block_size, info_L, policy, d_buffer);
+
+    cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
+                             Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, \
+                             block_size, info_U, policy, d_buffer);
+    cudaCheckLastError("Could not analyse level information");
+
+    if (verbosity > 2) {
+        cudaStreamSynchronize(stream);
+        std::ostringstream out;
+        out << "cusparseSolver::analyse_matrix(): " << t.stop() << " s";
+        OpmLog::info(out.str());
+    }
+
+    analysis_done = true;
+
+    return true;
+} // end analyse_matrix()
+
+template <unsigned int block_size>
+bool cusparseSolverBackend<block_size>::create_preconditioner() {
+    Timer t;
+
+    d_mCols = d_bCols;
+    d_mRows = d_bRows;
+    cusparseDbsrilu02(cusparseHandle, order, \
+                      Nb, nnzb, descr_M, d_mVals, d_mRows, d_mCols, \
+                      block_size, info_M, policy, d_buffer);
+    cudaCheckLastError("Could not perform ilu decomposition");
+
+    int structural_zero;
+    // cusparseXbsrilu02_zeroPivot() calls cudaDeviceSynchronize()
+    cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
+    if (CUSPARSE_STATUS_ZERO_PIVOT == status) {
+        return false;
+    }
+
+    if (verbosity > 2) {
+        cudaStreamSynchronize(stream);
+        std::ostringstream out;
+        out << "cusparseSolver::create_preconditioner(): " << t.stop() << " s";
+        OpmLog::info(out.str());
+    }
+    return true;
+} // end create_preconditioner()
+
+
+template <unsigned int block_size>
+void cusparseSolverBackend<block_size>::solve_system(WellContributions& wellContribs, BdaResult &res) {
+    // actually solve
+    gpu_pbicgstab(wellContribs, res);
+    cudaStreamSynchronize(stream);
+    cudaCheckLastError("Something went wrong during the GPU solve");
+} // end solve_system()
+
+
+// copy result to host memory
+// caller must be sure that x is a valid array
+template <unsigned int block_size>
+void cusparseSolverBackend<block_size>::get_result(double *x) {
+    Timer t;
+
+    cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream);
+    cudaStreamSynchronize(stream);
+
+    if (verbosity > 2) {
+        std::ostringstream out;
+        out << "cusparseSolver::get_result(): " << t.stop() << " s";
+        OpmLog::info(out.str());
+    }
+} // end get_result()
+
+
+
+template <unsigned int block_size>
+SolverStatus cusparseSolverBackend<block_size>::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
+    if (initialized == false) {
+        initialize(N, nnz, dim);
+        copy_system_to_gpu(vals, rows, cols, b);
+    } else {
+        update_system_on_gpu(vals, rows, b);
+    }
+    if (analysis_done == false) {
+        if (!analyse_matrix()) {
+            return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
+        }
+    }
+    reset_prec_on_gpu();
+    if (create_preconditioner()) {
+        solve_system(wellContribs, res);
+    } else {
+        return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
+    }
+    return SolverStatus::BDA_SOLVER_SUCCESS;
+}
+
+
+#define INSTANTIATE_BDA_FUNCTIONS(n)                                                       \
+template cusparseSolverBackend<n>::cusparseSolverBackend(int, int, double, unsigned int);  \
+
+INSTANTIATE_BDA_FUNCTIONS(1);
+INSTANTIATE_BDA_FUNCTIONS(2);
+INSTANTIATE_BDA_FUNCTIONS(3);
+INSTANTIATE_BDA_FUNCTIONS(4);
+
+#undef INSTANTIATE_BDA_FUNCTIONS
+
+} // namespace bda
+
+
--- a/opm/simulators/linalg/bda/cusparseSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/cusparseSolverBackend.hpp
@ -24,21 +24,31 @@
 #include "cublas_v2.h"
 #include "cusparse_v2.h"

-#include "opm/simulators/linalg/bda/BdaResult.hpp"
+#include <opm/simulators/linalg/bda/BdaResult.hpp>
+#include <opm/simulators/linalg/bda/BdaSolver.hpp>
 #include <opm/simulators/linalg/bda/WellContributions.hpp>

-namespace Opm
+namespace bda
 {

 /// This class implements a cusparse-based ilu0-bicgstab solver on GPU
-class cusparseSolverBackend{
+template <unsigned int block_size>
+class cusparseSolverBackend : public BdaSolver<block_size> {
+
+    typedef BdaSolver<block_size> Base;
+
+    using Base::N;
+    using Base::Nb;
+    using Base::nnz;
+    using Base::nnzb;
+    using Base::verbosity;
+    using Base::deviceID;
+    using Base::maxit;
+    using Base::tolerance;
+    using Base::initialized;

 private:

-    int minit;
-    int maxit;
-    double tolerance;
-
    cublasHandle_t cublasHandle;
    cusparseHandle_t cusparseHandle;
    cudaStream_t stream;
@ -49,24 +59,13 @@ private:
    double *d_bVals, *d_mVals;
    int *d_bCols, *d_mCols;
    int *d_bRows, *d_mRows;
-    double *d_x, *d_b, *d_r, *d_rw, *d_p;
+    double *d_x, *d_b, *d_r, *d_rw, *d_p;     // vectors, used during linear solve
    double *d_pw, *d_s, *d_t, *d_v;
    void *d_buffer;
-    int N, Nb, nnz, nnzb;
-    double *vals_contiguous;
+    double *vals_contiguous;                  // only used if COPY_ROW_BY_ROW is true in cusparseSolverBackend.cpp

-    int block_size;
-
-    bool initialized = false;
    bool analysis_done = false;

-    // verbosity
-    // 0: print nothing during solves, only when initializing
-    // 1: print number of iterations and final norm
-    // 2: also print norm each iteration
-    // 3: also print timings of different backend functions
-
-    int verbosity = 0;

    /// Solve linear system using ilu0-bicgstab
    /// \param[in] wellContribs   contains all WellContributions, to apply them separately, instead of adding them to matrix A
@ -113,18 +112,13 @@ private:

 public:

-    enum class cusparseSolverStatus {
-        CUSPARSE_SOLVER_SUCCESS,
-        CUSPARSE_SOLVER_ANALYSIS_FAILED,
-        CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED,
-        CUSPARSE_SOLVER_UNKNOWN_ERROR
-    };

    /// Construct a cusparseSolver
    /// \param[in] linear_solver_verbosity    verbosity of cusparseSolver
    /// \param[in] maxit                      maximum number of iterations for cusparseSolver
    /// \param[in] tolerance                  required relative tolerance for cusparseSolver
-    cusparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance);
+    /// \param[in] deviceID                   the device to be used
+    cusparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int deviceID);

    /// Destroy a cusparseSolver, and free memory
    ~cusparseSolverBackend();
@ -140,15 +134,15 @@ public:
    /// \param[in] wellContribs   contains all WellContributions, to apply them separately, instead of adding them to matrix A
    /// \param[inout] res         summary of solver result
    /// \return                   status code
-    cusparseSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res);
+    SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;

-    /// Post processing after linear solve, now only copies resulting x vector back
+    /// Get resulting vector x after linear solve, also includes post processing if necessary
    /// \param[inout] x        resulting x vector, caller must guarantee that x points to a valid array
-    void post_process(double *x);
+    void get_result(double *x) override;

 }; // end class cusparseSolverBackend

-}
+} // namespace bda

 #endif

--- a/opm/simulators/linalg/bda/opencl.cpp
+++ b/opm/simulators/linalg/bda/opencl.cpp
@ -0,0 +1,93 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <opm/simulators/linalg/bda/opencl.hpp>
+#include <string>
+
+/// Translate OpenCL error codes to strings
+/// Integer - String combinations are defined in CL/cl.h
+/// \param[in] error     error code
+std::string getErrorString(cl_int error)
+{
+    switch (error)
+    {
+        case 0: return "CL_SUCCESS";
+        case -1: return "CL_DEVICE_NOT_FOUND";
+        case -2: return "CL_DEVICE_NOT_AVAILABLE";
+        case -3: return "CL_COMPILER_NOT_AVAILABLE";
+        case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
+        case -5: return "CL_OUT_OF_RESOURCES";
+        case -6: return "CL_OUT_OF_HOST_MEMORY";
+        case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE";
+        case -8: return "CL_MEM_COPY_OVERLAP";
+        case -9: return "CL_IMAGE_FORMAT_MISMATCH";
+        case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
+        case -11: return "CL_BUILD_PROGRAM_FAILURE, kernels in openclKernels.hpp cannot be compiled";
+        case -12: return "CL_MAP_FAILURE";
+        case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
+        case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
+        case -15: return "CL_COMPILE_PROGRAM_FAILURE";
+        case -16: return "CL_LINKER_NOT_AVAILABLE";
+        case -17: return "CL_LINK_PROGRAM_FAILURE";
+        case -18: return "CL_DEVICE_PARTITION_FAILED";
+        case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
+
+        case -30: return "CL_INVALID_VALUE";
+        case -31: return "CL_INVALID_DEVICE_TYPE";
+        case -32: return "CL_INVALID_PLATFORM";
+        case -33: return "CL_INVALID_DEVICE";
+        case -34: return "CL_INVALID_CONTEXT";
+        case -35: return "CL_INVALID_QUEUE_PROPERTIES";
+        case -36: return "CL_INVALID_COMMAND_QUEUE";
+        case -37: return "CL_INVALID_HOST_PTR";
+        case -38: return "CL_INVALID_MEM_OBJECT";
+        case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
+        case -40: return "CL_INVALID_IMAGE_SIZE";
+        case -41: return "CL_INVALID_SAMPLER";
+        case -42: return "CL_INVALID_BINARY";
+        case -43: return "CL_INVALID_BUILD_OPTIONS";
+        case -44: return "CL_INVALID_PROGRAM";
+        case -45: return "CL_INVALID_PROGRAM_EXECUTABLE";
+        case -46: return "CL_INVALID_KERNEL_NAME";
+        case -47: return "CL_INVALID_KERNEL_DEFINITION";
+        case -48: return "CL_INVALID_KERNEL";
+        case -49: return "CL_INVALID_ARG_INDEX";
+        case -50: return "CL_INVALID_ARG_VALUE";
+        case -51: return "CL_INVALID_ARG_SIZE";
+        case -52: return "CL_INVALID_KERNEL_ARGS";
+        case -53: return "CL_INVALID_WORK_DIMENSION";
+        case -54: return "CL_INVALID_WORK_GROUP_SIZE";
+        case -55: return "CL_INVALID_WORK_ITEM_SIZE";
+        case -56: return "CL_INVALID_GLOBAL_OFFSET";
+        case -57: return "CL_INVALID_EVENT_WAIT_LIST";
+        case -58: return "CL_INVALID_EVENT";
+        case -59: return "CL_INVALID_OPERATION";
+        case -60: return "CL_INVALID_GL_OBJECT";
+        case -61: return "CL_INVALID_BUFFER_SIZE";
+        case -62: return "CL_INVALID_MIP_LEVEL";
+        case -63: return "CL_INVALID_GLOBAL_WORK_SIZE";
+        case -64: return "CL_INVALID_PROPERTY";
+        case -65: return "CL_INVALID_IMAGE_DESCRIPTOR";
+        case -66: return "CL_INVALID_COMPILER_OPTIONS";
+        case -67: return "CL_INVALID_LINKER_OPTIONS";
+        case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT";
+
+        default: return "UNKNOWN_CL_CODE";
+    }
+}
--- a/opm/simulators/linalg/bda/opencl.hpp
+++ b/opm/simulators/linalg/bda/opencl.hpp
@ -0,0 +1,32 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/// This file includes the relevant OpenCL header(s)
+/// All bda files using OpenCL declarations should include this header
+
+#define __CL_ENABLE_EXCEPTIONS
+#define CL_TARGET_OPENCL_VERSION 120   // indicate OpenCL 1.2 is used
+#include <CL/cl.hpp>                   // supports up to OpenCL 1.2
+
+#include <string>
+
+/// Translate OpenCL error codes to strings
+/// Integer - String combinations are defined in CL/cl.h
+/// \param[in] error     error code
+std::string getErrorString(cl_int error);
--- a/opm/simulators/linalg/bda/openclKernels.hpp
+++ b/opm/simulators/linalg/bda/openclKernels.hpp
@ -0,0 +1,373 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef OPENCL_HPP
+#define OPENCL_HPP
+
+namespace bda
+{
+
+    const char* axpy_s  = R"(
+    __kernel void axpy(
+        __global double *in,
+        const double a,
+        __global double *out,
+        const int N)
+    {
+        unsigned int NUM_THREADS = get_global_size(0);
+        int idx = get_global_id(0);
+
+        while(idx < N){
+            out[idx] += a * in[idx];
+            idx += NUM_THREADS;
+        }
+    }
+    )";
+
+
+    // returns partial sums, instead of the final dot product
+    const char* dot_1_s = R"(
+    __kernel void dot_1(
+        __global double *in1,
+        __global double *in2,
+        __global double *out,
+        const unsigned int N,
+        __local double *tmp)
+    {
+        unsigned int tid = get_local_id(0);
+        unsigned int bsize = get_local_size(0);
+        unsigned int bid = get_global_id(0) / bsize;
+        unsigned int i = get_global_id(0);
+        unsigned int NUM_THREADS = get_global_size(0);
+
+        double sum = 0.0;
+        while(i < N){
+            sum += in1[i] * in2[i];
+            i += NUM_THREADS;
+        }
+        tmp[tid] = sum;
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // do reduction in shared mem
+        for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1)
+        {
+            if (tid < s)
+            {
+                tmp[tid] += tmp[tid + s];
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+
+        // write result for this block to global mem
+        if (tid == 0) out[get_group_id(0)] = tmp[0];
+    }
+    )";
+
+
+    // returns partial sums, instead of the final norm
+    // the square root must be computed on CPU
+    const char* norm_s = R"(
+    __kernel void norm(
+        __global double *in,
+        __global double *out,
+        const unsigned int N,
+        __local double *tmp)
+    {
+        unsigned int tid = get_local_id(0);
+        unsigned int bsize = get_local_size(0);
+        unsigned int bid = get_global_id(0) / bsize;
+        unsigned int i = get_global_id(0);
+        unsigned int NUM_THREADS = get_global_size(0);
+
+        double local_sum = 0.0;
+        while(i < N){
+            local_sum += in[i] * in[i];
+            i += NUM_THREADS;
+        }
+        tmp[tid] = local_sum;
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // do reduction in shared mem
+        for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1)
+        {
+            if (tid < s)
+            {
+                tmp[tid] += tmp[tid + s];
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+
+        // write result for this block to global mem
+        if (tid == 0) out[get_group_id(0)] = tmp[0];
+    }
+    )";
+
+
+    // p = (p - omega * v) * beta + r
+    const char* custom_s = R"(
+    __kernel void custom(
+        __global double *p,
+        __global double *v,
+        __global double *r,
+        const double omega,
+        const double beta,
+        const int N)
+    {
+        const unsigned int NUM_THREADS = get_global_size(0);
+        unsigned int idx = get_global_id(0);
+
+        while(idx < N){
+            double res = p[idx];
+            res -= omega * v[idx];
+            res *= beta;
+            res += r[idx];
+            p[idx] = res;
+            idx += NUM_THREADS;
+        }
+    }
+    )";
+
+
+    // b = mat * x
+    // algorithm based on:
+    // Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
+    // Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
+    const char* spmv_blocked_s = R"(
+    __kernel void spmv_blocked(
+        __global const double *vals,
+        __global const int *cols,
+        __global const int *rows,
+        const int Nb,
+        __global const double *x,
+        __global double *b,
+        const unsigned int block_size,
+        __local double *tmp)
+    {
+        const unsigned int warpsize = 32;
+        const unsigned int bsize = get_local_size(0);
+        const unsigned int idx_b = get_global_id(0) / bsize;
+        const unsigned int idx_t = get_local_id(0);
+        unsigned int idx = idx_b * bsize + idx_t;
+        const unsigned int bs = block_size;
+        const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
+        const unsigned int num_blocks_per_warp = warpsize/bs/bs;
+        const unsigned int NUM_THREADS = get_global_size(0);
+        const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
+        unsigned int target_block_row = idx / warpsize;
+        const unsigned int lane = idx_t % warpsize;
+        const unsigned int c = (lane / bs) % bs;
+        const unsigned int r = lane % bs;
+
+        // for 3x3 blocks:
+        // num_active_threads: 27
+        // num_blocks_per_warp: 3
+
+        while(target_block_row < Nb){
+            unsigned int first_block = rows[target_block_row];
+            unsigned int last_block = rows[target_block_row+1];
+            unsigned int block = first_block + lane / (bs*bs);
+            double local_out = 0.0;
+
+            if(lane < num_active_threads){
+                for(; block < last_block; block += num_blocks_per_warp){
+                    double x_elem = x[cols[block]*bs + c];
+                    double A_elem = vals[block*bs*bs + c + r*bs];
+                    local_out += x_elem * A_elem;
+                }
+            }
+
+            // do reduction in shared mem
+            tmp[lane] = local_out;
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            for(unsigned int offset = 3; offset <= 24; offset <<= 1)
+            {
+                if (lane + offset < warpsize)
+                {
+                    tmp[lane] += tmp[lane + offset];
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+            }
+
+            if(lane < bs){
+                unsigned int row = target_block_row*bs + lane;
+                b[row] = tmp[lane];
+            }
+            target_block_row += num_warps_in_grid;
+        }
+    }
+    )";
+
+
+
+    // ILU apply part 1: forward substitution
+    // solves L*x=y where L is a lower triangular sparse blocked matrix
+    const char* ILU_apply1_s = R"(
+    __kernel void ILU_apply1(
+        __global const double *Lvals,
+        __global const unsigned int *Lcols,
+        __global const unsigned int *Lrows,
+        const unsigned int LrowSize,
+        __global const double *y,
+        __global double *x,
+        __global const unsigned int *nodesPerColorPrefix,
+        const unsigned int color,
+        const unsigned int block_size,
+        __local double *tmp)
+    {
+        const unsigned int warpsize = 32;
+        const unsigned int bs = block_size;
+        const unsigned int idx_t = get_local_id(0);
+        const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
+        const unsigned int num_blocks_per_warp = warpsize/bs/bs;
+        const unsigned int NUM_THREADS = get_global_size(0);
+        const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
+        unsigned int idx = get_global_id(0);
+        unsigned int target_block_row = idx / warpsize;
+        idx += nodesPerColorPrefix[color];
+        target_block_row += nodesPerColorPrefix[color];
+        const unsigned int lane = idx_t % warpsize;
+        const unsigned int c = (lane / bs) % bs;
+        const unsigned int r = lane % bs;
+
+        while(target_block_row < nodesPerColorPrefix[color+1]){
+            const unsigned int first_block = Lrows[target_block_row];
+            const unsigned int last_block = Lrows[target_block_row+1];
+            unsigned int block = first_block + lane / (bs*bs);
+            double local_out = 0.0;
+            if(lane < num_active_threads){
+                if(lane < bs){
+                    local_out = y[target_block_row*bs+lane];
+                }
+                for(; block < last_block; block += num_blocks_per_warp){
+                    const double x_elem = x[Lcols[block]*bs + c];
+                    const double A_elem = Lvals[block*bs*bs + c + r*bs];
+                    local_out -= x_elem * A_elem;
+                }
+            }
+
+            // do reduction in shared mem
+            tmp[lane] = local_out;
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            for(unsigned int offset = 3; offset <= 24; offset <<= 1)
+            {
+                if (lane + offset < warpsize)
+                {
+                    tmp[lane] += tmp[lane + offset];
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+            }
+
+            if(lane < bs){
+                const unsigned int row = target_block_row*bs + lane;
+                x[row] = tmp[lane];
+            }
+
+            target_block_row += num_warps_in_grid;
+        }
+    }
+    )";
+
+
+    // ILU apply part 2: backward substitution
+    // solves U*x=y where L is a lower triangular sparse blocked matrix
+    const char* ILU_apply2_s = R"(
+    __kernel void ILU_apply2(
+        __global const double *Uvals,
+        __global const int *Ucols,
+        __global const int *Urows,
+        const unsigned int UrowSize,
+        __global const double *invDiagVals,
+        __global double *x,
+        __global const unsigned int *nodesPerColorPrefix,
+        const unsigned int color,
+        const unsigned int block_size,
+        __local double *tmp)
+    {
+        const unsigned int warpsize = 32;
+        const unsigned int bs = block_size;
+        const unsigned int idx_t = get_local_id(0);
+        const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
+        const unsigned int num_blocks_per_warp = warpsize/bs/bs;
+        const unsigned int NUM_THREADS = get_global_size(0);
+        const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
+        unsigned int idx = get_global_id(0);
+        unsigned int target_block_row = idx / warpsize;
+        idx += nodesPerColorPrefix[color];
+        target_block_row += nodesPerColorPrefix[color];
+        const unsigned int lane = idx_t % warpsize;
+        const unsigned int c = (lane / bs) % bs;
+        const unsigned int r = lane % bs;
+        const double relaxation = 0.9;
+
+        while(target_block_row < nodesPerColorPrefix[color+1]){
+            const unsigned int first_block = Urows[UrowSize-target_block_row-1];
+            const unsigned int last_block = Urows[UrowSize-target_block_row];
+            unsigned int block = first_block + lane / (bs*bs);
+            double local_out = 0.0;
+            if(lane < num_active_threads){
+                if(lane < bs){
+                    const unsigned int row = target_block_row*bs+lane;
+                    local_out = x[row];
+                }
+                for(; block < last_block; block += num_blocks_per_warp){
+                    const double x_elem = x[Ucols[block]*bs + c];
+                    const double A_elem = Uvals[block*bs*bs + c + r*bs];
+                    local_out -= x_elem * A_elem;
+                }
+            }
+
+            // do reduction in shared mem
+            tmp[lane] = local_out;
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            for(unsigned int offset = 3; offset <= 24; offset <<= 1)
+            {
+                if (lane + offset < warpsize)
+                {
+                    tmp[lane] += tmp[lane + offset];
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+            }
+            local_out = tmp[lane];
+
+            if(lane < bs){
+                tmp[lane + bs*idx_t/warpsize] = local_out;
+                double sum = 0.0;
+                for(int i = 0; i < bs; ++i){
+                    sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
+                }
+
+                const unsigned int row = target_block_row*bs + lane;
+                x[row] = relaxation * sum;
+            }
+
+            target_block_row += num_warps_in_grid;
+        }
+    }
+    )";
+
+
+
+} // end namespace bda
+
+#endif
--- a/opm/simulators/linalg/bda/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp
@ -0,0 +1,743 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+#include <cmath>
+#include <sstream>
+
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+#include <dune/common/timer.hh>
+
+#include <opm/simulators/linalg/bda/openclSolverBackend.hpp>
+#include <opm/simulators/linalg/bda/openclKernels.hpp>
+
+#include <opm/simulators/linalg/bda/BdaResult.hpp>
+#include <opm/simulators/linalg/bda/Reorder.hpp>
+
+
+// iff true, the nonzeroes of the matrix are copied row-by-row into a contiguous, pinned memory array, then a single GPU memcpy is done
+// otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
+#define COPY_ROW_BY_ROW 0
+
+// Level Scheduling respects the depencies in the original matrix
+// Graph Coloring is more aggresive and is likely to change the number of linearizations and linear iterations to converge, but can still be faster on GPU because it results in more parallelism
+#define LEVEL_SCHEDULING 0
+#define GRAPH_COLORING   1
+
+namespace bda
+{
+
+using Opm::OpmLog;
+using Dune::Timer;
+
+template <unsigned int block_size>
+openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_) {
+    prec = new Preconditioner(LEVEL_SCHEDULING, GRAPH_COLORING, verbosity_);
+}
+
+
+template <unsigned int block_size>
+openclSolverBackend<block_size>::~openclSolverBackend() {
+    finalize();
+}
+
+
+// divide A by B, and round up: return (int)ceil(A/B)
+template <unsigned int block_size>
+unsigned int openclSolverBackend<block_size>::ceilDivision(const unsigned int A, const unsigned int B)
+{
+    return A / B + (A % B > 0);
+}
+
+
+template <unsigned int block_size>
+double openclSolverBackend<block_size>::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out)
+{
+    const unsigned int work_group_size = 1024;
+    const unsigned int num_work_groups = ceilDivision(N, work_group_size);
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+    const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
+    Timer t_dot;
+
+    cl::Event event = (*dot_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in1, in2, out, N, cl::Local(lmem_per_work_group));
+
+    queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp);
+
+    double gpu_sum = 0.0;
+    for (unsigned int i = 0; i < num_work_groups; ++i) {
+        gpu_sum += tmp[i];
+    }
+
+    if (verbosity >= 4) {
+        event.wait();
+        std::ostringstream oss;
+        oss << std::scientific << "openclSolver dot_w time: " << t_dot.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+
+    return gpu_sum;
+}
+
+template <unsigned int block_size>
+double openclSolverBackend<block_size>::norm_w(cl::Buffer in, cl::Buffer out)
+{
+    const unsigned int work_group_size = 1024;
+    const unsigned int num_work_groups = ceilDivision(N, work_group_size);
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+    const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
+    Timer t_norm;
+
+    cl::Event event = (*norm_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, N, cl::Local(lmem_per_work_group));
+
+    queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp);
+
+    double gpu_norm = 0.0;
+    for (unsigned int i = 0; i < num_work_groups; ++i) {
+        gpu_norm += tmp[i];
+    }
+    gpu_norm = sqrt(gpu_norm);
+
+    if (verbosity >= 4) {
+        event.wait();
+        std::ostringstream oss;
+        oss << std::scientific << "openclSolver norm_w time: " << t_norm.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+
+    return gpu_norm;
+}
+
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::axpy_w(cl::Buffer in, const double a, cl::Buffer out)
+{
+    const unsigned int work_group_size = 32;
+    const unsigned int num_work_groups = ceilDivision(N, work_group_size);
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+    Timer t_axpy;
+
+    cl::Event event = (*axpy_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, a, out, N);
+
+    if (verbosity >= 4) {
+        event.wait();
+        std::ostringstream oss;
+        oss << std::scientific << "openclSolver axpy_w time: " << t_axpy.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, const double omega, const double beta)
+{
+    const unsigned int work_group_size = 32;
+    const unsigned int num_work_groups = ceilDivision(N, work_group_size);
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+    Timer t_custom;
+
+    cl::Event event = (*custom_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), p, v, r, omega, beta, N);
+
+    if (verbosity >= 4) {
+        event.wait();
+        std::ostringstream oss;
+        oss << std::scientific << "openclSolver custom_w time: " << t_custom.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b)
+{
+    const unsigned int work_group_size = 32;
+    const unsigned int num_work_groups = ceilDivision(N, work_group_size);
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+    const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
+    Timer t_spmv;
+
+    cl::Event event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
+
+    if (verbosity >= 4) {
+        event.wait();
+        std::ostringstream oss;
+        oss << std::scientific << "openclSolver spmv_blocked_w time: " << t_spmv.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
+
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
+
+    float it;
+    double rho, rhop, beta, alpha, omega, tmp1, tmp2;
+    double norm, norm_0;
+
+    Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
+
+    wellContribs.setOpenCLQueue(queue.get());
+    wellContribs.setReordering(toOrder, true);
+
+    // set r to the initial residual
+    // if initial x guess is not 0, must call applyblockedscaleadd(), not implemented
+    //applyblockedscaleadd(-1.0, mat, x, r);
+
+    // set initial values
+    cl::Event event;
+    queue->enqueueFillBuffer(d_p, 0, 0, sizeof(double) * N);
+    queue->enqueueFillBuffer(d_v, 0, 0, sizeof(double) * N, nullptr, &event);
+    event.wait();
+    rho = 1.0;
+    alpha = 1.0;
+    omega = 1.0;
+
+    queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(double) * N, nullptr, &event);
+    event.wait();
+    queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(double) * N, nullptr, &event);
+    event.wait();
+    queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &event);
+    event.wait();
+
+    norm = norm_w(d_r, d_tmp);
+    norm_0 = norm;
+
+    if (verbosity > 1) {
+        std::ostringstream out;
+        out << std::scientific << "openclSolver initial norm: " << norm_0;
+        OpmLog::info(out.str());
+    }
+
+    t_rest.start();
+    for (it = 0.5; it < maxit; it += 0.5) {
+        rhop = rho;
+        rho = dot_w(d_rw, d_r, d_tmp);
+
+        if (it > 1) {
+            beta = (rho / rhop) * (alpha / omega);
+            custom_w(d_p, d_v, d_r, omega, beta);
+        }
+        t_rest.stop();
+
+        // pw = prec(p)
+        t_prec.start();
+        prec->apply(d_p, d_pw);
+        t_prec.stop();
+
+        // v = A * pw
+        t_spmv.start();
+        spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_pw, d_v);
+        t_spmv.stop();
+
+        // apply wellContributions
+        if (wellContribs.getNumWells() > 0) {
+            t_well.start();
+            wellContribs.apply(d_pw, d_v);
+            t_well.stop();
+        }
+
+        t_rest.start();
+        tmp1 = dot_w(d_rw, d_v, d_tmp);
+        alpha = rho / tmp1;
+        axpy_w(d_v, -alpha, d_r);      // r = r - alpha * v
+        axpy_w(d_pw, alpha, d_x);      // x = x + alpha * pw
+        norm = norm_w(d_r, d_tmp);
+        t_rest.stop();
+
+        if (norm < tolerance * norm_0) {
+            break;
+        }
+
+        it += 0.5;
+
+        // s = prec(r)
+        t_prec.start();
+        prec->apply(d_r, d_s);
+        t_prec.stop();
+
+        // t = A * s
+        t_spmv.start();
+        spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_s, d_t);
+        t_spmv.stop();
+
+        // apply wellContributions
+        if (wellContribs.getNumWells() > 0) {
+            t_well.start();
+            wellContribs.apply(d_s, d_t);
+            t_well.stop();
+        }
+
+        t_rest.start();
+        tmp1 = dot_w(d_t, d_r, d_tmp);
+        tmp2 = dot_w(d_t, d_t, d_tmp);
+        omega = tmp1 / tmp2;
+        axpy_w(d_s, omega, d_x);     // x = x + omega * s
+        axpy_w(d_t, -omega, d_r);    // r = r - omega * t
+        norm = norm_w(d_r, d_tmp);
+        t_rest.stop();
+
+        if (norm < tolerance * norm_0) {
+            break;
+        }
+
+        if (verbosity > 1) {
+            std::ostringstream out;
+            out << "it: " << it << std::scientific << ", norm: " << norm;
+            OpmLog::info(out.str());
+        }
+    }
+
+    res.iterations = std::min(it, (float)maxit);
+    res.reduction = norm / norm_0;
+    res.conv_rate  = static_cast<double>(pow(res.reduction, 1.0 / it));
+    res.elapsed = t_total.stop();
+    res.converged = (it != (maxit + 0.5));
+
+    if (verbosity > 0) {
+        std::ostringstream out;
+        out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
+            ", time per iteration: " << res.elapsed / it << ", iterations: " << it;
+        OpmLog::info(out.str());
+    }
+    if (verbosity >= 4) {
+        std::ostringstream out;
+        out << "openclSolver::ily_apply:   " << t_prec.elapsed() << " s\n";
+        out << "openclSolver::spmv:        " << t_spmv.elapsed() << " s\n";
+        out << "openclSolver::rest:        " << t_rest.elapsed() << " s\n";
+        out << "openclSolver::total_solve: " << res.elapsed << " s\n";
+        OpmLog::info(out.str());
+    }
+}
+
+
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) {
+    this->N = N_;
+    this->nnz = nnz_;
+    this->nnzb = nnz_ / block_size / block_size;
+
+    Nb = (N + dim - 1) / dim;
+    std::ostringstream out;
+    out << "Initializing GPU, matrix size: " << N << " blocks, nnzb: " << nnzb << "\n";
+    out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
+    out << "PlatformID: " << platformID << ", deviceID: " << deviceID << "\n";
+    OpmLog::info(out.str());
+    out.str("");
+    out.clear();
+
+    cl_int err = CL_SUCCESS;
+    try {
+        std::vector<cl::Platform> platforms;
+        cl::Platform::get(&platforms);
+        if (platforms.size() == 0) {
+            OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL platforms are found");
+        }
+        out << "Found " << platforms.size() << " OpenCL platforms" << "\n";
+
+        if (verbosity >= 1) {
+            std::string platform_info;
+            for (unsigned int i = 0; i < platforms.size(); ++i) {
+                platforms[i].getInfo(CL_PLATFORM_NAME, &platform_info);
+                out << "Platform name      : " << platform_info << "\n";
+                platforms[i].getInfo(CL_PLATFORM_VENDOR, &platform_info);
+                out << "Platform vendor    : " << platform_info << "\n";
+                platforms[i].getInfo(CL_PLATFORM_VERSION, &platform_info);
+                out << "Platform version   : " << platform_info << "\n";
+                platforms[i].getInfo(CL_PLATFORM_PROFILE, &platform_info);
+                out << "Platform profile   : " << platform_info << "\n";
+                platforms[i].getInfo(CL_PLATFORM_EXTENSIONS, &platform_info);
+                out << "Platform extensions: " << platform_info << "\n\n";
+            }
+        }
+        OpmLog::info(out.str());
+        out.str("");
+        out.clear();
+
+        if (platforms.size() <= platformID) {
+            OPM_THROW(std::logic_error, "Error chosen too high OpenCL platform ID");
+        } else {
+            std::string platform_info;
+            out << "Chosen:\n";
+            platforms[platformID].getInfo(CL_PLATFORM_NAME, &platform_info);
+            out << "Platform name      : " << platform_info << "\n";
+            platforms[platformID].getInfo(CL_PLATFORM_VERSION, &platform_info);
+            out << "Platform version   : " << platform_info << "\n";
+            OpmLog::info(out.str());
+            out.str("");
+            out.clear();
+        }
+
+        cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[platformID])(), 0};
+        context.reset(new cl::Context(CL_DEVICE_TYPE_GPU, properties));
+
+        std::vector<cl::Device> devices = context->getInfo<CL_CONTEXT_DEVICES>();
+        if (devices.size() == 0){
+            OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL devices are found");
+        }
+        out << "Found " << devices.size() << " OpenCL devices" << "\n";
+
+        if (verbosity >= 1) {
+            for (unsigned int i = 0; i < devices.size(); ++i) {
+                std::string device_info;
+                std::vector<size_t> work_sizes;
+                std::vector<cl_device_partition_property> partitions;
+
+                devices[i].getInfo(CL_DEVICE_NAME, &device_info);
+                out << "CL_DEVICE_NAME            : " << device_info << "\n";
+                devices[i].getInfo(CL_DEVICE_VENDOR, &device_info);
+                out << "CL_DEVICE_VENDOR          : " << device_info << "\n";
+                devices[i].getInfo(CL_DRIVER_VERSION, &device_info);
+                out << "CL_DRIVER_VERSION         : " << device_info << "\n";
+                devices[i].getInfo(CL_DEVICE_BUILT_IN_KERNELS, &device_info);
+                out << "CL_DEVICE_BUILT_IN_KERNELS: " << device_info << "\n";
+                devices[i].getInfo(CL_DEVICE_PROFILE, &device_info);
+                out << "CL_DEVICE_PROFILE         : " << device_info << "\n";
+                devices[i].getInfo(CL_DEVICE_OPENCL_C_VERSION, &device_info);
+                out << "CL_DEVICE_OPENCL_C_VERSION: " << device_info << "\n";
+                devices[i].getInfo(CL_DEVICE_EXTENSIONS, &device_info);
+                out << "CL_DEVICE_EXTENSIONS      : " << device_info << "\n";
+
+                devices[i].getInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES, &work_sizes);
+                for (unsigned int j = 0; j < work_sizes.size(); ++j) {
+                    out << "CL_DEVICE_MAX_WORK_ITEM_SIZES[" << j << "]: " << work_sizes[j] << "\n";
+                }
+                devices[i].getInfo(CL_DEVICE_PARTITION_PROPERTIES, &partitions);
+                for (unsigned int j = 0; j < partitions.size(); ++j) {
+                    out << "CL_DEVICE_PARTITION_PROPERTIES[" << j << "]: " << partitions[j] << "\n";
+                }
+                partitions.clear();
+                devices[i].getInfo(CL_DEVICE_PARTITION_TYPE, &partitions);
+                for (unsigned int j = 0; j < partitions.size(); ++j) {
+                    out << "CL_DEVICE_PARTITION_PROPERTIES[" << j << "]: " << partitions[j] << "\n";
+                }
+
+                // C-style properties
+                cl_device_id tmp_id = devices[i]();
+                cl_ulong size;
+                clGetDeviceInfo(tmp_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &size, 0);
+                out << "CL_DEVICE_LOCAL_MEM_SIZE       : " << size / 1024 << " KB\n";
+                clGetDeviceInfo(tmp_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &size, 0);
+                out << "CL_DEVICE_GLOBAL_MEM_SIZE      : " << size / 1024 / 1024 / 1024 << " GB\n";
+                clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_ulong), &size, 0);
+                out << "CL_DEVICE_MAX_COMPUTE_UNITS    : " << size << "\n";
+                clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &size, 0);
+                out << "CL_DEVICE_MAX_MEM_ALLOC_SIZE   : " << size / 1024 / 1024 << " MB\n";
+                clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(cl_ulong), &size, 0);
+                out << "CL_DEVICE_MAX_WORK_GROUP_SIZE  : " << size << "\n";
+                clGetDeviceInfo(tmp_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &size, 0);
+                out << "CL_DEVICE_GLOBAL_MEM_SIZE      : " << size / 1024 / 1024 / 1024 << " GB\n\n";
+            }
+        }
+        OpmLog::info(out.str());
+        out.str("");
+        out.clear();
+
+        if (devices.size() <= deviceID){
+            OPM_THROW(std::logic_error, "Error chosen too high OpenCL device ID");
+        } else {
+            std::string device_info;
+            out << "Chosen:\n";
+            devices[deviceID].getInfo(CL_DEVICE_NAME, &device_info);
+            out << "CL_DEVICE_NAME            : " << device_info << "\n";
+            devices[deviceID].getInfo(CL_DEVICE_VERSION, &device_info);
+            out << "CL_DEVICE_VERSION         : " << device_info << "\n";
+            OpmLog::info(out.str());
+            out.str("");
+            out.clear();
+        }
+
+        cl::Program::Sources source(1, std::make_pair(axpy_s, strlen(axpy_s)));  // what does this '1' mean? cl::Program::Sources is of type 'std::vector<std::pair<const char*, long unsigned int> >'
+        source.emplace_back(std::make_pair(dot_1_s, strlen(dot_1_s)));
+        source.emplace_back(std::make_pair(norm_s, strlen(norm_s)));
+        source.emplace_back(std::make_pair(custom_s, strlen(custom_s)));
+        source.emplace_back(std::make_pair(spmv_blocked_s, strlen(spmv_blocked_s)));
+        source.emplace_back(std::make_pair(ILU_apply1_s, strlen(ILU_apply1_s)));
+        source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
+        cl::Program program_ = cl::Program(*context, source);
+
+        program_.build(devices);
+
+        cl::Event event;
+        queue.reset(new cl::CommandQueue(*context, devices[deviceID], 0, &err));
+
+        prec->setOpenCLContext(context.get());
+        prec->setOpenCLQueue(queue.get());
+
+        rb = new double[N];
+        tmp = new double[N];
+#if COPY_ROW_BY_ROW
+        vals_contiguous = new double[N];
+#endif
+
+        mat.reset(new BlockedMatrix<block_size>(Nb, nnzb, vals, cols, rows));
+
+        d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_rb = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_r = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_rw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_p = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_pw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_s = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_t = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_v = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_tmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+
+        d_Avals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * nnz);
+        d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
+        d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
+
+        // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
+        // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
+
+        // actually creating the kernels
+        dot_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "dot_1")));
+        norm_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "norm")));
+        axpy_k.reset(new cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program_, "axpy")));
+        custom_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program_, "custom")));
+        spmv_blocked_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "spmv_blocked")));
+        ILU_apply1_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "ILU_apply1")));
+        ILU_apply2_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "ILU_apply2")));
+
+        prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());
+
+    } catch (cl::Error error) {
+        std::ostringstream oss;
+        oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
+        oss << getErrorString(error.err());
+        // rethrow exception
+        OPM_THROW(std::logic_error, oss.str());
+    } catch (std::logic_error error) {
+        // rethrow exception by OPM_THROW in the try{}, without this, a segfault occurs
+        throw error;
+    }
+
+
+    initialized = true;
+} // end initialize()
+
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::finalize() {
+    delete[] rb;
+    delete[] tmp;
+#if COPY_ROW_BY_ROW
+    delete[] vals_contiguous;
+#endif
+    delete prec;
+} // end finalize()
+
+
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::copy_system_to_gpu() {
+    Timer t;
+    cl::Event event;
+
+#if COPY_ROW_BY_ROW
+    int sum = 0;
+    for (int i = 0; i < Nb; ++i) {
+        int size_row = rmat->rowPointers[i + 1] - rmat->rowPointers[i];
+        memcpy(vals_contiguous + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
+        sum += size_row * block_size * block_size;
+    }
+    queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous);
+#else
+    queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues);
+#endif
+
+    queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices);
+    queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers);
+    queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb);
+    queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
+    event.wait();
+
+    if (verbosity > 2) {
+        std::ostringstream out;
+        out << "openclSolver::copy_system_to_gpu(): " << t.stop() << " s";
+        OpmLog::info(out.str());
+    }
+} // end copy_system_to_gpu()
+
+
+// don't copy rowpointers and colindices, they stay the same
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::update_system_on_gpu() {
+    Timer t;
+    cl::Event event;
+
+#if COPY_ROW_BY_ROW
+    int sum = 0;
+    for (int i = 0; i < Nb; ++i) {
+        int size_row = rmat->rowPointers[i + 1] - rmat->rowPointers[i];
+        memcpy(vals_contiguous + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
+        sum += size_row * block_size * block_size;
+    }
+    queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous);
+#else
+    queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues);
+#endif
+
+    queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb);
+    queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
+    event.wait();
+
+    if (verbosity > 2) {
+        std::ostringstream out;
+        out << "openclSolver::update_system_on_gpu(): " << t.stop() << " s";
+        OpmLog::info(out.str());
+    }
+} // end update_system_on_gpu()
+
+
+template <unsigned int block_size>
+bool openclSolverBackend<block_size>::analyse_matrix() {
+    Timer t;
+
+    bool success = prec->init(mat.get());
+    int work_group_size = 32;
+    int num_work_groups = ceilDivision(N, work_group_size);
+    int total_work_items = num_work_groups * work_group_size;
+    int lmem_per_work_group = work_group_size * sizeof(double);
+    prec->setKernelParameters(work_group_size, total_work_items, lmem_per_work_group);
+
+    toOrder = prec->getToOrder();
+    fromOrder = prec->getFromOrder();
+    rmat = prec->getRMat();
+
+    if (verbosity > 2) {
+        std::ostringstream out;
+        out << "openclSolver::analyse_matrix(): " << t.stop() << " s";
+        OpmLog::info(out.str());
+    }
+
+    analysis_done = true;
+
+    return success;
+} // end analyse_matrix()
+
+
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
+    Timer t;
+
+    mat->nnzValues = vals;
+    reorderBlockedVectorByPattern<block_size>(mat->Nb, b, fromOrder, rb);
+
+    if (verbosity > 2) {
+        std::ostringstream out;
+        out << "openclSolver::update_system(): " << t.stop() << " s";
+        OpmLog::info(out.str());
+    }
+} // end update_system()
+
+
+template <unsigned int block_size>
+bool openclSolverBackend<block_size>::create_preconditioner() {
+    Timer t;
+
+    bool result = prec->create_preconditioner(mat.get());
+
+    if (verbosity > 2) {
+        std::ostringstream out;
+        out << "openclSolver::create_preconditioner(): " << t.stop() << " s";
+        OpmLog::info(out.str());
+    }
+    return result;
+} // end create_preconditioner()
+
+
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::solve_system(WellContributions& wellContribs, BdaResult &res) {
+    Timer t;
+
+    // actually solve
+    try {
+        gpu_pbicgstab(wellContribs, res);
+    } catch (cl::Error error) {
+        std::ostringstream oss;
+        oss << "openclSolverBackend::solve_system error: " << error.what() << "(" << error.err() << ")\n";
+        oss << getErrorString(error.err());
+        // rethrow exception
+        OPM_THROW(std::logic_error, oss.str());
+    } catch (std::logic_error error) {
+        // rethrow exception by OPM_THROW in the try{}, without this, a segfault occurs
+        throw error;
+    }
+
+    if (verbosity > 2) {
+        std::ostringstream out;
+        out << "openclSolver::solve_system(): " << t.stop() << " s";
+        OpmLog::info(out.str());
+    }
+
+} // end solve_system()
+
+
+// copy result to host memory
+// caller must be sure that x is a valid array
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::get_result(double *x) {
+    Timer t;
+
+    queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, rb);
+    reorderBlockedVectorByPattern<block_size>(mat->Nb, rb, toOrder, x);
+
+    if (verbosity > 2) {
+        std::ostringstream out;
+        out << "openclSolver::get_result(): " << t.stop() << " s";
+        OpmLog::info(out.str());
+    }
+} // end get_result()
+
+
+
+template <unsigned int block_size>
+SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
+    if (initialized == false) {
+        initialize(N_, nnz_,  dim, vals, rows, cols);
+        if (analysis_done == false) {
+            if (!analyse_matrix()) {
+                return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
+            }
+        }
+        update_system(vals, b);
+        if (!create_preconditioner()) {
+            return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
+        }
+        copy_system_to_gpu();
+    } else {
+        update_system(vals, b);
+        if (!create_preconditioner()) {
+            return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
+        }
+        update_system_on_gpu();
+    }
+    solve_system(wellContribs, res);
+    return SolverStatus::BDA_SOLVER_SUCCESS;
+}
+
+
+#define INSTANTIATE_BDA_FUNCTIONS(n)                                                                 \
+template openclSolverBackend<n>::openclSolverBackend(int, int, double, unsigned int, unsigned int);  \
+
+INSTANTIATE_BDA_FUNCTIONS(1);
+INSTANTIATE_BDA_FUNCTIONS(2);
+INSTANTIATE_BDA_FUNCTIONS(3);
+INSTANTIATE_BDA_FUNCTIONS(4);
+
+#undef INSTANTIATE_BDA_FUNCTIONS
+
+} // namespace bda
+
--- a/opm/simulators/linalg/bda/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp
@ -0,0 +1,211 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef OPM_OPENCLSOLVER_BACKEND_HEADER_INCLUDED
+#define OPM_OPENCLSOLVER_BACKEND_HEADER_INCLUDED
+
+#include <opm/simulators/linalg/bda/opencl.hpp>
+
+#include <opm/simulators/linalg/bda/BdaResult.hpp>
+#include <opm/simulators/linalg/bda/BdaSolver.hpp>
+#include <opm/simulators/linalg/bda/WellContributions.hpp>
+
+#include <opm/simulators/linalg/bda/BILU0.hpp>
+
+namespace bda
+{
+
+/// This class implements a opencl-based ilu0-bicgstab solver on GPU
+template <unsigned int block_size>
+class openclSolverBackend : public BdaSolver<block_size>
+{
+
+    typedef BdaSolver<block_size> Base;
+    typedef BILU0<block_size> Preconditioner;
+
+    using Base::N;
+    using Base::Nb;
+    using Base::nnz;
+    using Base::nnzb;
+    using Base::verbosity;
+    using Base::platformID;
+    using Base::deviceID;
+    using Base::maxit;
+    using Base::tolerance;
+    using Base::initialized;
+
+private:
+
+    double *rb = nullptr;                 // reordered b vector, the matrix is reordered, so b must also be
+    double *vals_contiguous = nullptr;    // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
+
+    bool analysis_done = false;
+
+    // OpenCL variables must be reusable, they are initialized in initialize()
+    cl::Buffer d_Avals, d_Acols, d_Arows;        // (reordered) matrix in BSR format on GPU
+    cl::Buffer d_x, d_b, d_rb, d_r, d_rw, d_p;   // vectors, used during linear solve
+    cl::Buffer d_pw, d_s, d_t, d_v;              // vectors, used during linear solve
+    cl::Buffer d_tmp;                            // used as tmp GPU buffer for dot() and norm()
+    double *tmp = nullptr;                       // used as tmp CPU buffer for dot() and norm()
+
+    // shared pointers are also passed to other objects
+    std::shared_ptr<cl::Context> context;
+    std::shared_ptr<cl::CommandQueue> queue;
+    std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
+    std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
+    std::unique_ptr<cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
+    std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
+    std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > spmv_blocked_k;
+    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply1_k;
+    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply2_k;
+
+    Preconditioner *prec = nullptr;                  // only supported preconditioner is BILU0
+    int *toOrder = nullptr, *fromOrder = nullptr;    // BILU0 reorders rows of the matrix via these mappings
+    std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr;    // original matrix 
+    BlockedMatrix<block_size> *rmat = nullptr;       // reordered matrix, used for spmv
+
+
+
+    /// Divide A by B, and round up: return (int)ceil(A/B)
+    /// \param[in] A    dividend
+    /// \param[in] B    divisor
+    /// \return         rounded division result
+    unsigned int ceilDivision(const unsigned int A, const unsigned int B);
+
+    /// Calculate dot product between in1 and in2, partial sums are stored in out, which are summed on CPU
+    /// \param[in] in1           input vector 1
+    /// \param[in] in2           input vector 2
+    /// \param[out] out          output vector containing partial sums
+    /// \return                  dot product
+    double dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out);
+
+    /// Calculate the norm of in, partial sums are stored in out, which are summed on the CPU
+    /// Equal to Dune::DenseVector::two_norm()
+    /// \param[in] in          input vector
+    /// \param[out] out        output vector containing partial sums
+    /// \return                norm
+    double norm_w(cl::Buffer in, cl::Buffer out);
+
+    /// Perform axpy: out += a * in
+    /// \param[in] in         input vector
+    /// \param[in] a          scalar value to multiply input vector
+    /// \param[inout] out     output vector
+    void axpy_w(cl::Buffer in, const double a, cl::Buffer out);
+
+    /// Custom function that combines scale, axpy and add functions in bicgstab
+    /// p = (p - omega * v) * beta + r
+    /// \param[inout] p      output vector
+    /// \param[in] v         input vector
+    /// \param[in] r         input vector
+    /// \param[in] omega     scalar value
+    /// \param[in] beta      scalar value
+    void custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, const double omega, const double beta);
+
+    /// Sparse matrix-vector multiply, spmv
+    /// b = A * x
+    /// Matrix A, must be in BCRS format
+    /// \param[in] vals     nnzs of matrix A
+    /// \param[in] cols     columnindices of matrix A
+    /// \param[in] rows     rowpointers of matrix A
+    /// \param[in] x        input vector
+    /// \param[out] b       output vector
+    void spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b);
+
+    /// Solve linear system using ilu0-bicgstab
+    /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
+    /// \param[inout] res         summary of solver result
+    void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
+
+    /// Initialize GPU and allocate memory
+    /// \param[in] N              number of nonzeroes, divide by dim*dim to get number of blocks
+    /// \param[in] nnz            number of nonzeroes, divide by dim*dim to get number of blocks
+    /// \param[in] dim            size of block
+    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
+    /// \param[in] rows           array of rowPointers, contains N/dim+1 values
+    /// \param[in] cols           array of columnIndices, contains nnz values
+    void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
+
+    /// Clean memory
+    void finalize();
+
+    /// Copy linear system to GPU
+    void copy_system_to_gpu();
+
+    /// Reorder the linear system so it corresponds with the coloring
+    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
+    /// \param[in] b              input vectors, contains N values
+    void update_system(double *vals, double *b);
+
+    /// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
+    void update_system_on_gpu();
+
+    /// Analyse sparsity pattern to extract parallelism
+    /// \return true iff analysis was successful
+    bool analyse_matrix();
+
+    /// Perform ilu0-decomposition
+    /// \return true iff decomposition was successful
+    bool create_preconditioner();
+
+    /// Solve linear system
+    /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
+    /// \param[inout] res         summary of solver result
+    void solve_system(WellContributions& wellContribs, BdaResult &res);
+
+public:
+
+    /// Construct a openclSolver
+    /// \param[in] linear_solver_verbosity    verbosity of openclSolver
+    /// \param[in] maxit                      maximum number of iterations for openclSolver
+    /// \param[in] tolerance                  required relative tolerance for openclSolver
+    /// \param[in] platformID                 the OpenCL platform to be used
+    /// \param[in] deviceID                   the device to be used
+    openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID);
+
+    /// Destroy a openclSolver, and free memory
+    ~openclSolverBackend();
+
+    /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
+    /// \param[in] N              number of rows, divide by dim to get number of blockrows
+    /// \param[in] nnz            number of nonzeroes, divide by dim*dim to get number of blocks
+    /// \param[in] nnz_prec       number of nonzeroes of matrix for ILU0, divide by dim*dim to get number of blocks
+    /// \param[in] dim            size of block
+    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
+    /// \param[in] rows           array of rowPointers, contains N/dim+1 values
+    /// \param[in] cols           array of columnIndices, contains nnz values
+    /// \param[in] vals_prec      array of nonzeroes for preconditioner
+    /// \param[in] rows_prec      array of rowPointers for preconditioner
+    /// \param[in] cols_prec      array of columnIndices for preconditioner
+    /// \param[in] b              input vector, contains N values
+    /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
+    /// \param[inout] res         summary of solver result
+    /// \return                   status code
+    SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
+
+    /// Get result after linear solve, and peform postprocessing if necessary
+    /// \param[inout] x          resulting x vector, caller must guarantee that x points to a valid array
+    void get_result(double *x) override;
+
+}; // end class openclSolverBackend
+
+} // namespace bda
+
+#endif
+
+
--- a/opm/simulators/wells/BlackoilWellModel.hpp
+++ b/opm/simulators/wells/BlackoilWellModel.hpp
@ -224,7 +224,7 @@ namespace Opm {
            // subtract B*inv(D)*C * x from A*x
            void apply(const BVector& x, BVector& Ax) const;

-#if HAVE_CUDA
+#if HAVE_CUDA || HAVE_OPENCL
            // accumulate the contributions of all Wells in the WellContributions object
            void getWellContributions(WellContributions& x) const;
 #endif
--- a/opm/simulators/wells/BlackoilWellModel_impl.hpp
+++ b/opm/simulators/wells/BlackoilWellModel_impl.hpp
@ -898,7 +898,7 @@ namespace Opm {
        }
    }

-#if HAVE_CUDA
+#if HAVE_CUDA || HAVE_OPENCL
    template<typename TypeTag>
    void
    BlackoilWellModel<TypeTag>::
--- a/opm/simulators/wells/MultisegmentWell.hpp
+++ b/opm/simulators/wells/MultisegmentWell.hpp
@ -135,7 +135,7 @@ namespace Opm
        /// r = r - C D^-1 Rw
        virtual void apply(BVector& r) const override;

-#if HAVE_CUDA
+#if HAVE_CUDA || HAVE_OPENCL
        /// add the contribution (C, D, B matrices) of this Well to the WellContributions object
        void addWellContribution(WellContributions& wellContribs) const;
 #endif
--- a/opm/simulators/wells/MultisegmentWell_impl.hpp
+++ b/opm/simulators/wells/MultisegmentWell_impl.hpp
@ -648,7 +648,7 @@ namespace Opm



-#if HAVE_CUDA
+#if HAVE_CUDA || HAVE_OPENCL
    template<typename TypeTag>
    void
    MultisegmentWell<TypeTag>::
--- a/opm/simulators/wells/StandardWell.hpp
+++ b/opm/simulators/wells/StandardWell.hpp
@ -23,6 +23,9 @@
 #ifndef OPM_STANDARDWELL_HEADER_INCLUDED
 #define OPM_STANDARDWELL_HEADER_INCLUDED

+#if HAVE_CUDA || HAVE_OPENCL
+#include <opm/simulators/linalg/bda/WellContributions.hpp>
+#endif

 #include <opm/simulators/wells/RateConverter.hpp>
 #include <opm/simulators/wells/WellInterface.hpp>
@ -184,7 +187,7 @@ namespace Opm
        /// r = r - C D^-1 Rw
        virtual void apply(BVector& r) const override;

-#if HAVE_CUDA
+#if HAVE_CUDA || HAVE_OPENCL
        /// add the contribution (C, D^-1, B matrices) of this Well to the WellContributions object
        void addWellContribution(WellContributions& wellContribs) const;

--- a/opm/simulators/wells/StandardWell_impl.hpp
+++ b/opm/simulators/wells/StandardWell_impl.hpp
@ -2303,7 +2303,7 @@ namespace Opm
        duneC_.mmtv(invDrw_, r);
    }

-#if HAVE_CUDA
+#if HAVE_CUDA || HAVE_OPENCL
    template<typename TypeTag>
    void
    StandardWell<TypeTag>::