Added openclSolver

Usage: --gpu-mode=[cusparse|opencl|none] on command line
2025-02-25 18:55:30 -06:00 · 2020-06-22 18:26:49 +02:00 · 2020-06-22 18:26:49 +02:00 · f974a5f6db
commit f974a5f6db
parent e4ea932a9d
24 changed files with 3715 additions and 964 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -134,6 +134,14 @@ if(CUDA_FOUND)
  include_directories(${CUDA_INCLUDE_DIRS})
 endif()

+
+find_package(OpenCL)
+
+if(OpenCL_FOUND)
+  set(HAVE_OPENCL 1)
+  include_directories(${OpenCL_INCLUDE_DIRS})
+endif()
+
 # read the list of components from this file (in the project directory);
 # it should set various lists with the names of the files to include
 include (CMakeLists_files.cmake)
@ -386,3 +394,7 @@ if(CUDA_FOUND)
  target_link_libraries( opmsimulators PUBLIC ${CUDA_cublas_LIBRARY} )
 endif()

+if(OpenCL_FOUND)
+  target_link_libraries( opmsimulators ${OpenCL_LIBRARIES} )
+endif()
+
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@ -46,6 +46,13 @@ if(CUDA_FOUND)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
 endif()
+if(OPENCL_FOUND)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BlockedMatrix.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BILU0.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Reorder.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
+endif()
 if(MPI_FOUND)
  list(APPEND MAIN_SOURCE_FILES opm/simulators/utils/ParallelEclipseState.cpp
                                opm/simulators/utils/ParallelSerialization.cpp)
@ -142,8 +149,14 @@ list (APPEND PUBLIC_HEADER_FILES
  opm/simulators/aquifers/BlackoilAquiferModel_impl.hpp
  opm/simulators/linalg/bda/BdaBridge.hpp
  opm/simulators/linalg/bda/BdaResult.hpp
+  opm/simulators/linalg/bda/BdaSolver.hpp
+  opm/simulators/linalg/bda/BILU0.hpp
+  opm/simulators/linalg/bda/BlockedMatrix.hpp
  opm/simulators/linalg/bda/cuda_header.hpp
  opm/simulators/linalg/bda/cusparseSolverBackend.hpp
+  opm/simulators/linalg/bda/Reorder.hpp
+  opm/simulators/linalg/bda/openclKernels.hpp
+  opm/simulators/linalg/bda/openclSolverBackend.hpp
  opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
  opm/simulators/linalg/bda/WellContributions.hpp
  opm/simulators/linalg/BlackoilAmg.hpp
--- a/opm-simulators-prereqs.cmake
+++ b/opm-simulators-prereqs.cmake
@ -6,6 +6,7 @@ set (opm-simulators_CONFIG_VAR
  HAVE_MPI
  HAVE_PETSC
  HAVE_CUDA
+  HAVE_OPENCL
  HAVE_SUITESPARSE_UMFPACK_H
  HAVE_DUNE_ISTL
  DUNE_ISTL_VERSION_MAJOR
--- a/opm/simulators/linalg/FlowLinearSolverParameters.hpp
+++ b/opm/simulators/linalg/FlowLinearSolverParameters.hpp
@ -67,7 +67,7 @@ NEW_PROP_TAG(CprEllSolvetype);
 NEW_PROP_TAG(CprReuseSetup);
 NEW_PROP_TAG(LinearSolverConfiguration);
 NEW_PROP_TAG(LinearSolverConfigurationJsonFile);
-NEW_PROP_TAG(UseGpu);
+NEW_PROP_TAG(GpuMode);

 SET_SCALAR_PROP(FlowIstlSolverParams, LinearSolverReduction, 1e-2);
 SET_SCALAR_PROP(FlowIstlSolverParams, IluRelaxation, 0.9);
@ -94,7 +94,7 @@ SET_INT_PROP(FlowIstlSolverParams, CprEllSolvetype, 0);
 SET_INT_PROP(FlowIstlSolverParams, CprReuseSetup, 3);
 SET_STRING_PROP(FlowIstlSolverParams, LinearSolverConfiguration, "ilu0");
 SET_STRING_PROP(FlowIstlSolverParams, LinearSolverConfigurationJsonFile, "none");
-SET_BOOL_PROP(FlowIstlSolverParams, UseGpu, false);
+SET_STRING_PROP(FlowIstlSolverParams, GpuMode, "none");



@ -167,7 +167,7 @@ namespace Opm
        bool scale_linear_system_;
        std::string linear_solver_configuration_;
        std::string linear_solver_configuration_json_file_;
-        bool use_gpu_;
+        std::string gpu_mode_;

        template <class TypeTag>
        void init()
@ -196,7 +196,7 @@ namespace Opm
            cpr_reuse_setup_  =  EWOMS_GET_PARAM(TypeTag, int, CprReuseSetup);
            linear_solver_configuration_ = EWOMS_GET_PARAM(TypeTag, std::string, LinearSolverConfiguration);
            linear_solver_configuration_json_file_ = EWOMS_GET_PARAM(TypeTag, std::string, LinearSolverConfigurationJsonFile);
-            use_gpu_ = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
+            gpu_mode_ = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
        }

        template <class TypeTag>
@ -225,7 +225,7 @@ namespace Opm
            EWOMS_REGISTER_PARAM(TypeTag, int, CprReuseSetup, "Reuse Amg Setup");
            EWOMS_REGISTER_PARAM(TypeTag, std::string, LinearSolverConfiguration, "Configuration of solver valid is: ilu0 (default), cpr_quasiimpes, cpr_trueimpes or file (specified in LinearSolverConfigurationJsonFile) ");
            EWOMS_REGISTER_PARAM(TypeTag, std::string, LinearSolverConfigurationJsonFile, "Filename of JSON configuration for flexible linear solver system.");
-            EWOMS_REGISTER_PARAM(TypeTag, bool, UseGpu, "Use GPU cusparseSolver as the linear solver");
+            EWOMS_REGISTER_PARAM(TypeTag, std::string, GpuMode, "Use GPU cusparseSolver or openclSolver as the linear solver");
        }

        FlowLinearSolverParameters() { reset(); }
@ -247,7 +247,7 @@ namespace Opm
            ilu_milu_                 = MILU_VARIANT::ILU;
            ilu_redblack_             = false;
            ilu_reorder_sphere_       = true;
-            use_gpu_                  = false;
+            gpu_mode_                 = "none";
        }
    };

--- a/opm/simulators/linalg/ISTLSolverEbos.hpp
+++ b/opm/simulators/linalg/ISTLSolverEbos.hpp
@ -317,18 +317,18 @@ protected:
            }
            const auto& gridForConn = simulator_.vanguard().grid();
 #if HAVE_CUDA
-            bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
-            if (gridForConn.comm().size() > 1 && use_gpu) {
+            std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
+            if (gridForConn.comm().size() > 1 && "none" != gpu_mode) {
                OpmLog::warning("Warning cannot use GPU with MPI, GPU is disabled");
-                use_gpu = false;
+                gpu_mode = "none";
            }
            const int maxit = EWOMS_GET_PARAM(TypeTag, int, LinearSolverMaxIter);
            const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction);
            const int linear_solver_verbosity = parameters_.linear_solver_verbosity_;
-            bdaBridge.reset(new BdaBridge(use_gpu, linear_solver_verbosity, maxit, tolerance));
+            bdaBridge.reset(new BdaBridge(gpu_mode, linear_solver_verbosity, maxit, tolerance));
 #else
-            const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
-            if (use_gpu) {
+            const bool gpu_mode = EWOMS_GET_PARAM(TypeTag, bool, GpuMode);
+            if ("none" != gpu_mode) {
                OPM_THROW(std::logic_error,"Error cannot use GPU solver since CUDA was not found during compilation");
            }
 #endif
--- a/opm/simulators/linalg/bda/BILU0.cpp
+++ b/opm/simulators/linalg/bda/BILU0.cpp
@ -0,0 +1,352 @@
+/*
+  Copyright 2019 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <sys/time.h>
+
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+
+#include <opm/simulators/linalg/bda/BdaSolver.hpp>
+#include <opm/simulators/linalg/bda/BILU0.hpp>
+#include <opm/simulators/linalg/bda/Reorder.hpp>
+
+
+namespace bda
+{
+
+    using Opm::OpmLog;
+
+    BILU0::BILU0(bool level_scheduling_, bool graph_coloring_, int verbosity_) :
+        level_scheduling(level_scheduling_), graph_coloring(graph_coloring_), verbosity(verbosity_)
+    {
+        if (level_scheduling == graph_coloring) {
+            OPM_THROW(std::logic_error, "Error, either level_scheduling or graph_coloring must be true, not both\n");
+        }
+    }
+
+    BILU0::~BILU0()
+    {
+        delete[] invDiagVals;
+        delete[] diagIndex;
+        delete[] rowsPerColor;
+        freeBlockedMatrix(&LUMat);
+        freeBlockedMatrix(&LMat);
+        freeBlockedMatrix(&UMat);
+        delete[] toOrder;
+        delete[] fromOrder;
+        freeBlockedMatrix(&rMat);
+    }
+
+    bool BILU0::init(BlockedMatrix *mat, unsigned int block_size)
+    {
+        double t1 = 0.0, t2 = 0.0;
+        BlockedMatrix *CSCmat = nullptr;
+
+        this->N = mat->Nb * block_size;
+        this->Nb = mat->Nb;
+        this->nnz = mat->nnzbs * block_size * block_size;
+        this->nnzbs = mat->nnzbs;
+        this->block_size = block_size;
+
+        toOrder = new int[N];
+        fromOrder = new int[N];
+        if (level_scheduling) {
+            CSCmat = new BlockedMatrix();
+            CSCmat->Nb = Nb;
+            CSCmat->nnzbs = nnzbs;
+            CSCmat->nnzValues = new Block[nnzbs];
+            CSCmat->colIndices = new int[nnzbs];
+            CSCmat->rowPointers = new int[Nb + 1];
+            if(verbosity >= 3){
+                t1 = BdaSolver::second();
+            }
+            bcsr_to_bcsc(mat->nnzValues, mat->colIndices, mat->rowPointers, CSCmat->nnzValues, CSCmat->colIndices, CSCmat->rowPointers, mat->Nb);
+            if(verbosity >= 3){
+                t2 = BdaSolver::second();
+                std::ostringstream out;
+                out << "BILU0 convert CSR to CSC: " << t2 - t1 << " s";
+                OpmLog::info(out.str());
+            }
+        }
+
+        if(verbosity >= 3){
+            t1 = BdaSolver::second();
+        }
+        rMat = allocateBlockedMatrix(mat->Nb, mat->nnzbs);
+        LUMat = soft_copyBlockedMatrix(rMat);
+        if (level_scheduling) {
+            rowsPerColor = findLevelScheduling(mat->colIndices, mat->rowPointers, CSCmat->colIndices, CSCmat->rowPointers, mat->Nb, &numColors, toOrder, fromOrder);
+        } else if (graph_coloring) {
+            rowsPerColor = findGraphColoring(mat->colIndices, mat->rowPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder, fromOrder);
+        }
+        if (rowsPerColor == nullptr) {
+            return false;
+        }
+        if(verbosity >= 3){
+            t2 = BdaSolver::second();
+            std::ostringstream out;
+            out << "BILU0 analysis took: " << t2 - t1 << " s, " << numColors << " colors";
+            OpmLog::info(out.str());
+        }
+
+        diagIndex = new int[mat->Nb];
+        invDiagVals = new Block[mat->Nb];
+
+        LMat = allocateBlockedMatrix(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
+        UMat = allocateBlockedMatrix(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
+
+        LUMat->nnzValues = new Block[mat->nnzbs];
+
+        if (level_scheduling) {
+            delete[] CSCmat->nnzValues;
+            delete[] CSCmat->colIndices;
+            delete[] CSCmat->rowPointers;
+            delete CSCmat;
+        }
+
+
+        s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Block) * LMat->nnzbs);
+        s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Block) * UMat->nnzbs);
+        s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LMat->nnzbs);
+        s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * UMat->nnzbs);
+        s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LMat->Nb + 1));
+        s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (UMat->Nb + 1));
+        s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Block) * mat->Nb);
+        s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1));
+
+        queue->enqueueWriteBuffer(s.Lvals, CL_TRUE, 0, LMat->nnzbs * sizeof(Block), LMat->nnzValues);
+        queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, UMat->nnzbs * sizeof(Block), UMat->nnzValues);
+        queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, LMat->nnzbs * sizeof(int), LMat->colIndices);
+        queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, UMat->nnzbs * sizeof(int), UMat->colIndices);
+        queue->enqueueWriteBuffer(s.Lrows, CL_TRUE, 0, (LMat->Nb + 1) * sizeof(int), LMat->rowPointers);
+        queue->enqueueWriteBuffer(s.Urows, CL_TRUE, 0, (UMat->Nb + 1) * sizeof(int), UMat->rowPointers);
+        queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, mat->Nb * sizeof(Block), invDiagVals);
+
+        int *rowsPerColorPrefix = new int[numColors + 1];
+        int prefix = 0;
+        for (int i = 0; i < numColors + 1; ++i) {
+            rowsPerColorPrefix[i] = prefix;
+            prefix += rowsPerColor[i];
+        }
+        queue->enqueueWriteBuffer(s.rowsPerColor, CL_TRUE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix);
+        delete[] rowsPerColorPrefix;
+
+        return true;
+    } // end init()
+
+
+    bool BILU0::create_preconditioner(BlockedMatrix *mat)
+    {
+        double t1 = 0.0, t2 = 0.0;
+        if (verbosity >= 3){
+            t1 = BdaSolver::second();
+        }
+        blocked_reorder_matrix_by_pattern(mat, toOrder, fromOrder, rMat);
+        if (verbosity >= 3){
+            t2 = BdaSolver::second();
+            std::ostringstream out;
+            out << "BILU0 reorder matrix: " << t2 - t1 << " s";
+            OpmLog::info(out.str());
+        }
+
+        // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
+        if (verbosity >= 3){
+            t1 = BdaSolver::second();
+        }
+        memcpy(LUMat->nnzValues, rMat->nnzValues, sizeof(Block) * rMat->nnzbs);
+        if (verbosity >= 3){
+            t2 = BdaSolver::second();
+            std::ostringstream out;
+            out << "BILU0 memcpy: " << t2 - t1 << " s";
+            OpmLog::info(out.str());
+        }
+
+        int i, j, ij, ik, jk;
+        int iRowStart, iRowEnd, jRowEnd;
+        Block pivot;
+
+        int LSize = 0;
+        const int blockSquare = block_size * block_size;
+
+        if (verbosity >= 3){
+            t1 = BdaSolver::second();
+        }
+        // go through all rows
+        for (i = 0; i < LUMat->Nb; i++) {
+            iRowStart = LUMat->rowPointers[i];
+            iRowEnd = LUMat->rowPointers[i + 1];
+
+            // go through all elements of the row
+            for (ij = iRowStart; ij < iRowEnd; ij++) {
+                j = LUMat->colIndices[ij];
+                // if the element is the diagonal, store the index and go to next row
+                if (j == i) {
+                    diagIndex[i] = ij;
+                    break;
+                }
+                // if an element beyond the diagonal is reach, no diagonal was found
+                // throw an error now. TODO: perform reordering earlier to prevent this
+                if (j > i) {
+                    std::ostringstream out;
+                    out << "BILU0 Error could not find diagonal value in row: " << i;
+                    OpmLog::error(out.str());
+                    return false;
+                }
+
+                LSize++;
+                // calculate the pivot of this row
+                blockMult(LUMat->nnzValues[ij], invDiagVals[j], pivot);
+
+                memcpy(LUMat->nnzValues[ij], &pivot, sizeof(double)*block_size * block_size);
+
+                jRowEnd = LUMat->rowPointers[j + 1];
+                jk = diagIndex[j] + 1;
+                ik = ij + 1;
+                // substract that row scaled by the pivot from this row.
+                while (ik < iRowEnd && jk < jRowEnd) {
+                    if (LUMat->colIndices[ik] == LUMat->colIndices[jk]) {
+                        blockMultSub(LUMat->nnzValues[ik], pivot, LUMat->nnzValues[jk]);
+                        ik++;
+                        jk++;
+                    } else {
+                        if (LUMat->colIndices[ik] < LUMat->colIndices[jk])
+                        { ik++; }
+                        else
+                        { jk++; }
+                    }
+                }
+            }
+            // store the inverse in the diagonal!
+            blockInvert3x3(LUMat->nnzValues[ij], invDiagVals[i]);
+
+            memcpy(LUMat->nnzValues[ij], invDiagVals[i], sizeof(double)*block_size * block_size);
+        }
+
+        LMat->rowPointers[0] = 0;
+        UMat->rowPointers[0] = 0;
+
+        // Split the LU matrix into two by comparing column indices to diagonal indices
+        for (i = 0; i < LUMat->Nb; i++) {
+            int offsetL = LMat->rowPointers[i];
+            int rowSize = diagIndex[i] - LUMat->rowPointers[i];
+            int offsetLU = LUMat->rowPointers[i];
+            memcpy(LMat->nnzValues[offsetL], LUMat->nnzValues[offsetLU], sizeof(double) * blockSquare * rowSize);
+            memcpy(LMat->colIndices + offsetL, LUMat->colIndices + offsetLU, sizeof(int) * rowSize);
+            offsetL += rowSize;
+            LMat->rowPointers[i + 1] = offsetL;
+        }
+        // Reverse the order or the (blocked) rows for the U matrix,
+        // because the rows are accessed in reverse order when applying the ILU0
+        int URowIndex = 0;
+        for (i = LUMat->Nb - 1; i >= 0; i--) {
+            int offsetU = UMat->rowPointers[URowIndex];
+            int rowSize = LUMat->rowPointers[i + 1] - diagIndex[i] - 1;
+            int offsetLU = diagIndex[i] + 1;
+            memcpy(UMat->nnzValues[offsetU], LUMat->nnzValues[offsetLU], sizeof(double) * blockSquare * rowSize);
+            memcpy(UMat->colIndices + offsetU, LUMat->colIndices + offsetLU, sizeof(int) * rowSize);
+            offsetU += rowSize;
+            UMat->rowPointers[URowIndex + 1] = offsetU;
+            URowIndex++;
+        }
+        if (verbosity >= 3) {
+            t2 = BdaSolver::second();
+            std::ostringstream out;
+            out << "BILU0 decomposition: " << t2 - t1 << " s";
+            OpmLog::info(out.str());
+        }
+
+        if (verbosity >= 3) {
+            t1 = BdaSolver::second();
+        }
+        if (pattern_uploaded == false) {
+            queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, LMat->nnzbs * sizeof(int), LMat->colIndices);
+            queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, UMat->nnzbs * sizeof(int), UMat->colIndices);
+            queue->enqueueWriteBuffer(s.Lrows, CL_TRUE, 0, (LMat->Nb + 1) * sizeof(int), LMat->rowPointers);
+            queue->enqueueWriteBuffer(s.Urows, CL_TRUE, 0, (UMat->Nb + 1) * sizeof(int), UMat->rowPointers);
+            pattern_uploaded = true;
+        }
+        queue->enqueueWriteBuffer(s.Lvals, CL_TRUE, 0, LMat->nnzbs * sizeof(Block), LMat->nnzValues);
+        queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, UMat->nnzbs * sizeof(Block), UMat->nnzValues);
+        queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, Nb * sizeof(Block), invDiagVals);
+        if (verbosity >= 3) {
+            t2 = BdaSolver::second();
+            std::ostringstream out;
+            out << "BILU0 copy to GPU: " << t2 - t1 << " s";
+            OpmLog::info(out.str());
+        }
+
+        return true;
+    } // end create_preconditioner()
+
+    // kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
+    // however, if individual kernel calls are timed, waiting for events is needed
+    // behavior on other GPUs is untested
+    void BILU0::apply(cl::Buffer& x, cl::Buffer& y)
+    {
+        double t1 = 0.0, t2 = 0.0;
+        if (verbosity >= 3) {
+            t1 = BdaSolver::second();
+        }
+        const unsigned int block_size = 3;
+        cl::Event event;
+
+        for(unsigned int color = 0; color < numColors; ++color){
+            event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, (unsigned int)Nb, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
+            // event.wait();
+        }
+        for(int color = numColors-1; color >= 0; --color){
+            event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, (unsigned int)Nb, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
+            // event.wait();
+        }
+
+        if (verbosity >= 3) {
+            event.wait();
+            t2 = BdaSolver::second();
+            std::ostringstream out;
+            out << "BILU0 apply: " << t2 - t1 << " s";
+            OpmLog::info(out.str());
+        }
+    }
+
+
+    void BILU0::setOpenCLContext(cl::Context *context){
+        this->context = context;
+    }
+    void BILU0::setOpenCLQueue(cl::CommandQueue *queue){
+        this->queue = queue;
+    }
+    void BILU0::setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group){
+        this->work_group_size = work_group_size;
+        this->total_work_items = total_work_items;
+        this->lmem_per_work_group = lmem_per_work_group;
+    }
+    void BILU0::setKernels(
+        cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1,
+        cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2
+    ){
+        this->ILU_apply1 = ILU_apply1;
+        this->ILU_apply2 = ILU_apply2;
+    }
+
+} // end namespace bda
+
+
--- a/opm/simulators/linalg/bda/BILU0.hpp
+++ b/opm/simulators/linalg/bda/BILU0.hpp
@ -0,0 +1,118 @@
+/*
+  Copyright 2019 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef BILU0_HPP
+#define BILU0_HPP
+
+#include <config.h> // CMake
+
+#include <memory>
+
+#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
+
+#if HAVE_OPENCL
+#define __CL_ENABLE_EXCEPTIONS
+#include <CL/cl.hpp>            // up to OpenCL 1.2
+#endif
+
+namespace bda
+{
+
+    /// This class implementa a Blocked ILU0 preconditioner
+    /// The decomposition is done on CPU, and reorders the rows of the matrix
+    class BILU0
+    {
+
+    private:
+        int N;       // number of rows of the matrix
+        int Nb;      // number of blockrows of the matrix
+        int nnz;     // number of nonzeroes of the matrix (scalar)
+        int nnzbs;   // number of blocks of the matrix
+        unsigned int block_size;
+        BlockedMatrix *LMat, *UMat, *LUMat;
+        BlockedMatrix *rMat = nullptr; // only used with PAR_SIM
+        Block *invDiagVals;
+        int *diagIndex, *rowsPerColor;
+        int *toOrder, *fromOrder;
+        int numColors;
+        int verbosity;
+
+        bool level_scheduling, graph_coloring;
+
+        typedef struct {
+            cl::Buffer Lvals, Uvals, invDiagVals;
+            cl::Buffer Lcols, Lrows;
+            cl::Buffer Ucols, Urows;
+            cl::Buffer rowsPerColor;
+        } GPU_storage;
+
+        cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1;
+        cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2;
+        GPU_storage s;
+        cl::Context *context;
+        cl::CommandQueue *queue;
+        int work_group_size = 0;
+        int total_work_items = 0;
+        int lmem_per_work_group = 0;
+        bool pattern_uploaded = false;
+
+    public:
+
+        BILU0(bool level_scheduling, bool graph_coloring, int verbosity);
+
+        ~BILU0();
+
+        // analysis
+        bool init(BlockedMatrix *mat, unsigned int block_size);
+
+        // ilu_decomposition
+        bool create_preconditioner(BlockedMatrix *mat);
+
+        // apply preconditioner, y = prec(x)
+        void apply(cl::Buffer& x, cl::Buffer& y);
+
+        void setOpenCLContext(cl::Context *context);
+        void setOpenCLQueue(cl::CommandQueue *queue);
+        void setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group);
+        void setKernels(
+            cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1,
+            cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2
+            );
+
+        int* getToOrder()
+        {
+            return toOrder;
+        }
+
+        int* getFromOrder()
+        {
+            return fromOrder;
+        }
+
+        BlockedMatrix* getRMat()
+        {
+            return rMat;
+        }
+
+    };
+
+} // end namespace bda
+
+#endif
+
--- a/opm/simulators/linalg/bda/BdaBridge.cpp
+++ b/opm/simulators/linalg/bda/BdaBridge.cpp
@ -35,11 +35,24 @@ typedef Dune::InverseOperatorResult InverseOperatorResult;
 namespace Opm
 {

-BdaBridge::BdaBridge(bool use_gpu_, int linear_solver_verbosity, int maxit, double tolerance)
-    : use_gpu(use_gpu_)
+    using bda::BdaResult;
+    using bda::BdaSolver;
+
+BdaBridge::BdaBridge(std::string gpu_mode_, int linear_solver_verbosity, int maxit, double tolerance)
+    : gpu_mode(gpu_mode_)
 {
-    if (use_gpu) {
-        backend.reset(new cusparseSolverBackend(linear_solver_verbosity, maxit, tolerance));
+    if (gpu_mode == "cusparse") {
+        use_gpu = true;
+        backend.reset(new bda::cusparseSolverBackend(linear_solver_verbosity, maxit, tolerance));
+    } else if (gpu_mode == "opencl") {
+#if HAVE_OPENCL
+        use_gpu = true;
+        backend.reset(new bda::openclSolverBackend(linear_solver_verbosity, maxit, tolerance));
+#else
+        OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
+#endif
+    } else if (gpu_mode != "none") {
+        OPM_THROW(std::logic_error, "Error unknown value for parameter 'GpuMode', should be passed like '--gpu-mode=[none|cusparse|opencl]");
    }
 }

@ -159,21 +172,21 @@ void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_U
        /////////////////////////
        // actually solve

-        typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus;
+        typedef BdaSolver::BdaSolverStatus BdaSolverStatus;
        // assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, cusparseSolver is expected to perform undefined behaviour
-        cusparseSolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), wellContribs, result);
+        BdaSolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), wellContribs, result);
        switch(status) {
-        case cusparseSolverStatus::CUSPARSE_SOLVER_SUCCESS:
+        case BdaSolverStatus::BDA_SOLVER_SUCCESS:
            //OpmLog::info("cusparseSolver converged");
            break;
-        case cusparseSolverStatus::CUSPARSE_SOLVER_ANALYSIS_FAILED:
-            OpmLog::warning("cusparseSolver could not analyse level information of matrix, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
+        case BdaSolverStatus::BDA_SOLVER_ANALYSIS_FAILED:
+            OpmLog::warning("BdaSolver could not analyse level information of matrix, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
            break;
-        case cusparseSolverStatus::CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED:
-            OpmLog::warning("cusparseSolver could not create preconditioner, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
+        case BdaSolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED:
+            OpmLog::warning("BdaSolver could not create preconditioner, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
            break;
        default:
-            OpmLog::warning("cusparseSolver returned unknown status code");
+            OpmLog::warning("BdaSolver returned unknown status code");
        }

        res.iterations = result.iterations;
@ -190,7 +203,7 @@ void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_U
 template <class BridgeVector>
 void BdaBridge::get_result(BridgeVector &x OPM_UNUSED) {
    if (use_gpu) {
-        backend->post_process(static_cast<double*>(&(x[0][0])));
+        backend->get_result(static_cast<double*>(&(x[0][0])));
    }
 }

--- a/opm/simulators/linalg/bda/BdaBridge.hpp
+++ b/opm/simulators/linalg/bda/BdaBridge.hpp
@ -22,10 +22,6 @@

 #include <config.h>

-#if ! HAVE_CUDA
-  #error "This file should only be included if CUDA is found"
-#endif
-
 #include "dune/istl/solver.hh" // for struct InverseOperatorResult

 #include "dune/istl/bcrsmatrix.hh"
@ -33,7 +29,13 @@

 #include <opm/simulators/linalg/bda/WellContributions.hpp>

+#if HAVE_CUDA
 #include <opm/simulators/linalg/bda/cusparseSolverBackend.hpp>
+#endif
+
+#if HAVE_OPENCL
+#include <opm/simulators/linalg/bda/openclSolverBackend.hpp>
+#endif

 namespace Opm
 {
@ -45,16 +47,17 @@ typedef Dune::InverseOperatorResult InverseOperatorResult;
 class BdaBridge
 {
 private:
-    std::unique_ptr<cusparseSolverBackend> backend;
+    std::unique_ptr<bda::BdaSolver> backend;
+    std::string gpu_mode;
    bool use_gpu;

 public:
    /// Construct a BdaBridge
-    /// \param[in] use_gpu                    true iff the cusparseSolver is used, is passed via command-line: '--use-gpu=[true|false]'
-    /// \param[in] linear_solver_verbosity    verbosity of cusparseSolver
-    /// \param[in] maxit                      maximum number of iterations for cusparseSolver
-    /// \param[in] tolerance                  required relative tolerance for cusparseSolver
-    BdaBridge(bool use_gpu, int linear_solver_verbosity, int maxit, double tolerance);
+    /// \param[in] gpu_mode                   to select if a gpu solver is used, is passed via command-line: '--gpu-mode=[none|cusparse|opencl]'
+    /// \param[in] linear_solver_verbosity    verbosity of BdaSolver
+    /// \param[in] maxit                      maximum number of iterations for BdaSolver
+    /// \param[in] tolerance                  required relative tolerance for BdaSolver
+    BdaBridge(std::string gpu_mode, int linear_solver_verbosity, int maxit, double tolerance);


    /// Solve linear system, A*x = b
--- a/opm/simulators/linalg/bda/BdaResult.hpp
+++ b/opm/simulators/linalg/bda/BdaResult.hpp
@ -20,7 +20,7 @@
 #ifndef BDARESULT_HEADER_INCLUDED
 #define BDARESULT_HEADER_INCLUDED

-namespace Opm
+namespace bda
 {

 /// This class is based on InverseOperatorResult struct from dune/istl/solver.hh
--- a/opm/simulators/linalg/bda/BdaSolver.hpp
+++ b/opm/simulators/linalg/bda/BdaSolver.hpp
@ -0,0 +1,94 @@
+/*
+  Copyright 2019 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef OPM_BDASOLVER_BACKEND_HEADER_INCLUDED
+#define OPM_BDASOLVER_BACKEND_HEADER_INCLUDED
+
+
+#include <string>
+#include <vector>
+#include <sys/time.h>
+
+#include <opm/simulators/linalg/bda/BdaResult.hpp>
+#include <opm/simulators/linalg/bda/WellContributions.hpp>
+
+namespace bda
+{
+
+    using Opm::WellContributions;
+
+    /// This class serves to simplify choosing between different backend solvers, such as cusparseSolver and openclSolver
+    /// This class is abstract, no instantiations can of it can be made, only of its children
+    class BdaSolver
+    {
+
+    protected:
+
+        // verbosity
+        // 0: print nothing during solves, only when initializing
+        // 1: print number of iterations and final norm
+        // 2: also print norm each iteration
+        // 3: also print timings of different backend functions
+
+        int verbosity = 0;
+
+        int maxit = 200;
+        double tolerance = 1e-2;
+
+        int N;           // number of rows
+        int Nb;          // number of blocked rows (Nb*block_size == N)
+        int nnz;         // number of nonzeroes (scalars)
+        int nnzb;        // number of nonzero blocks (nnzb*block_size*block_size == nnz)
+        int block_size;  // size of block
+
+        bool initialized = false;
+
+    public:
+
+        enum class BdaSolverStatus {
+            BDA_SOLVER_SUCCESS,
+            BDA_SOLVER_ANALYSIS_FAILED,
+            BDA_SOLVER_CREATE_PRECONDITIONER_FAILED,
+            BDA_SOLVER_UNKNOWN_ERROR
+        };
+
+        BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_) {};
+
+        /// Define virtual destructor, so that the derivedclass destructor will be called
+        virtual ~BdaSolver() {};
+
+        /// Define as pure virtual functions, so derivedclass must implement them
+        virtual BdaSolverStatus solve_system(int N, int nnz, int dim,
+            double *vals, int *rows, int *cols,
+            double *b, WellContributions& wellContribs, BdaResult &res) = 0;
+
+        virtual void get_result(double *x) = 0;
+
+        /// Different implementations of BdaSolver can use this function for timing
+        static double second(void) {
+            struct timeval tv;
+            gettimeofday(&tv, nullptr);
+            return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
+        }
+
+    }; // end class BdaSolver
+
+} // end namespace bda
+
+#endif
--- a/opm/simulators/linalg/bda/BlockedMatrix.cpp
+++ b/opm/simulators/linalg/bda/BlockedMatrix.cpp
@ -0,0 +1,156 @@
+/*
+  Copyright 2019 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <cstdio>
+#include <cstring>
+#include <sys/time.h>
+#include <cmath>
+#include <iostream>
+
+#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
+
+using bda::BlockedMatrix;
+
+namespace bda
+{
+
+BlockedMatrix *allocateBlockedMatrix(int Nb, int nnzbs) {
+    BlockedMatrix *mat = new BlockedMatrix();
+
+    mat->nnzValues  = new Block[nnzbs];
+    mat->colIndices = new int[nnzbs];
+    mat->rowPointers = new int[Nb + 1];
+    mat->Nb = Nb;
+    mat->nnzbs = nnzbs;
+
+    return mat;
+}
+
+void freeBlockedMatrix(BlockedMatrix **mat) {
+    if (*mat) {
+        delete[] (*mat)->nnzValues;
+        delete[] (*mat)->colIndices;
+        delete[] (*mat)->rowPointers;
+        delete (*mat);
+        *mat = NULL;
+    }
+}
+
+BlockedMatrix *soft_copyBlockedMatrix(BlockedMatrix *mat) {
+    BlockedMatrix *res = new BlockedMatrix();
+    res->nnzValues = mat->nnzValues;
+    res->colIndices = mat->colIndices;
+    res->rowPointers = mat->rowPointers;
+    res->Nb = mat->Nb;
+    res->nnzbs = mat->nnzbs;
+    return res;
+}
+
+
+/*Sort a row of matrix elements from a blocked CSR-format.*/
+
+void sortBlockedRow(int *colIndices, Block *data, int left, int right) {
+    int l = left;
+    int r = right;
+    int middle = colIndices[(l + r) >> 1];
+    double lDatum[BLOCK_SIZE * BLOCK_SIZE];
+    do {
+        while (colIndices[l] < middle)
+            l++;
+        while (colIndices[r] > middle)
+            r--;
+        if (l <= r) {
+            int lColIndex = colIndices[l];
+            colIndices[l] = colIndices[r];
+            colIndices[r] = lColIndex;
+            memcpy(lDatum, data + l, sizeof(double) * BLOCK_SIZE * BLOCK_SIZE);
+            memcpy(data + l, data + r, sizeof(double) * BLOCK_SIZE * BLOCK_SIZE);
+            memcpy(data + r, lDatum, sizeof(double) * BLOCK_SIZE * BLOCK_SIZE);
+
+            l++;
+            r--;
+        }
+    } while (l < r);
+
+    if (left < r)
+        sortBlockedRow(colIndices, data, left, r);
+
+    if (right > l)
+        sortBlockedRow(colIndices, data, l, right);
+}
+
+
+// LUMat->nnzValues[ik] = LUMat->nnzValues[ik] - (pivot * LUMat->nnzValues[jk]) in ilu decomposition
+// a = a - (b * c)
+void blockMultSub(Block a, Block b, Block c)
+{
+    for (int row = 0; row < BLOCK_SIZE; row++) {
+        for (int col = 0; col < BLOCK_SIZE; col++) {
+            double temp = 0.0;
+            for (int k = 0; k < BLOCK_SIZE; k++) {
+                temp += b[BLOCK_SIZE * row + k] * c[BLOCK_SIZE * k + col];
+            }
+            a[BLOCK_SIZE * row + col] -= temp;
+        }
+    }
+}
+
+/*Perform a 3x3 matrix-matrix multiplicationj on two blocks*/
+
+void blockMult(Block mat1, Block mat2, Block resMat) {
+    for (int row = 0; row < BLOCK_SIZE; row++) {
+        for (int col = 0; col < BLOCK_SIZE; col++) {
+            double temp = 0;
+            for (int k = 0; k < BLOCK_SIZE; k++) {
+                temp += mat1[BLOCK_SIZE * row + k] * mat2[BLOCK_SIZE * k + col];
+            }
+            resMat[BLOCK_SIZE * row + col] = temp;
+        }
+    }
+}
+
+
+/* Calculate the inverse of a block. This function is specific for only 3x3 block size.*/
+
+void blockInvert3x3(Block mat, Block res) {
+    // code generated by maple, copied from DUNE
+    double t4  = mat[0] * mat[4];
+    double t6  = mat[0] * mat[5];
+    double t8  = mat[1] * mat[3];
+    double t10 = mat[2] * mat[3];
+    double t12 = mat[1] * mat[6];
+    double t14 = mat[2] * mat[6];
+
+    double det = (t4 * mat[8] - t6 * mat[7] - t8 * mat[8] +
+                  t10 * mat[7] + t12 * mat[5] - t14 * mat[4]);
+    double t17 = 1.0 / det;
+
+    res[0] =  (mat[4] * mat[8] - mat[5] * mat[7]) * t17;
+    res[1] = -(mat[1] * mat[8] - mat[2] * mat[7]) * t17;
+    res[2] =  (mat[1] * mat[5] - mat[2] * mat[4]) * t17;
+    res[3] = -(mat[3] * mat[8] - mat[5] * mat[6]) * t17;
+    res[4] =  (mat[0] * mat[8] - t14) * t17;
+    res[5] = -(t6 - t10) * t17;
+    res[6] =  (mat[3] * mat[7] - mat[4] * mat[6]) * t17;
+    res[7] = -(mat[0] * mat[7] - t12) * t17;
+    res[8] =  (t4 - t8) * t17;
+}
+
+
+} // end namespace bda
--- a/opm/simulators/linalg/bda/BlockedMatrix.hpp
+++ b/opm/simulators/linalg/bda/BlockedMatrix.hpp
@ -0,0 +1,80 @@
+/*
+  Copyright 2019 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef BLOCKED_MATRIX_HPP
+#define BLOCKED_MATRIX_HPP
+
+#define BLOCK_SIZE 3
+
+typedef double Block[9];
+
+namespace bda
+{
+
+typedef struct {
+    Block *nnzValues;
+    int *colIndices;
+    int *rowPointers;
+    int Nb;
+    int nnzbs;
+} BlockedMatrix;
+
+/// Allocate BlockedMatrix and data arrays with given sizes
+/// \param[in] Nb               number of blockrows
+/// \param[in] nnzbs            number of nonzero blocks
+/// \return pointer to BlockedMatrix
+BlockedMatrix *allocateBlockedMatrix(int Nb, int nnzbs);
+
+/// Allocate BlockedMatrix, but copy data pointers instead of allocating new memory
+/// \param[in] mat              matrix to be copied
+/// \return pointer to BlockedMatrix
+BlockedMatrix *soft_copyBlockedMatrix(BlockedMatrix *mat);
+
+/// Free BlockedMatrix and its data
+/// \param[in] mat              matrix to be free'd
+void freeBlockedMatrix(BlockedMatrix **mat);
+
+/// Sort a row of matrix elements from a blocked CSR-format
+/// \param[inout] colIndices     
+/// \param[inout] data           
+/// \param[in] left              lower index of data of row
+/// \param[in] right             upper index of data of row
+void sortBlockedRow(int *colIndices, Block *data, int left, int right);
+
+/// Multiply and subtract blocks
+/// a = a - (b * c)
+/// \param[inout] a              block to be subtracted from
+/// \param[in] b                 input block
+/// \param[in] c                 input block
+void blockMultSub(Block a, Block b, Block c);
+
+/// Perform a 3x3 matrix-matrix multiplication on two blocks
+/// \param[in] mat1              input block 1
+/// \param[in] mat2              input block 2
+/// \param[inout] resMat         output block
+void blockMult(Block mat1, Block mat2, Block resMat);
+
+/// Calculate the inverse of a block. This function is specific for only 3x3 block size.
+/// \param[in] mat               input block
+/// \param[inout] res            output block
+void blockInvert3x3(Block mat, Block res);
+
+} // end namespace bda
+
+#endif
--- a/opm/simulators/linalg/bda/MultisegmentWellContribution.cpp
+++ b/opm/simulators/linalg/bda/MultisegmentWellContribution.cpp
@ -30,94 +30,108 @@
 namespace Opm
 {

-        MultisegmentWellContribution::MultisegmentWellContribution(unsigned int dim_, unsigned int dim_wells_,
-            unsigned int Nb_, unsigned int Mb_,
-            unsigned int BnumBlocks_, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
-            unsigned int DnumBlocks_, double *Dvalues, int *DcolPointers, int *DrowIndices,
-            std::vector<double> &Cvalues)
-        :
-            dim(dim_),                // size of blockvectors in vectors x and y, equal to MultisegmentWell::numEq
-            dim_wells(dim_wells_),    // size of blocks in C, B and D, equal to MultisegmentWell::numWellEq
-            N(Nb_*dim),               // number of rows in vectors x and y, N == dim*Nb
-            Nb(Nb_),                  // number of blockrows in x and y
-            M(Mb_*dim_wells),         // number of rows, M == dim_wells*Mb
-            Mb(Mb_),                  // number of blockrows in C, D and B
-            DnumBlocks(DnumBlocks_),  // number of blocks in D
-            BnumBlocks(BnumBlocks_),  // number of blocks in C and B
-            // copy data for matrix D into vectors to prevent it going out of scope
-            Dvals(Dvalues, Dvalues + DnumBlocks*dim_wells*dim_wells),
-            Dcols(DcolPointers, DcolPointers + M + 1),
-            Drows(DrowIndices, DrowIndices + DnumBlocks*dim_wells*dim_wells)
-        {
-            Cvals = std::move(Cvalues);
-            Bvals = std::move(Bvalues);
-            Bcols = std::move(BcolIndices);
-            Brows = std::move(BrowPointers);
+MultisegmentWellContribution::MultisegmentWellContribution(unsigned int dim_, unsigned int dim_wells_,
+        unsigned int Nb_, unsigned int Mb_,
+        unsigned int BnumBlocks_, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
+        unsigned int DnumBlocks_, double *Dvalues, int *DcolPointers, int *DrowIndices,
+        std::vector<double> &Cvalues)
+    :
+    dim(dim_),                // size of blockvectors in vectors x and y, equal to MultisegmentWell::numEq
+    dim_wells(dim_wells_),    // size of blocks in C, B and D, equal to MultisegmentWell::numWellEq
+    N(Nb_ * dim),             // number of rows in vectors x and y, N == dim*Nb
+    Nb(Nb_),                  // number of blockrows in x and y
+    M(Mb_ * dim_wells),       // number of rows, M == dim_wells*Mb
+    Mb(Mb_),                  // number of blockrows in C, D and B
+    DnumBlocks(DnumBlocks_),  // number of blocks in D
+    BnumBlocks(BnumBlocks_),  // number of blocks in C and B
+    // copy data for matrix D into vectors to prevent it going out of scope
+    Dvals(Dvalues, Dvalues + DnumBlocks * dim_wells * dim_wells),
+    Dcols(DcolPointers, DcolPointers + M + 1),
+    Drows(DrowIndices, DrowIndices + DnumBlocks * dim_wells * dim_wells)
+{
+    Cvals = std::move(Cvalues);
+    Bvals = std::move(Bvalues);
+    Bcols = std::move(BcolIndices);
+    Brows = std::move(BrowPointers);

-            z1.resize(Mb * dim_wells);
-            z2.resize(Mb * dim_wells);
+    z1.resize(Mb * dim_wells);
+    z2.resize(Mb * dim_wells);

-            umfpack_di_symbolic(M, M, Dcols.data(), Drows.data(), Dvals.data(), &UMFPACK_Symbolic, nullptr, nullptr);
-            umfpack_di_numeric(Dcols.data(), Drows.data(), Dvals.data(), UMFPACK_Symbolic, &UMFPACK_Numeric, nullptr, nullptr);
-        }
+    umfpack_di_symbolic(M, M, Dcols.data(), Drows.data(), Dvals.data(), &UMFPACK_Symbolic, nullptr, nullptr);
+    umfpack_di_numeric(Dcols.data(), Drows.data(), Dvals.data(), UMFPACK_Symbolic, &UMFPACK_Numeric, nullptr, nullptr);
+}

-        MultisegmentWellContribution::~MultisegmentWellContribution()
-        {
-            umfpack_di_free_symbolic(&UMFPACK_Symbolic);
-            umfpack_di_free_numeric(&UMFPACK_Numeric);
-        }
+MultisegmentWellContribution::~MultisegmentWellContribution()
+{
+    umfpack_di_free_symbolic(&UMFPACK_Symbolic);
+    umfpack_di_free_numeric(&UMFPACK_Numeric);
+}


-        // Apply the MultisegmentWellContribution, similar to MultisegmentWell::apply()
-        // h_x and h_y reside on host
-        // y -= (C^T * (D^-1 * (B * x)))
-        void MultisegmentWellContribution::apply(double *h_x, double *h_y)
-        {
-            // reset z1 and z2
-            std::fill(z1.begin(), z1.end(), 0.0);
-            std::fill(z2.begin(), z2.end(), 0.0);
+// Apply the MultisegmentWellContribution, similar to MultisegmentWell::apply()
+// h_x and h_y reside on host
+// y -= (C^T * (D^-1 * (B * x)))
+void MultisegmentWellContribution::apply(double *h_x, double *h_y)
+{
+    // reset z1 and z2
+    std::fill(z1.begin(), z1.end(), 0.0);
+    std::fill(z2.begin(), z2.end(), 0.0);

-            // z1 = B * x
-            for (unsigned int row = 0; row < Mb; ++row) {
-                // for every block in the row
-                for (unsigned int blockID = Brows[row]; blockID < Brows[row+1]; ++blockID) {
-                    unsigned int colIdx = Bcols[blockID];
-                    for (unsigned int j = 0; j < dim_wells; ++j) {
-                        double temp = 0.0;
-                        for (unsigned int k = 0; k < dim; ++k) {
-                            temp += Bvals[blockID * dim * dim_wells + j * dim + k] * h_x[colIdx * dim + k];
-                        }
-                        z1[row * dim_wells + j] += temp;
-                    }
-                }
-            }
-
-            // z2 = D^-1 * (B * x)
-            // umfpack
-            umfpack_di_solve(UMFPACK_A, Dcols.data(), Drows.data(), Dvals.data(), z2.data(), z1.data(), UMFPACK_Numeric, nullptr, nullptr);
-
-            // y -= (C^T * z2)
-            // y -= (C^T * (D^-1 * (B * x)))
-            for (unsigned int row = 0; row < Mb; ++row) {
-                // for every block in the row
-                for (unsigned int blockID = Brows[row]; blockID < Brows[row+1]; ++blockID) {
-                    unsigned int colIdx = Bcols[blockID];
-                    for (unsigned int j = 0; j < dim; ++j) {
-                        double temp = 0.0;
-                        for (unsigned int k = 0; k < dim_wells; ++k) {
-                            temp += Cvals[blockID * dim * dim_wells + j + k * dim] * z2[row * dim_wells + k];
-                        }
-                        h_y[colIdx * dim + j] -= temp;
-                    }
+    // z1 = B * x
+    for (unsigned int row = 0; row < Mb; ++row) {
+        // for every block in the row
+        for (unsigned int blockID = Brows[row]; blockID < Brows[row + 1]; ++blockID) {
+            unsigned int colIdx = getColIdx(Bcols[blockID]);
+            for (unsigned int j = 0; j < dim_wells; ++j) {
+                double temp = 0.0;
+                for (unsigned int k = 0; k < dim; ++k) {
+                    temp += Bvals[blockID * dim * dim_wells + j * dim + k] * h_x[colIdx * dim + k];
                }
+                z1[row * dim_wells + j] += temp;
            }
        }
+    }

-        void MultisegmentWellContribution::setCudaStream(cudaStream_t stream_)
-        {
-            stream = stream_;
+    // z2 = D^-1 * (B * x)
+    // umfpack
+    umfpack_di_solve(UMFPACK_A, Dcols.data(), Drows.data(), Dvals.data(), z2.data(), z1.data(), UMFPACK_Numeric, nullptr, nullptr);
+
+    // y -= (C^T * z2)
+    // y -= (C^T * (D^-1 * (B * x)))
+    for (unsigned int row = 0; row < Mb; ++row) {
+        // for every block in the row
+        for (unsigned int blockID = Brows[row]; blockID < Brows[row + 1]; ++blockID) {
+            unsigned int colIdx = getColIdx(Bcols[blockID]);
+            for (unsigned int j = 0; j < dim; ++j) {
+                double temp = 0.0;
+                for (unsigned int k = 0; k < dim_wells; ++k) {
+                    temp += Cvals[blockID * dim * dim_wells + j + k * dim] * z2[row * dim_wells + k];
+                }
+                h_y[colIdx * dim + j] -= temp;
+            }
        }
+    }
+}

+void MultisegmentWellContribution::setCudaStream(cudaStream_t stream_)
+{
+    stream = stream_;
+}
+
+unsigned int MultisegmentWellContribution::getColIdx(unsigned int idx)
+{
+    if (reorder) {
+        return toOrder[idx];
+    } else {
+        return idx;
+    }
+}
+
+void MultisegmentWellContribution::setReordering(int *toOrder, bool reorder)
+{
+    this->toOrder = toOrder;
+    this->reorder = reorder;
+}

 } //namespace Opm

--- a/opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
+++ b/opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
@ -24,85 +24,101 @@

 #include <vector>

+#if HAVE_CUDA
 #include <cuda_runtime.h>
+#endif

 namespace Opm
 {

-    /// This class serves to duplicate the functionality of the MultisegmentWell
-    /// A MultisegmentWell uses C, D and B and performs y -= (C^T * (D^-1 * (B*x)))
-    /// B and C are matrices, with M rows and N columns, where N is the size of the matrix. They contain blocks of MultisegmentWell::numEq by MultisegmentWell::numWellEq.
-    /// D is a MxM matrix, the square blocks have size MultisegmentWell::numWellEq. 
-    /// B*x and D*B*x are a vector with M*numWellEq doubles
-    /// C*D*B*x is a vector with N*numEq doubles.
-    
-    class MultisegmentWellContribution
-    {
+/// This class serves to duplicate the functionality of the MultisegmentWell
+/// A MultisegmentWell uses C, D and B and performs y -= (C^T * (D^-1 * (B*x)))
+/// B and C are matrices, with M rows and N columns, where N is the size of the matrix. They contain blocks of MultisegmentWell::numEq by MultisegmentWell::numWellEq.
+/// D is a MxM matrix, the square blocks have size MultisegmentWell::numWellEq.
+/// B*x and D*B*x are a vector with M*numWellEq doubles
+/// C*D*B*x is a vector with N*numEq doubles.

-    private:
-        unsigned int dim;                        // size of blockvectors in vectors x and y, equal to MultisegmentWell::numEq
-        unsigned int dim_wells;                  // size of blocks in C, B and D, equal to MultisegmentWell::numWellEq
-        unsigned int N;                          // number of rows in vectors x and y, N == dim*Nb
-        unsigned int Nb;                         // number of blockrows in x and y
-        unsigned int M;                          // number of rows, M == dim_wells*Mb
-        unsigned int Mb;                         // number of blockrows in C, D and B
+class MultisegmentWellContribution
+{

-        cudaStream_t stream; // not actually used yet, will be when MultisegmentWellContribution are applied on GPU
+private:
+    unsigned int dim;                        // size of blockvectors in vectors x and y, equal to MultisegmentWell::numEq
+    unsigned int dim_wells;                  // size of blocks in C, B and D, equal to MultisegmentWell::numWellEq
+    unsigned int N;                          // number of rows in vectors x and y, N == dim*Nb
+    unsigned int Nb;                         // number of blockrows in x and y
+    unsigned int M;                          // number of rows, M == dim_wells*Mb
+    unsigned int Mb;                         // number of blockrows in C, D and B

-        // C and B are stored in BCRS format, D is stored in CSC format (Dune::UMFPack)
-        // Sparsity pattern for C is not stored, since it is the same as B
-        unsigned int DnumBlocks;             // number of blocks in D
-        unsigned int BnumBlocks;             // number of blocks in C and B
-        std::vector<double> Cvals;
-        std::vector<double> Dvals;
-        std::vector<double> Bvals;
-        std::vector<int> Dcols;              // Columnpointers, contains M+1 entries
-        std::vector<unsigned int> Bcols;
-        std::vector<int> Drows;              // Rowindicies, contains DnumBlocks*dim*dim_wells entries
-        std::vector<unsigned int> Brows;
-        std::vector<double> z1;          // z1 = B * x
-        std::vector<double> z2;          // z2 = D^-1 * B * x
-        void *UMFPACK_Symbolic, *UMFPACK_Numeric;
+#if HAVE_CUDA
+    cudaStream_t stream; // not actually used yet, will be when MultisegmentWellContribution are applied on GPU
+#endif

+    // C and B are stored in BCRS format, D is stored in CSC format (Dune::UMFPack)
+    // Sparsity pattern for C is not stored, since it is the same as B
+    unsigned int DnumBlocks;             // number of blocks in D
+    unsigned int BnumBlocks;             // number of blocks in C and B
+    std::vector<double> Cvals;
+    std::vector<double> Dvals;
+    std::vector<double> Bvals;
+    std::vector<int> Dcols;              // Columnpointers, contains M+1 entries
+    std::vector<unsigned int> Bcols;
+    std::vector<int> Drows;              // Rowindicies, contains DnumBlocks*dim*dim_wells entries
+    std::vector<unsigned int> Brows;
+    std::vector<double> z1;          // z1 = B * x
+    std::vector<double> z2;          // z2 = D^-1 * B * x
+    void *UMFPACK_Symbolic, *UMFPACK_Numeric;

-    public:
+    int *toOrder = nullptr;
+    bool reorder = false;

-        /// Set a cudaStream to be used
-        /// \param[in] stream           the cudaStream that is used
-        void setCudaStream(cudaStream_t stream);
+    /// Translate the columnIndex if needed
+    /// Some preconditioners reorder the rows of the matrix, this means the columnIndices of the wellcontributions need to be reordered as well
+    unsigned int getColIdx(unsigned int idx);

-        /// Create a new MultisegmentWellContribution
-        /// Matrices C and B are passed in Blocked CSR, matrix D in CSC
-        /// The variables representing C, B and D will go out of scope when MultisegmentWell::addWellContribution() ends
-        /// \param[in] dim              size of blocks in blockvectors x and y, equal to MultisegmentWell::numEq
-        /// \param[in] dim_wells        size of blocks of C, B and D, equal to MultisegmentWell::numWellEq
-        /// \param[in] Nb               number of blocks in vectors x and y
-        /// \param[in] Mb               number of blockrows in C, B and D
-        /// \param[in] BnumBlocks       number of blocks in C and B
-        /// \param[in] Bvalues          nonzero values of matrix B
-        /// \param[in] BcolIndices      columnindices of blocks of matrix B
-        /// \param[in] BrowPointers     rowpointers of matrix B
-        /// \param[in] DnumBlocks       number of blocks in D
-        /// \param[in] Dvalues          nonzero values of matrix D
-        /// \param[in] DcolPointers     columnpointers of matrix D
-        /// \param[in] DrowIndices      rowindices of matrix D
-        /// \param[in] Cvalues          nonzero values of matrix C
-        MultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
-            unsigned int Nb, unsigned int Mb,
-            unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
-            unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
-            std::vector<double> &Cvalues);
+public:

-        /// Destroy a MultisegmentWellContribution, and free memory
-        ~MultisegmentWellContribution();
+#if HAVE_CUDA
+    /// Set a cudaStream to be used
+    /// \param[in] stream           the cudaStream that is used
+    void setCudaStream(cudaStream_t stream);
+#endif

-        /// Apply the MultisegmentWellContribution on CPU
-        /// performs y -= (C^T * (D^-1 * (B*x))) for MultisegmentWell
-        /// \param[in] h_x          vector x, must be on CPU
-        /// \param[inout] h_y       vector y, must be on CPU
-        void apply(double *h_x, double *h_y);
+    /// Create a new MultisegmentWellContribution
+    /// Matrices C and B are passed in Blocked CSR, matrix D in CSC
+    /// The variables representing C, B and D will go out of scope when MultisegmentWell::addWellContribution() ends
+    /// \param[in] dim              size of blocks in blockvectors x and y, equal to MultisegmentWell::numEq
+    /// \param[in] dim_wells        size of blocks of C, B and D, equal to MultisegmentWell::numWellEq
+    /// \param[in] Nb               number of blocks in vectors x and y
+    /// \param[in] Mb               number of blockrows in C, B and D
+    /// \param[in] BnumBlocks       number of blocks in C and B
+    /// \param[in] Bvalues          nonzero values of matrix B
+    /// \param[in] BcolIndices      columnindices of blocks of matrix B
+    /// \param[in] BrowPointers     rowpointers of matrix B
+    /// \param[in] DnumBlocks       number of blocks in D
+    /// \param[in] Dvalues          nonzero values of matrix D
+    /// \param[in] DcolPointers     columnpointers of matrix D
+    /// \param[in] DrowIndices      rowindices of matrix D
+    /// \param[in] Cvalues          nonzero values of matrix C
+    MultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
+                                 unsigned int Nb, unsigned int Mb,
+                                 unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
+                                 unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
+                                 std::vector<double> &Cvalues);

-    };
+    /// Destroy a MultisegmentWellContribution, and free memory
+    ~MultisegmentWellContribution();
+
+    /// Apply the MultisegmentWellContribution on CPU
+    /// performs y -= (C^T * (D^-1 * (B*x))) for MultisegmentWell
+    /// \param[in] h_x          vector x, must be on CPU
+    /// \param[inout] h_y       vector y, must be on CPU
+    void apply(double *h_x, double *h_y);
+
+    /// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
+    /// Those indices need to be mapped via toOrder
+    /// \param[in] toOrder    array with mappings
+    void setReordering(int *toOrder, bool reorder);
+};

 } //namespace Opm

--- a/opm/simulators/linalg/bda/Reorder.cpp
+++ b/opm/simulators/linalg/bda/Reorder.cpp
@ -0,0 +1,343 @@
+/*
+  Copyright 2019 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <cstdio>
+#include <vector>
+#include <cstring>
+#include <algorithm> // for fill()
+#include <sys/time.h>
+
+#include <opm/simulators/linalg/bda/Reorder.hpp>
+#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
+
+namespace bda
+{
+
+
+/* Give every node in the matrix (of which only the sparsity pattern in the
+ * form of row pointers and column indices arrays are in the input), a color
+ * in the colors array. Also return the amount of colors in the return integer. */
+
+int colorBlockedNodes(int rows, const int *rowPointers, const int *colIndices, std::vector<int>& colors, int maxRowsPerColor, int maxColsPerColor)
+{
+    int left, c, t, i, j, k;
+    const int max_tries = 100;            // since coloring is random, it is possible that a coloring fails. In that case, try again.
+    std::vector<int> randoms;
+    randoms.reserve(rows);
+
+    std::vector<bool> visitedColumns;
+    std::fill(visitedColumns.begin(), visitedColumns.end(), false);
+    int colsInColor;
+    int additionalColsInRow;
+
+    for (t = 0; t < max_tries; t++) {
+        struct timeval tm;
+        gettimeofday(&tm, nullptr);
+        srand(tm.tv_sec + tm.tv_usec * 1000000ul);
+        for (i = 0; i  < rows; i++) {
+            randoms[i] = rand();
+            colors[i] = -1;
+        }
+
+        for (c = 0; c < MAX_COLORS; c++) {
+            int rowsInColor = 0;
+            colsInColor = 0;
+            for (i = 0; i < rows; i++)
+            {
+                char f = 1; // true iff you have max random
+
+                // ignore nodes colored earlier
+                if ((colors[i] != -1))
+                    continue;
+
+                int ir = randoms[i];
+
+                // look at neighbors to check their random number
+                for (k = rowPointers[i]; k < rowPointers[i + 1]; k++) {
+
+                    // ignore nodes colored earlier (and yourself)
+                    j = colIndices[k];
+                    int jc = colors[j];
+                    if (((jc != -1) && (jc != c)) || (i == j)) {
+                        continue;
+                    }
+                    // The if statement below makes it both true graph coloring and no longer guaranteed to converge
+                    if (jc == c) {
+                        f = 0;
+                        break;
+                    }
+                    int jr = randoms[j];
+                    if (ir <= jr) f = 0;
+                }
+
+                // assign color if you have the maximum random number
+                if (f == 1) {
+                    additionalColsInRow = 0;
+                    for (k = rowPointers[i]; k < rowPointers[i + 1]; k++) {
+                        j = colIndices[k];
+                        if (!visitedColumns[j]) {
+                            visitedColumns[j] = true;
+                            additionalColsInRow += 3;
+                        }
+                    }
+                    if ((colsInColor + additionalColsInRow) > maxColsPerColor) {
+                        break;
+                    }
+                    colsInColor += additionalColsInRow;
+                    colors[i] = c;
+                    rowsInColor += 3;
+                    if ((rowsInColor + 2) >= maxRowsPerColor) {
+                        break;
+                    }
+
+                }
+
+            }
+            // Check if graph coloring is done.
+            left = 0;
+            for (k = 0; k < rows; k++) {
+                if (colors[k] == -1) {
+                    left++;
+                }
+            }
+            if (left == 0) {
+                return c + 1;
+            }
+        }
+    }
+
+    printf("Could not find a graph coloring with %d colors after %d tries.\nAmount of colorless nodes: %d.\n", c, t, left);
+    return -1;
+}
+
+
+/* Reorder a matrix by a specified input order.
+ * Both a to order array, which contains for every node from the old matrix where it will move in the new matrix,
+ * and the from order, which contains for every node in the new matrix where it came from in the old matrix.*/
+
+void blocked_reorder_matrix_by_pattern(BlockedMatrix *mat, int *toOrder, int *fromOrder, BlockedMatrix *rMat) {
+    int rIndex = 0;
+    int i, j, k;
+
+    rMat->rowPointers[0] = 0;
+    for (i = 0; i < mat->Nb; i++) {
+        int thisRow = fromOrder[i];
+        // put thisRow from the old matrix into row i of the new matrix
+        rMat->rowPointers[i + 1] = rMat->rowPointers[i] + mat->rowPointers[thisRow + 1] - mat->rowPointers[thisRow];
+        for (k = mat->rowPointers[thisRow]; k < mat->rowPointers[thisRow + 1]; k++) {
+            for (j = 0; j < BLOCK_SIZE * BLOCK_SIZE; j++)
+                rMat->nnzValues[rIndex][j] = mat->nnzValues[k][j];
+            rMat->colIndices[rIndex] = mat->colIndices[k];
+            rIndex++;
+        }
+    }
+    // re-assign column indices according to the new positions of the nodes referenced by the column indices
+    for (i = 0; i < mat->nnzbs; i++) {
+        rMat->colIndices[i] = toOrder[rMat->colIndices[i]];
+    }
+    // re-sort the column indices of every row.
+    for (i = 0; i < mat->Nb; i++) {
+        sortBlockedRow(rMat->colIndices, rMat->nnzValues, rMat->rowPointers[i], rMat->rowPointers[i + 1] - 1);
+    }
+}
+
+/* Reorder a matrix according to the colors that every node of the matrix has received*/
+
+void colorsToReordering(int Nb, std::vector<int>& colors, int *toOrder, int *fromOrder, int *iters) {
+    int reordered = 0;
+    int i, c;
+    for (i = 0; i < MAX_COLORS; i++) {
+        iters[i] = 0;
+    }
+
+    // Find reordering patterns
+    for (c = 0; c < MAX_COLORS; c++) {
+        for (i = 0; i < Nb; i++) {
+            if (colors[i] == c) {
+                iters[c]++;
+                toOrder[i] = reordered;
+
+                fromOrder[reordered] = i;
+                reordered++;
+            }
+        }
+    }
+}
+
+// Reorder a matrix according to a reordering pattern
+
+void blocked_reorder_vector_by_pattern(int Nb, double *vector, int *fromOrder, double *rVector) {
+    for (int i = 0; i < Nb; i++) {
+        for (int j = 0; j < BLOCK_SIZE; j++) {
+            rVector[BLOCK_SIZE * i + j] = vector[BLOCK_SIZE * fromOrder[i] + j];
+        }
+    }
+}
+
+
+/* Check is operations on a node in the matrix can be started
+ * A node can only be started if all nodes that it depends on during sequential execution have already completed.*/
+
+bool canBeStarted(int rowIndex, int *rowPointers, int *colIndices, std::vector<bool>& doneRows) {
+    bool canStart = !doneRows[rowIndex];
+    int i, thisDependency;
+    if (canStart) {
+        for (i = rowPointers[rowIndex]; i < rowPointers[rowIndex + 1]; i++) {
+            thisDependency = colIndices[i];
+            // Only dependencies on rows that should execute before the current one are relevant
+            if (thisDependency >= rowIndex)
+                break;
+            // Check if dependency has been resolved
+            if (!doneRows[thisDependency]) {
+                return false;
+            }
+        }
+    }
+    return canStart;
+}
+
+/*
+ * The level scheduling of a non-symmetric, blocked matrix requires access to a CSC encoding and a CSR encoding of the same matrix.
+*/
+
+int *findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCColIndices, int *CSCRowPointers, int Nb, int *iters, int *toOrder, int* fromOrder) {
+    int activeRowIndex = 0, iterEnd, nextActiveRowIndex = 0;
+    int thisRow;
+    std::vector<bool> doneRows(Nb, false);
+    std::vector<int> rowsPerIter;
+    rowsPerIter.reserve(Nb);
+    int *resRowsPerIter;
+
+    std::vector <int> rowsToStart;
+
+    // find starting rows: rows that are independent from all rows that come before them.
+    for (thisRow = 0; thisRow < Nb; thisRow++) {
+        if (canBeStarted(thisRow, CSRRowPointers, CSCColIndices, doneRows)) {
+            fromOrder[nextActiveRowIndex] = thisRow;
+            toOrder[thisRow] = nextActiveRowIndex;
+            nextActiveRowIndex++;
+        }
+    }
+    // 'do' compute on all active rows
+    for (iterEnd = 0; iterEnd < nextActiveRowIndex; iterEnd++) {
+        doneRows[fromOrder[iterEnd]] = true;
+    }
+
+    rowsPerIter.emplace_back(nextActiveRowIndex - activeRowIndex);
+
+    while (iterEnd < Nb) {
+        // Go over all rows active from the last iteration, and check which of their neighbours can be activated this iteration
+        for (; activeRowIndex < iterEnd; activeRowIndex++) {
+            thisRow = fromOrder[activeRowIndex];
+
+            for (int i = CSCRowPointers[thisRow]; i < CSCRowPointers[thisRow + 1]; i++) {
+                int thatRow = CSCColIndices[i];
+
+                if (canBeStarted(thatRow, CSRRowPointers, CSRColIndices, doneRows)) {
+                    rowsToStart.emplace_back(thatRow);
+                }
+            }
+        }
+        // 'do' compute on all active rows
+        for (unsigned int i = 0; i < rowsToStart.size(); i++) {
+            thisRow = rowsToStart[i];
+            if (!doneRows[thisRow]) {
+                doneRows[thisRow] = true;
+                fromOrder[nextActiveRowIndex] = thisRow;
+                toOrder[thisRow] = nextActiveRowIndex;
+                nextActiveRowIndex++;
+            }
+        }
+        iterEnd = nextActiveRowIndex;
+        rowsPerIter.emplace_back(nextActiveRowIndex - activeRowIndex);
+    }
+    // Crop the rowsPerIter array to it minimum size.
+    int numColors = rowsPerIter.size();
+    resRowsPerIter = new int[numColors];
+    for (int i = 0; i < numColors; i++) {
+        resRowsPerIter[i] = rowsPerIter[i];
+    }
+
+    *iters = rowsPerIter.size();
+
+    return resRowsPerIter;
+}
+
+/* Perform the complete graph coloring algorithm on a matrix. Return an array with the amount of nodes per color.*/
+
+int* findGraphColoring(int *colIndices, int *rowPointers, int Nb, int maxRowsPerColor, int maxColsPerColor, int *numColors, int *toOrder, int* fromOrder) {
+    std::vector<int> rowColor;
+    rowColor.reserve(Nb);
+    int *rowsPerColor = new int[MAX_COLORS];
+
+    if (colorBlockedNodes(Nb, rowPointers, colIndices, rowColor, maxRowsPerColor, maxColsPerColor) == -1) {
+        return nullptr;
+    }
+
+    colorsToReordering(Nb, rowColor, toOrder, fromOrder, rowsPerColor);
+
+    *numColors = MAX_COLORS;
+    while (rowsPerColor[*numColors - 1] == 0) {
+        *numColors = *numColors - 1;
+    }
+
+    return rowsPerColor;
+}
+
+// based on the scipy package from python, scipy/sparse/sparsetools/csr.h on github
+// input : matrix A via Avals, Acols, Arows, Nb
+// output: matrix B via Bvals, Bcols, Brows
+// arrays for B must be preallocated
+void bcsr_to_bcsc(Block *Avals, int *Acols, int *Arows, Block *Bvals, int *Bcols, int *Brows, int Nb) {
+
+    int nnz = Arows[Nb];
+
+    // compute number of nnzs per column
+    std::fill(Brows, Brows + Nb, 0);
+
+    for (int n = 0; n < nnz; ++n) {
+        Brows[Acols[n]]++;
+    }
+
+    // cumsum the nnz per col to get Brows
+    for (int col = 0, cumsum = 0; col < Nb; ++col) {
+        int temp = Brows[col];
+        Brows[col] = cumsum;
+        cumsum += temp;
+    }
+    Brows[Nb] = nnz;
+
+    for (int row = 0; row < Nb; ++row) {
+        for (int j = Arows[row]; j < Arows[row + 1]; ++j) {
+            int col = Acols[j];
+            int dest = Brows[col];
+            Bcols[dest] = row;
+            memcpy(Bvals + dest, Avals + dest, sizeof(Block));
+            Brows[col]++;
+        }
+    }
+
+    for (int col = 0, last = 0; col <= Nb; ++col) {
+        int temp = Brows[col];
+        Brows[col] = last;
+        last = temp;
+    }
+}
+
+} //namespace bda
--- a/opm/simulators/linalg/bda/Reorder.hpp
+++ b/opm/simulators/linalg/bda/Reorder.hpp
@ -0,0 +1,116 @@
+/*
+  Copyright 2019 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef REORDER_HPP
+#define REORDER_HPP
+
+#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
+
+namespace bda
+{
+
+#define MAX_COLORS 256
+
+/// Give every node in the matrix a color so that no neighbouring nodes share a color
+/// The color array must be allocated already
+/// This function does graph coloring based on random numbers
+/// \param[in] rows            number of rows in the matrix
+/// \param[in] rowPointers     array of row pointers
+/// \param[in] colIndices      array of column indices
+/// \param[inout] colors       output array containing the number of the color that each row is assigned to
+/// \param[in] maxRowsPerColor the maximum number of rows that are allowed in one color (so: the maximum number of nodes per color)
+/// \param[in] maxColsPerColor the maximum number of columns that the rows in a color are allowed to share (so: the maximum number of nodes that the nodes in one color may be connected to)
+/// \return                    the number of colors needed for the coloring, or -1 if no reordering was found with the given restrictions
+int colorBlockedNodes(int rows, const int *rowPointers, const int *colIndices, std::vector<int>& colors, int maxRowsPerColor, int maxColsPerColor);
+
+/// Reorder the rows of the matrix according to the mapping in toOrder and fromOrder
+/// rMat must be allocated already
+/// \param[in] mat           matrix to be reordered
+/// \param[in] toOrder       reorder pattern that lists for each index in the original order, to which index in the new order it should be moved
+/// \param[in] fromOrder     reorder pattern that lists for each index in the new order, from which index in the original order it was moved
+/// \param[inout] rMat       reordered Matrix 
+void blocked_reorder_matrix_by_pattern(BlockedMatrix *mat, int *toOrder, int *fromOrder, BlockedMatrix *rMat);
+
+/// Compute reorder mapping from the color that each node has received
+/// The toOrder, fromOrder and iters arrays must be allocated already
+/// \param[in] Nb            number of blocks in the vector
+/// \param[in] colors        array containing the number of the color that each row is assigned to
+/// \param[inout] toOrder    reorder pattern that lists for each index in the original order, to which index in the new order it should be moved
+/// \param[inout] fromOrder  reorder pattern that lists for each index in the new order, from which index in the original order it was moved
+/// \param[inout] iters      array containing for each color the number of rows that it contains
+void colorsToReordering(int Nb, std::vector<int>& colors, int *toOrder, int *fromOrder, int *iters);
+
+/// Reorder a vector according to the mapping in toOrder and fromOrder
+/// The rVector array must be allocated already
+/// \param[in] Nb            number of blocks in the vector
+/// \param[in] vector        vector to be reordered
+/// \param[in] toOrder       reorder pattern that lists for each index in the original order, to which index in the new order it should be moved
+/// \param[in] fromOrder     reorder pattern that lists for each index in the new order, from which index in the original order it was moved
+/// \param[inout] rVector    reordered vector
+void blocked_reorder_vector_by_pattern(int Nb, double *vector, int *fromOrder, double *rVector);
+
+/// Determine whether all rows that a certain row depends on are done already
+/// \param[in] rowIndex      index of the row that needs to be checked for
+/// \param[in] rowPointers   row pointers of the matrix that the row is in
+/// \param[in] colIndices    column indices of the matrix that the row is in
+/// \param[in] doneRows      array that for each row lists whether it is done or not
+/// \return                  true iff all dependencies are done and if the result itself was not done yet
+bool canBeStarted(int rowIndex, int *rowPointers, int *colIndices, std::vector<bool>& doneRows);
+
+/// Find a level scheduling reordering for an input matrix
+/// The toOrder and fromOrder arrays must be allocated already
+/// \param[in] CSRColIndices  column indices array, obtained from storing the input matrix in the CSR format
+/// \param[in] CSRRowPointers row pointers array, obtained from storing the input matrix in the CSR format
+/// \param[in] CSCColIndices  row indices array, obtained from storing the input matrix in the CSC format
+/// \param[in] CSCRowPointers column pointers array, obtained from storing the input matrix in the CSC format
+/// \param[in] Nb             number of blockrows in the matrix
+/// \param[out] iters         a pointer to the number of colors needed for the level scheduling
+/// \param[inout] toOrder     the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved
+/// \param[inout] fromOrder   the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved
+/// \return                   a pointer to an array that contains for each color, the number of rows that that color contains
+int* findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCColIndices, int *CSCRowPointers, int Nb, int *iters, int *toOrder, int* fromOrder);
+
+/// Find a graph coloring reordering for an input matrix
+/// The toOrder and fromOrder arrays must be allocated already
+/// \param[in] colIndices      column indices of the input matrix
+/// \param[in] rowPointers     row pointers of the input matrix
+/// \param[in] Nb              number of blockrows in the matrix
+/// \param[in] maxRowsPerColor the maximum number of rows that are allowed in one color (so: the maximum number of nodes per color)
+/// \param[in] maxColsPerColor the maximum number of columns that the rows in a color are allowed to share (so: the maximum number of nodes that the nodes in one color may be connected to)
+/// \param[out] numColors      the number of colors used in the found graph coloring
+/// \param[inout] toOrder      the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved
+/// \param[inout] fromOrder    the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved
+/// \return                    a pointer to an array that contains for each color, the number of rows that that color contains
+int* findGraphColoring(int *colIndices, int *rowPointers, int Nb, int maxRowsPerColor, int maxColsPerColor, int *numColors, int *toOrder, int* fromOrder);
+
+/// Convert BCSR matrix to BCSC
+/// Arrays for output matrix B must be allocated already
+/// Based on the csr_tocsc() function from the scipy package from python, https://github.com/scipy/scipy/blob/master/scipy/sparse/sparsetools/csr.h
+/// \param[in] Avals          non-zero values of the BCSR matrix
+/// \param[in] Acols          column indices of the BCSR matrix
+/// \param[in] Arows          row pointers of the BCSR matrix
+/// \param[inout] Bvals       non-zero values of the result BCSC matrix
+/// \param[inout] Bcols       row indices of the result BCSC matrix
+/// \param[inout] Brows       column pointers of the result BCSC matrix
+/// \param[in] Nb             number of blockrows in the matrix
+void bcsr_to_bcsc(Block *Avals, int *Acols, int *Arows, Block *Bvals, int *Bcols, int *Brows, int Nb);
+
+}
+
+#endif
--- a/opm/simulators/linalg/bda/WellContributions.cu
+++ b/opm/simulators/linalg/bda/WellContributions.cu
@ -34,248 +34,302 @@
 namespace Opm
 {

-    // apply WellContributions using y -= C^T * (D^-1 * (B * x))
-    __global__ void apply_well_contributions(
-        const double * __restrict__ Cnnzs,
-        const double * __restrict__ Dnnzs,
-        const double * __restrict__ Bnnzs,
-        const int * __restrict__ Ccols,
-        const int * __restrict__ Bcols,
-        const double * __restrict__ x,
-        double * __restrict__ y,
-        const int dim,
-        const int dim_wells,
-        const unsigned int * __restrict__ val_pointers
-        )
-    {
-        const int idx_b = blockIdx.x;
-        const int idx_t = threadIdx.x;
-        const unsigned int val_size = val_pointers[idx_b+1] - val_pointers[idx_b];
+// apply WellContributions using y -= C^T * (D^-1 * (B * x))
+__global__ void apply_well_contributions(
+    const double * __restrict__ Cnnzs,
+    const double * __restrict__ Dnnzs,
+    const double * __restrict__ Bnnzs,
+    const int * __restrict__ Ccols,
+    const int * __restrict__ Bcols,
+    const double * __restrict__ x,
+    double * __restrict__ y,
+    const int dim,
+    const int dim_wells,
+    const unsigned int * __restrict__ val_pointers
+)
+{
+    const int idx_b = blockIdx.x;
+    const int idx_t = threadIdx.x;
+    const unsigned int val_size = val_pointers[idx_b + 1] - val_pointers[idx_b];

-        const int vals_per_block = dim * dim_wells;        // 12
-        const int num_active_threads = (32/vals_per_block)*vals_per_block; // 24
-        const int num_blocks_per_warp = 32/vals_per_block; // 2
-        const int lane = idx_t % 32;
-        const int c = lane % dim;                           // col in block
-        const int r = (lane / dim) % dim_wells;             // row in block
+    const int vals_per_block = dim * dim_wells;        // 12
+    const int num_active_threads = (32 / vals_per_block) * vals_per_block; // 24
+    const int num_blocks_per_warp = 32 / vals_per_block; // 2
+    const int lane = idx_t % 32;
+    const int c = lane % dim;                           // col in block
+    const int r = (lane / dim) % dim_wells;             // row in block

-        extern __shared__ double smem[];
-        double * __restrict__ z1 = smem;
-        double * __restrict__ z2 = z1 + dim_wells;
-
-        if(idx_t < dim_wells){
-            z1[idx_t] = 0.0;
-        }
-
-        __syncthreads();
-
-        // z1 = B * x
-        if(idx_t < num_active_threads){
-            // multiply all blocks with x
-            double temp = 0.0;
-            int b = idx_t / vals_per_block + val_pointers[idx_b];       // block id, val_size indicates number of blocks
-            while(b < val_size + val_pointers[idx_b]){
-                int colIdx = Bcols[b];
-                temp += Bnnzs[b * dim * dim_wells + r * dim + c] * x[colIdx * dim + c];
-                b += num_blocks_per_warp;
-            }
-
-            // merge all blocks into 1 dim*dim_wells block
-            // since NORNE has only 2 parallel blocks, do not use a loop
-            temp += __shfl_down_sync(0x00ffffff, temp, dim * dim_wells);
-
-            b = idx_t / vals_per_block + val_pointers[idx_b];
-
-            // merge all (dim) columns of 1 block, results in a single 1*dim_wells vector, which is used to multiply with invD
-            if(idx_t < vals_per_block){
-                // should be a loop as well, now only works for dim == 3
-                if(c == 0 || c == 2){temp += __shfl_down_sync(0x00000B6D, temp, 2);}  // add col 2 to col 0
-                if(c == 0 || c == 1){temp += __shfl_down_sync(0x000006DB, temp, 1);}  // add col 1 to col 0
-            }
-
-            // write 1*dim_wells vector to gmem, could be replaced with shfl broadcast to remove z1 altogether
-            if(c == 0 && idx_t < vals_per_block){
-                z1[r] = temp;
-            }
-        }
-
-        __syncthreads();
-
-        // z2 = D^-1 * B * x = D^-1 * z1
-        if(idx_t < dim_wells){
-            double temp = 0.0;
-            for(int c = 0; c < dim_wells; ++c){
-                temp += Dnnzs[idx_b * dim_wells * dim_wells + idx_t * dim_wells + c] * z1[c];
-            }
-            z2[idx_t] = temp;
-        }
-
-        __syncthreads();
-
-        // y -= C^T * D^-1 * B * x
-        // use dim * val_size threads, each block is assigned 'dim' threads
-        if(idx_t < dim * val_size){
-            double temp = 0.0;
-            int b = idx_t / dim + val_pointers[idx_b];
-            int cc = idx_t % dim;
-            int colIdx = Ccols[b];
-            for(unsigned int c = 0; c < dim_wells; ++c){
-                temp += Cnnzs[b * dim * dim_wells + c * dim + cc] * z2[c];
-            }
-            y[colIdx * dim + cc] -= temp;
-        }
+    extern __shared__ double smem[];
+    double * __restrict__ z1 = smem;
+    double * __restrict__ z2 = z1 + dim_wells;

+    if (idx_t < dim_wells) {
+        z1[idx_t] = 0.0;
    }

-    void WellContributions::alloc()
-    {
-        if (num_std_wells > 0) {
-            cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
-            cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells);
-            cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
-            cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
-            cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
-            val_pointers = new unsigned int[num_std_wells + 1];
-            cudaMalloc((void**)&d_val_pointers, sizeof(int) * (num_std_wells + 1));
-            cudaCheckLastError("apply_gpu malloc failed");
-            allocated = true;
+    __syncthreads();
+
+    // z1 = B * x
+    if (idx_t < num_active_threads) {
+        // multiply all blocks with x
+        double temp = 0.0;
+        int b = idx_t / vals_per_block + val_pointers[idx_b];       // block id, val_size indicates number of blocks
+        while (b < val_size + val_pointers[idx_b]) {
+            int colIdx = Bcols[b];
+            temp += Bnnzs[b * dim * dim_wells + r * dim + c] * x[colIdx * dim + c];
+            b += num_blocks_per_warp;
+        }
+
+        // merge all blocks into 1 dim*dim_wells block
+        // since NORNE has only 2 parallel blocks, do not use a loop
+        temp += __shfl_down_sync(0x00ffffff, temp, dim * dim_wells);
+
+        b = idx_t / vals_per_block + val_pointers[idx_b];
+
+        // merge all (dim) columns of 1 block, results in a single 1*dim_wells vector, which is used to multiply with invD
+        if (idx_t < vals_per_block) {
+            // should be a loop as well, now only works for dim == 3
+            if (c == 0 || c == 2) {temp += __shfl_down_sync(0x00000B6D, temp, 2);} // add col 2 to col 0
+            if (c == 0 || c == 1) {temp += __shfl_down_sync(0x000006DB, temp, 1);} // add col 1 to col 0
+        }
+
+        // write 1*dim_wells vector to gmem, could be replaced with shfl broadcast to remove z1 altogether
+        if (c == 0 && idx_t < vals_per_block) {
+            z1[r] = temp;
        }
    }

-    WellContributions::~WellContributions()
-    {
-        // free pinned memory for MultisegmentWellContributions
-        if (h_x) {
+    __syncthreads();
+
+    // z2 = D^-1 * B * x = D^-1 * z1
+    if (idx_t < dim_wells) {
+        double temp = 0.0;
+        for (int c = 0; c < dim_wells; ++c) {
+            temp += Dnnzs[idx_b * dim_wells * dim_wells + idx_t * dim_wells + c] * z1[c];
+        }
+        z2[idx_t] = temp;
+    }
+
+    __syncthreads();
+
+    // y -= C^T * D^-1 * B * x
+    // use dim * val_size threads, each block is assigned 'dim' threads
+    if (idx_t < dim * val_size) {
+        double temp = 0.0;
+        int b = idx_t / dim + val_pointers[idx_b];
+        int cc = idx_t % dim;
+        int colIdx = Ccols[b];
+        for (unsigned int c = 0; c < dim_wells; ++c) {
+            temp += Cnnzs[b * dim * dim_wells + c * dim + cc] * z2[c];
+        }
+        y[colIdx * dim + cc] -= temp;
+    }
+
+}
+
+void WellContributions::alloc()
+{
+    if (num_std_wells > 0) {
+        cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
+        cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells);
+        cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
+        cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
+        cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
+        val_pointers = new unsigned int[num_std_wells + 1];
+        cudaMalloc((void**)&d_val_pointers, sizeof(int) * (num_std_wells + 1));
+        cudaCheckLastError("apply_gpu malloc failed");
+        allocated = true;
+    }
+}
+
+WellContributions::~WellContributions()
+{
+    // free pinned memory for MultisegmentWellContributions
+    if (h_x) {
+        if (host_mem_cuda) {
            cudaFreeHost(h_x);
            cudaFreeHost(h_y);
-        }
-
-        // delete MultisegmentWellContributions
-        for (auto ms : multisegments) {
-            delete ms;
-        }
-        multisegments.clear();
-
-        // delete data for StandardWell
-        if (num_std_wells > 0) {
-            cudaFree(d_Cnnzs);
-            cudaFree(d_Dnnzs);
-            cudaFree(d_Bnnzs);
-            cudaFree(d_Ccols);
-            cudaFree(d_Bcols);
-            delete[] val_pointers;
-            cudaFree(d_val_pointers);
+        } else {
+            delete[] h_x;
+            delete[] h_y;
        }
    }

+    // delete MultisegmentWellContributions
+    for (auto ms : multisegments) {
+        delete ms;
+    }
+    multisegments.clear();

-    // Apply the WellContributions, similar to StandardWell::apply()
-    // y -= (C^T *(D^-1*(   B*x)))
-    void WellContributions::apply(double *d_x, double *d_y)
-    {
-        // apply MultisegmentWells
+    // delete data for StandardWell
+    if (num_std_wells > 0) {
+        cudaFree(d_Cnnzs);
+        cudaFree(d_Dnnzs);
+        cudaFree(d_Bnnzs);
+        cudaFree(d_Ccols);
+        cudaFree(d_Bcols);
+        delete[] val_pointers;
+        cudaFree(d_val_pointers);
+    }
+}

-        // make sure the stream is empty if timing measurements are done
+
+// Apply the WellContributions, similar to StandardWell::apply()
+// y -= (C^T *(D^-1*(   B*x)))
+void WellContributions::apply(double *d_x, double *d_y)
+{
+    // apply MultisegmentWells
+
+    // make sure the stream is empty if timing measurements are done
+    cudaStreamSynchronize(stream);
+
+    if (num_ms_wells > 0) {
+        // allocate pinned memory on host if not yet done
+        if (h_x == nullptr) {
+            cudaMallocHost(&h_x, sizeof(double) * N);
+            cudaMallocHost(&h_y, sizeof(double) * N);
+            host_mem_cuda = true;
+        }
+
+        // copy vectors x and y from GPU to CPU
+        cudaMemcpyAsync(h_x, d_x, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
+        cudaMemcpyAsync(h_y, d_y, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
        cudaStreamSynchronize(stream);

-        if (num_ms_wells > 0) {
-            // allocate pinned memory on host if not yet done
-            if (h_x == nullptr) {
-                cudaMallocHost(&h_x, sizeof(double) * N);
-                cudaMallocHost(&h_y, sizeof(double) * N);
-            }
-
-
-            // copy vectors x and y from GPU to CPU
-            cudaMemcpyAsync(h_x, d_x, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
-            cudaMemcpyAsync(h_y, d_y, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
-            cudaStreamSynchronize(stream);
-
-            // actually apply MultisegmentWells
-            for (MultisegmentWellContribution *well : multisegments) {
-                well->apply(h_x, h_y);
-            }
-
-            // copy vector y from CPU to GPU
-            cudaMemcpyAsync(d_y, h_y, sizeof(double) * N, cudaMemcpyHostToDevice, stream);
-            cudaStreamSynchronize(stream);
+        // actually apply MultisegmentWells
+        for (MultisegmentWellContribution *well : multisegments) {
+            well->apply(h_x, h_y);
        }

-        // apply StandardWells
-        if (num_std_wells > 0) {
-            int smem_size = 2 * sizeof(double) * dim_wells;
-            apply_well_contributions<<<num_std_wells, 32, smem_size, stream>>>(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_x, d_y, dim, dim_wells, d_val_pointers);
+        // copy vector y from CPU to GPU
+        cudaMemcpyAsync(d_y, h_y, sizeof(double) * N, cudaMemcpyHostToDevice, stream);
+        cudaStreamSynchronize(stream);
+    }
+
+    // apply StandardWells
+    if (num_std_wells > 0) {
+        int smem_size = 2 * sizeof(double) * dim_wells;
+        apply_well_contributions <<< num_std_wells, 32, smem_size, stream>>>(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_x, d_y, dim, dim_wells, d_val_pointers);
+    }
+}
+
+#if HAVE_OPENCL
+void WellContributions::apply(cl::Buffer& d_x, cl::Buffer& d_y) {
+
+    // apply MultisegmentWells
+    if (num_ms_wells > 0) {
+        // allocate pinned memory on host if not yet done
+        if (h_x == nullptr) {
+            h_x = new double[N];
+            h_y = new double[N];
        }
+
+        // copy vectors x and y from GPU to CPU
+        queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, h_x);
+        queue->enqueueReadBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y);
+
+        // actually apply MultisegmentWells
+        for (MultisegmentWellContribution *well : multisegments) {
+            well->apply(h_x, h_y);
+        }
+
+        // copy vector y from CPU to GPU
+        queue->enqueueWriteBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y);
    }


-    void WellContributions::addMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size)
-    {
-        if (!allocated) {
-            OPM_THROW(std::logic_error,"Error cannot add wellcontribution before allocating memory in WellContributions");
-        }
+    // apply StandardWells
+    if (num_std_wells > 0) {
+        OPM_THROW(std::logic_error, "Error StandardWells are not supported by openclSolver");
+    }
+}
+#endif

-        switch (type) {
-        case MatrixType::C:
-            cudaMemcpy(d_Cnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
-            cudaMemcpy(d_Ccols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
-            break;
-        case MatrixType::D:
-            cudaMemcpy(d_Dnnzs + num_std_wells_so_far * dim_wells * dim_wells, values, sizeof(double) * dim_wells * dim_wells, cudaMemcpyHostToDevice);
-            break;
-        case MatrixType::B:
-            cudaMemcpy(d_Bnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
-            cudaMemcpy(d_Bcols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
-            val_pointers[num_std_wells_so_far] = num_blocks_so_far;
-            if(num_std_wells_so_far == num_std_wells - 1){
-                val_pointers[num_std_wells] = num_blocks;
-            }
-            cudaMemcpy(d_val_pointers, val_pointers, sizeof(int) * (num_std_wells+1), cudaMemcpyHostToDevice);
-            break;
-        default:
-            OPM_THROW(std::logic_error,"Error unsupported matrix ID for WellContributions::addMatrix()");
-        }
-        cudaCheckLastError("WellContributions::addMatrix() failed");
-        if (MatrixType::B == type) {
-            num_blocks_so_far += val_size;
-            num_std_wells_so_far++;
-        }
+
+void WellContributions::addMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size)
+{
+    if (!allocated) {
+        OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
    }

-    void WellContributions::setCudaStream(cudaStream_t stream_)
-    {
-        this->stream = stream_;
-        for(MultisegmentWellContribution *well : multisegments){
-            well->setCudaStream(stream_);
+    switch (type) {
+    case MatrixType::C:
+        cudaMemcpy(d_Cnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
+        cudaMemcpy(d_Ccols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
+        break;
+    case MatrixType::D:
+        cudaMemcpy(d_Dnnzs + num_std_wells_so_far * dim_wells * dim_wells, values, sizeof(double) * dim_wells * dim_wells, cudaMemcpyHostToDevice);
+        break;
+    case MatrixType::B:
+        cudaMemcpy(d_Bnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
+        cudaMemcpy(d_Bcols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
+        val_pointers[num_std_wells_so_far] = num_blocks_so_far;
+        if (num_std_wells_so_far == num_std_wells - 1) {
+            val_pointers[num_std_wells] = num_blocks;
        }
+        cudaMemcpy(d_val_pointers, val_pointers, sizeof(int) * (num_std_wells + 1), cudaMemcpyHostToDevice);
+        break;
+    default:
+        OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributions::addMatrix()");
    }
-
-    void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
-    {
-        dim = dim_;
-        dim_wells = dim_wells_;
+    cudaCheckLastError("WellContributions::addMatrix() failed");
+    if (MatrixType::B == type) {
+        num_blocks_so_far += val_size;
+        num_std_wells_so_far++;
    }
+}

-    void WellContributions::addNumBlocks(unsigned int nnz)
-    {
-        if (allocated) {
-            OPM_THROW(std::logic_error,"Error cannot add more sizes after allocated in WellContributions");
-        }
-        num_blocks += nnz;
-        num_std_wells++;
+void WellContributions::setCudaStream(cudaStream_t stream_)
+{
+    this->stream = stream_;
+    for (MultisegmentWellContribution *well : multisegments) {
+        well->setCudaStream(stream_);
    }
+}

-    void WellContributions::addMultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
+void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
+{
+    dim = dim_;
+    dim_wells = dim_wells_;
+}
+
+void WellContributions::addNumBlocks(unsigned int nnz)
+{
+    if (allocated) {
+        OPM_THROW(std::logic_error, "Error cannot add more sizes after allocated in WellContributions");
+    }
+    num_blocks += nnz;
+    num_std_wells++;
+}
+
+void WellContributions::addMultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
        unsigned int Nb, unsigned int Mb,
        unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
        unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
        std::vector<double> &Cvalues)
-    {
-        this->N = Nb * dim;
-        MultisegmentWellContribution *well = new MultisegmentWellContribution(dim, dim_wells, Nb, Mb, BnumBlocks, Bvalues, BcolIndices, BrowPointers, DnumBlocks, Dvalues, DcolPointers, DrowIndices, Cvalues);
-        multisegments.emplace_back(well);
-        ++num_ms_wells;
+{
+    this->N = Nb * dim;
+    MultisegmentWellContribution *well = new MultisegmentWellContribution(dim, dim_wells, Nb, Mb, BnumBlocks, Bvalues, BcolIndices, BrowPointers, DnumBlocks, Dvalues, DcolPointers, DrowIndices, Cvalues);
+    multisegments.emplace_back(well);
+    ++num_ms_wells;
+}
+
+
+void WellContributions::setReordering(int *toOrder, bool reorder)
+{
+    this->toOrder = toOrder;
+    this->reorder = reorder;
+    for (auto& ms : multisegments) {
+        ms->setReordering(toOrder, reorder);
    }
+}
+
+#if HAVE_OPENCL
+void WellContributions::setOpenCLQueue(cl::CommandQueue *queue)
+{
+    this->queue = queue;
+}
+#endif

 } //namespace Opm

--- a/opm/simulators/linalg/bda/WellContributions.hpp
+++ b/opm/simulators/linalg/bda/WellContributions.hpp
@ -22,137 +22,167 @@

 #include <config.h>

-#if ! HAVE_CUDA
-  #error "This file should only be included if CUDA is found"
+#if HAVE_CUDA
+#include <cuda_runtime.h>
+#endif
+
+#if HAVE_OPENCL
+#define __CL_ENABLE_EXCEPTIONS
+#include <CL/cl.hpp>
 #endif

 #include <vector>

 #include <opm/simulators/linalg/bda/MultisegmentWellContribution.hpp>

-#include <cuda_runtime.h>

 namespace Opm
 {

-    /// This class serves to eliminate the need to include the WellContributions into the matrix (with --matrix-add-well-contributions=true) for the cusparseSolver
-    /// If the --matrix-add-well-contributions commandline parameter is true, this class should not be used
-    /// So far, StandardWell and MultisegmentWell are supported
-    /// A single instance (or pointer) of this class is passed to the cusparseSolver.
-    /// For StandardWell, this class contains all the data and handles the computation. For MultisegmentWell, the vector 'multisegments' contains all the data. For more information, check the MultisegmentWellContribution class.
+/// This class serves to eliminate the need to include the WellContributions into the matrix (with --matrix-add-well-contributions=true) for the cusparseSolver
+/// If the --matrix-add-well-contributions commandline parameter is true, this class should not be used
+/// So far, StandardWell and MultisegmentWell are supported
+/// A single instance (or pointer) of this class is passed to the cusparseSolver.
+/// For StandardWell, this class contains all the data and handles the computation. For MultisegmentWell, the vector 'multisegments' contains all the data. For more information, check the MultisegmentWellContribution class.

-    /// A StandardWell uses C, D and B and performs y -= (C^T * (D^-1 * (B*x)))
-    /// B and C are vectors, disguised as matrices and contain blocks of StandardWell::numEq by StandardWell::numStaticWellEq
-    /// D is a block, disguised as matrix, the square block has size StandardWell::numStaticWellEq. D is actually stored as D^-1
-    /// B*x and D*B*x are a vector with numStaticWellEq doubles
-    /// C*D*B*x is a blocked matrix with a symmetric sparsity pattern, contains square blocks with size numEq. For every columnindex i, j in StandardWell::duneB_, there is a block on (i, j) in C*D*B*x.
-    ///
-    /// This class is used in 3 phases:
-    /// - get total size of all wellcontributions that must be stored here
-    /// - allocate memory
-    /// - copy data of wellcontributions
-    class WellContributions
-    {
+/// A StandardWell uses C, D and B and performs y -= (C^T * (D^-1 * (B*x)))
+/// B and C are vectors, disguised as matrices and contain blocks of StandardWell::numEq by StandardWell::numStaticWellEq
+/// D is a block, disguised as matrix, the square block has size StandardWell::numStaticWellEq. D is actually stored as D^-1
+/// B*x and D*B*x are a vector with numStaticWellEq doubles
+/// C*D*B*x is a blocked matrix with a symmetric sparsity pattern, contains square blocks with size numEq. For every columnindex i, j in StandardWell::duneB_, there is a block on (i, j) in C*D*B*x.
+///
+/// This class is used in 3 phases:
+/// - get total size of all wellcontributions that must be stored here
+/// - allocate memory
+/// - copy data of wellcontributions
+class WellContributions
+{

-    private:
-        unsigned int num_blocks = 0;             // total number of blocks in all wells
-        unsigned int dim;                        // number of columns in blocks in B and C, equal to StandardWell::numEq
-        unsigned int dim_wells;                  // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq
-        unsigned int num_std_wells = 0;          // number of StandardWells in this object
-        unsigned int num_ms_wells = 0;           // number of MultisegmentWells in this object, must equal multisegments.size()
-        unsigned int num_blocks_so_far = 0;      // keep track of where next data is written
-        unsigned int num_std_wells_so_far = 0;   // keep track of where next data is written
-        unsigned int *val_pointers = nullptr;    // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
-        unsigned int N;                          // number of rows (not blockrows) in vectors x and y
-        bool allocated = false;
-        std::vector<MultisegmentWellContribution*> multisegments;
-        cudaStream_t stream;
+private:
+    unsigned int num_blocks = 0;             // total number of blocks in all wells
+    unsigned int dim;                        // number of columns in blocks in B and C, equal to StandardWell::numEq
+    unsigned int dim_wells;                  // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq
+    unsigned int num_std_wells = 0;          // number of StandardWells in this object
+    unsigned int num_ms_wells = 0;           // number of MultisegmentWells in this object, must equal multisegments.size()
+    unsigned int num_blocks_so_far = 0;      // keep track of where next data is written
+    unsigned int num_std_wells_so_far = 0;   // keep track of where next data is written
+    unsigned int *val_pointers = nullptr;    // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
+    unsigned int N;                          // number of rows (not blockrows) in vectors x and y
+    bool allocated = false;
+    std::vector<MultisegmentWellContribution*> multisegments;

-        // data for StandardWells, could remain nullptrs if not used
-        double *d_Cnnzs = nullptr;
-        double *d_Dnnzs = nullptr;
-        double *d_Bnnzs = nullptr;
-        int *d_Ccols = nullptr;
-        int *d_Bcols = nullptr;
-        double *d_z1 = nullptr;
-        double *d_z2 = nullptr;
-        unsigned int *d_val_pointers = nullptr;
+#if HAVE_CUDA
+    cudaStream_t stream;
+#endif

-        double *h_x = nullptr, *h_y = nullptr;  // CUDA pinned memory for GPU memcpy
+#if HAVE_OPENCL
+    cl::CommandQueue *queue = nullptr;
+#endif

+    // data for StandardWells, could remain nullptrs if not used
+    double *d_Cnnzs = nullptr;
+    double *d_Dnnzs = nullptr;
+    double *d_Bnnzs = nullptr;
+    int *d_Ccols = nullptr;
+    int *d_Bcols = nullptr;
+    double *d_z1 = nullptr;
+    double *d_z2 = nullptr;
+    unsigned int *d_val_pointers = nullptr;

-    public:
+    double *h_x = nullptr, *h_y = nullptr;  // CUDA pinned memory for GPU memcpy
+    bool host_mem_cuda = false;             // true iff h_x and h_y are allocated by cudaMallocHost(), so they need to be freed using cudaFreeHost()

-        /// StandardWell has C, D and B matrices that need to be copied
-        enum class MatrixType {
-            C,
-            D,
-            B
-        };
+    int *toOrder = nullptr;
+    bool reorder = false;

-        /// Set a cudaStream to be used
-        /// \param[in] stream           the cudaStream that is used to launch the kernel in
-        void setCudaStream(cudaStream_t stream);
+public:

-        /// Create a new WellContributions, implementation is empty
-        WellContributions(){};
-
-        /// Destroy a WellContributions, and free memory
-        ~WellContributions();
-
-        /// Apply all Wells in this object
-        /// performs y -= (C^T * (D^-1 * (B*x))) for all Wells
-        /// \param[in] d_x        vector x, must be on GPU
-        /// \param[inout] d_y     vector y, must be on GPU
-        void apply(double *d_x, double *d_y);
-
-        /// Allocate memory for the StandardWells
-        void alloc();
-
-        /// Indicate how large the blocks of the StandardWell (C and B) are
-        /// \param[in] dim         number of columns
-        /// \param[in] dim_wells   number of rows
-        void setBlockSize(unsigned int dim, unsigned int dim_wells);
-
-        /// Indicate how large the next StandardWell is, this function cannot be called after alloc() is called
-        /// \param[in] numBlocks   number of blocks in C and B of next StandardWell
-        void addNumBlocks(unsigned int numBlocks);
-
-        /// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
-        /// \param[in] type        indicate if C, D or B is sent
-        /// \param[in] colIndices  columnindices of blocks in C or B, ignored for D
-        /// \param[in] values      array of nonzeroes
-        /// \param[in] val_size    number of blocks in C or B, ignored for D
-        void addMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size);
-
-        /// Add a MultisegmentWellContribution, actually creates an object on heap that is destroyed in the destructor
-        /// Matrices C and B are passed in Blocked CSR, matrix D in CSC
-        /// \param[in] dim              size of blocks in vectors x and y, equal to MultisegmentWell::numEq
-        /// \param[in] dim_wells        size of blocks of C, B and D, equal to MultisegmentWell::numWellEq
-        /// \param[in] Nb               number of blocks in vectors x and y
-        /// \param[in] Mb               number of blockrows in C, B and D
-        /// \param[in] BnumBlocks       number of blocks in C and B
-        /// \param[in] Bvalues          nonzero values of matrix B
-        /// \param[in] BcolIndices      columnindices of blocks of matrix B
-        /// \param[in] BrowPointers     rowpointers of matrix B
-        /// \param[in] DnumBlocks       number of blocks in D
-        /// \param[in] Dvalues          nonzero values of matrix D
-        /// \param[in] DcolPointers     columnpointers of matrix D
-        /// \param[in] DrowIndices      rowindices of matrix D
-        /// \param[in] Cvalues          nonzero values of matrix C
-        void addMultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
-            unsigned int Nb, unsigned int Mb,
-            unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
-            unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
-            std::vector<double> &Cvalues);
-
-        /// Return the number of wells added to this object
-        /// \return the number of wells added to this object
-        unsigned int getNumWells(){
-            return num_std_wells + num_ms_wells;
-        }
+    /// StandardWell has C, D and B matrices that need to be copied
+    enum class MatrixType {
+        C,
+        D,
+        B
    };

+    /// Set a cudaStream to be used
+    /// \param[in] stream           the cudaStream that is used to launch the kernel in
+    void setCudaStream(cudaStream_t stream);
+
+    /// Create a new WellContributions, implementation is empty
+    WellContributions() {};
+
+    /// Destroy a WellContributions, and free memory
+    ~WellContributions();
+
+    /// Apply all Wells in this object
+    /// performs y -= (C^T * (D^-1 * (B*x))) for all Wells
+    /// \param[in] d_x        vector x, must be on GPU
+    /// \param[inout] d_y     vector y, must be on GPU
+    void apply(double *d_x, double *d_y);
+#if HAVE_OPENCL
+    void apply(cl::Buffer& x, cl::Buffer& y);
+#endif
+
+    /// Allocate memory for the StandardWells
+    void alloc();
+
+    /// Indicate how large the blocks of the StandardWell (C and B) are
+    /// \param[in] dim         number of columns
+    /// \param[in] dim_wells   number of rows
+    void setBlockSize(unsigned int dim, unsigned int dim_wells);
+
+    /// Indicate how large the next StandardWell is, this function cannot be called after alloc() is called
+    /// \param[in] numBlocks   number of blocks in C and B of next StandardWell
+    void addNumBlocks(unsigned int numBlocks);
+
+    /// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
+    /// \param[in] type        indicate if C, D or B is sent
+    /// \param[in] colIndices  columnindices of blocks in C or B, ignored for D
+    /// \param[in] values      array of nonzeroes
+    /// \param[in] val_size    number of blocks in C or B, ignored for D
+    void addMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size);
+
+    /// Add a MultisegmentWellContribution, actually creates an object on heap that is destroyed in the destructor
+    /// Matrices C and B are passed in Blocked CSR, matrix D in CSC
+    /// \param[in] dim              size of blocks in vectors x and y, equal to MultisegmentWell::numEq
+    /// \param[in] dim_wells        size of blocks of C, B and D, equal to MultisegmentWell::numWellEq
+    /// \param[in] Nb               number of blocks in vectors x and y
+    /// \param[in] Mb               number of blockrows in C, B and D
+    /// \param[in] BnumBlocks       number of blocks in C and B
+    /// \param[in] Bvalues          nonzero values of matrix B
+    /// \param[in] BcolIndices      columnindices of blocks of matrix B
+    /// \param[in] BrowPointers     rowpointers of matrix B
+    /// \param[in] DnumBlocks       number of blocks in D
+    /// \param[in] Dvalues          nonzero values of matrix D
+    /// \param[in] DcolPointers     columnpointers of matrix D
+    /// \param[in] DrowIndices      rowindices of matrix D
+    /// \param[in] Cvalues          nonzero values of matrix C
+    void addMultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
+                                         unsigned int Nb, unsigned int Mb,
+                                         unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
+                                         unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
+                                         std::vector<double> &Cvalues);
+
+    /// Return the number of wells added to this object
+    /// \return the number of wells added to this object
+    unsigned int getNumWells() {
+        return num_std_wells + num_ms_wells;
+    }
+
+
+    /// If the rows of the matrix are reordered, the columnindices of the matrixdata are incorrect
+    /// Those indices need to be mapped via toOrder
+    /// \param[in] toOrder    array with mappings
+    /// \param[in] reorder    whether the columnindices need to be reordered or not
+    void setReordering(int *toOrder, bool reorder);
+
+#if HAVE_OPENCL
+    /// This object copies some cl::Buffers, it requires a CommandQueue to do that
+    /// \param[in] queue      the opencl commandqueue to be used
+    void setOpenCLQueue(cl::CommandQueue *queue);
+#endif
+};
+
 } //namespace Opm

 #endif
--- a/opm/simulators/linalg/bda/cusparseSolverBackend.cu
+++ b/opm/simulators/linalg/bda/cusparseSolverBackend.cu
@ -18,7 +18,7 @@
 */

 #ifndef __NVCC__
-    #error "Cannot compile for cusparse: NVIDIA compiler not found"
+#error "Cannot compile for cusparse: NVIDIA compiler not found"
 #endif

 #include <cstdio>
@ -42,488 +42,483 @@
 // otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
 #define COPY_ROW_BY_ROW 0

-namespace Opm
+namespace bda
 {

-    const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
-    const cusparseOperation_t operation  = CUSPARSE_OPERATION_NON_TRANSPOSE;
-    const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;
+using Opm::OpmLog;

-    double second(void) {
-        struct timeval tv;
-        gettimeofday(&tv, nullptr);
-        return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
+const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
+const cusparseOperation_t operation  = CUSPARSE_OPERATION_NON_TRANSPOSE;
+const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;
+
+cusparseSolverBackend::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_) : BdaSolver(verbosity_, maxit_, tolerance) {}
+
+cusparseSolverBackend::~cusparseSolverBackend() {
+    finalize();
+}
+
+void cusparseSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
+    double t_total1, t_total2;
+    int n = N;
+    double rho = 1.0, rhop;
+    double alpha, nalpha, beta;
+    double omega, nomega, tmp1, tmp2;
+    double norm, norm_0;
+    double zero = 0.0;
+    double one  = 1.0;
+    double mone = -1.0;
+    float it;
+
+    t_total1 = second();
+
+    if (wellContribs.getNumWells() > 0) {
+        wellContribs.setCudaStream(stream);
    }

-    cusparseSolverBackend::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_) : verbosity(verbosity_), maxit(maxit_), tolerance(tolerance_), minit(0) {
+    cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
+
+    cublasDscal(cublasHandle, n, &mone, d_r, 1);
+    cublasDaxpy(cublasHandle, n, &one, d_b, 1, d_r, 1);
+    cublasDcopy(cublasHandle, n, d_r, 1, d_rw, 1);
+    cublasDcopy(cublasHandle, n, d_r, 1, d_p, 1);
+    cublasDnrm2(cublasHandle, n, d_r, 1, &norm_0);
+
+    if (verbosity > 1) {
+        std::ostringstream out;
+        out << std::scientific << "cusparseSolver initial norm: " << norm_0;
+        OpmLog::info(out.str());
    }

-    cusparseSolverBackend::~cusparseSolverBackend() {
-        finalize();
-    }
+    for (it = 0.5; it < maxit; it += 0.5) {
+        rhop = rho;
+        cublasDdot(cublasHandle, n, d_rw, 1, d_r, 1, &rho);

-    void cusparseSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
-        double t_total1, t_total2;
-        int n = N;
-        double rho = 1.0, rhop;
-        double alpha, nalpha, beta;
-        double omega, nomega, tmp1, tmp2;
-        double norm, norm_0;
-        double zero = 0.0;
-        double one  = 1.0;
-        double mone = -1.0;
-        float it;
-
-        t_total1 = second();
-
-        if(wellContribs.getNumWells() > 0){
-            wellContribs.setCudaStream(stream);
+        if (it > 1) {
+            beta = (rho / rhop) * (alpha / omega);
+            nomega = -omega;
+            cublasDaxpy(cublasHandle, n, &nomega, d_v, 1, d_p, 1);
+            cublasDscal(cublasHandle, n, &beta, d_p, 1);
+            cublasDaxpy(cublasHandle, n, &one, d_r, 1, d_p, 1);
        }

-        cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
+        // apply ilu0
+        cusparseDbsrsv2_solve(cusparseHandle, order, \
+                              operation, Nb, nnzb, &one, \
+                              descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_p, d_t, policy, d_buffer);
+        cusparseDbsrsv2_solve(cusparseHandle, order, \
+                              operation, Nb, nnzb, &one, \
+                              descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_pw, policy, d_buffer);

-        cublasDscal(cublasHandle, n, &mone, d_r, 1);
-        cublasDaxpy(cublasHandle, n, &one, d_b, 1, d_r, 1);
-        cublasDcopy(cublasHandle, n, d_r, 1, d_rw, 1);
-        cublasDcopy(cublasHandle, n, d_r, 1, d_p, 1); 
-        cublasDnrm2(cublasHandle, n, d_r, 1, &norm_0);
+        // spmv
+        cusparseDbsrmv(cusparseHandle, order, \
+                       operation, Nb, Nb, nnzb, \
+                       &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_pw, &zero, d_v);
+
+        // apply wellContributions
+        if (wellContribs.getNumWells() > 0) {
+            wellContribs.apply(d_pw, d_v);
+        }
+
+        cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
+        alpha = rho / tmp1;
+        nalpha = -alpha;
+        cublasDaxpy(cublasHandle, n, &nalpha, d_v, 1, d_r, 1);
+        cublasDaxpy(cublasHandle, n, &alpha, d_pw, 1, d_x, 1);
+        cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
+
+        if (norm < tolerance * norm_0) {
+            break;
+        }
+
+        it += 0.5;
+
+        // apply ilu0
+        cusparseDbsrsv2_solve(cusparseHandle, order, \
+                              operation, Nb, nnzb, &one, \
+                              descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_r, d_t, policy, d_buffer);
+        cusparseDbsrsv2_solve(cusparseHandle, order, \
+                              operation, Nb, nnzb, &one, \
+                              descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_s, policy, d_buffer);
+
+        // spmv
+        cusparseDbsrmv(cusparseHandle, order, \
+                       operation, Nb, Nb, nnzb, &one, descr_M, \
+                       d_bVals, d_bRows, d_bCols, block_size, d_s, &zero, d_t);
+
+        // apply wellContributions
+        if (wellContribs.getNumWells() > 0) {
+            wellContribs.apply(d_s, d_t);
+        }
+
+        cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
+        cublasDdot(cublasHandle, n, d_t, 1, d_t, 1, &tmp2);
+        omega = tmp1 / tmp2;
+        nomega = -omega;
+        cublasDaxpy(cublasHandle, n, &omega, d_s, 1, d_x, 1);
+        cublasDaxpy(cublasHandle, n, &nomega, d_t, 1, d_r, 1);
+
+        cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
+
+
+        if (norm < tolerance * norm_0) {
+            break;
+        }

        if (verbosity > 1) {
            std::ostringstream out;
-            out << std::scientific << "cusparseSolver initial norm: " << norm_0;
-            OpmLog::info(out.str());
-        }
-
-        for (it = 0.5; it < maxit; it+=0.5) {
-            rhop = rho;
-            cublasDdot(cublasHandle, n, d_rw, 1, d_r, 1, &rho);
-
-            if (it > 1) {
-                beta = (rho/rhop) * (alpha/omega);
-                nomega = -omega;
-                cublasDaxpy(cublasHandle, n, &nomega, d_v, 1, d_p, 1);
-                cublasDscal(cublasHandle, n, &beta, d_p, 1);
-                cublasDaxpy(cublasHandle, n, &one, d_r, 1, d_p, 1);
-            }
-
-            // apply ilu0
-            cusparseDbsrsv2_solve(cusparseHandle, order, \
-                operation, Nb, nnzb, &one, \
-                descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_p, d_t, policy, d_buffer);
-            cusparseDbsrsv2_solve(cusparseHandle, order, \
-                operation, Nb, nnzb, &one, \
-                descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_pw, policy, d_buffer);
-
-            // spmv
-            cusparseDbsrmv(cusparseHandle, order, \
-                operation, Nb, Nb, nnzb, \
-                &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_pw, &zero, d_v);
-
-            // apply wellContributions
-            if(wellContribs.getNumWells() > 0){
-                wellContribs.apply(d_pw, d_v);
-            }
-
-            cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
-            alpha = rho / tmp1;
-            nalpha = -alpha;
-            cublasDaxpy(cublasHandle, n, &nalpha, d_v, 1, d_r, 1);
-            cublasDaxpy(cublasHandle, n, &alpha, d_pw, 1, d_x, 1);
-            cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
-
-            if (norm < tolerance * norm_0 && it > minit) {
-                break;
-            }
-
-            it += 0.5;
-
-            // apply ilu0
-            cusparseDbsrsv2_solve(cusparseHandle, order, \
-                operation, Nb, nnzb, &one, \
-                descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_r, d_t, policy, d_buffer);
-            cusparseDbsrsv2_solve(cusparseHandle, order, \
-                operation, Nb, nnzb, &one, \
-                descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_s, policy, d_buffer);
-
-            // spmv
-            cusparseDbsrmv(cusparseHandle, order, \
-                operation, Nb, Nb, nnzb, &one, descr_M, \
-                d_bVals, d_bRows, d_bCols, block_size, d_s, &zero, d_t);
-
-            // apply wellContributions
-            if(wellContribs.getNumWells() > 0){
-                wellContribs.apply(d_s, d_t);
-            }
-
-            cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
-            cublasDdot(cublasHandle, n, d_t, 1, d_t, 1, &tmp2);
-            omega = tmp1 / tmp2;
-            nomega = -omega;
-            cublasDaxpy(cublasHandle, n, &omega, d_s, 1, d_x, 1);
-            cublasDaxpy(cublasHandle, n, &nomega, d_t, 1, d_r, 1);
-
-            cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
-
-
-            if (norm < tolerance * norm_0 && it > minit) {
-                break;
-            }
-
-            if (verbosity > 1) {
-                std::ostringstream out;
-                out << "it: " << it << std::scientific << ", norm: " << norm;
-                OpmLog::info(out.str());
-            }
-        }
-
-        t_total2 = second();
-
-        res.iterations = std::min(it, (float)maxit);
-        res.reduction = norm/norm_0;
-        res.conv_rate  = static_cast<double>(pow(res.reduction,1.0/it));
-        res.elapsed = t_total2 - t_total1;
-        res.converged = (it != (maxit + 0.5));
-
-        if (verbosity > 0) {
-            std::ostringstream out;
-            out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
-                   ", time per iteration: " << res.elapsed/it << ", iterations: " << it;
+            out << "it: " << it << std::scientific << ", norm: " << norm;
            OpmLog::info(out.str());
        }
    }

+    t_total2 = second();

-    void cusparseSolverBackend::initialize(int N, int nnz, int dim) {
-        this->N = N;
-        this->nnz = nnz;
-        this->block_size = dim;
-        this->nnzb = nnz/block_size/block_size;
-        Nb = (N + dim - 1) / dim;
+    res.iterations = std::min(it, (float)maxit);
+    res.reduction = norm / norm_0;
+    res.conv_rate  = static_cast<double>(pow(res.reduction, 1.0 / it));
+    res.elapsed = t_total2 - t_total1;
+    res.converged = (it != (maxit + 0.5));
+
+    if (verbosity > 0) {
        std::ostringstream out;
-        out << "Initializing GPU, matrix size: " << N << " blocks, nnz: " << nnzb << " blocks";
-        OpmLog::info(out.str());
-        out.str("");
-        out.clear();
-        out << "Minit: " << minit << ", maxit: " << maxit << std::scientific << ", tolerance: " << tolerance;
+        out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
+            ", time per iteration: " << res.elapsed / it << ", iterations: " << it;
        OpmLog::info(out.str());
+    }
+}

-        int deviceID = 0;
-        cudaSetDevice(deviceID);
-        cudaCheckLastError("Could not get device");
-        struct cudaDeviceProp props;
-        cudaGetDeviceProperties(&props, deviceID);
-        cudaCheckLastError("Could not get device properties");
-        out.str("");
-        out.clear();
-        out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor;
-        OpmLog::info(out.str());

-        cudaStreamCreate(&stream);
-        cudaCheckLastError("Could not create stream");
+void cusparseSolverBackend::initialize(int N, int nnz, int dim) {
+    this->N = N;
+    this->nnz = nnz;
+    this->block_size = dim;
+    this->nnzb = nnz / block_size / block_size;
+    Nb = (N + dim - 1) / dim;
+    std::ostringstream out;
+    out << "Initializing GPU, matrix size: " << N << " blocks, nnz: " << nnzb << " blocks";
+    OpmLog::info(out.str());
+    out.str("");
+    out.clear();
+    out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance;
+    OpmLog::info(out.str());

-        cublasCreate(&cublasHandle);
-        cudaCheckLastError("Could not create cublasHandle");
+    int deviceID = 0;
+    cudaSetDevice(deviceID);
+    cudaCheckLastError("Could not get device");
+    struct cudaDeviceProp props;
+    cudaGetDeviceProperties(&props, deviceID);
+    cudaCheckLastError("Could not get device properties");
+    out.str("");
+    out.clear();
+    out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor;
+    OpmLog::info(out.str());

-        cusparseCreate(&cusparseHandle);
-        cudaCheckLastError("Could not create cusparseHandle");
+    cudaStreamCreate(&stream);
+    cudaCheckLastError("Could not create stream");

-        cudaMalloc((void**)&d_x, sizeof(double) * N);
-        cudaMalloc((void**)&d_b, sizeof(double) * N);
-        cudaMalloc((void**)&d_r, sizeof(double) * N);
-        cudaMalloc((void**)&d_rw,sizeof(double) * N);
-        cudaMalloc((void**)&d_p, sizeof(double) * N);
-        cudaMalloc((void**)&d_pw,sizeof(double) * N);
-        cudaMalloc((void**)&d_s, sizeof(double) * N);
-        cudaMalloc((void**)&d_t, sizeof(double) * N);
-        cudaMalloc((void**)&d_v, sizeof(double) * N);
-        cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
-        cudaMalloc((void**)&d_bCols, sizeof(double) * nnz);
-        cudaMalloc((void**)&d_bRows, sizeof(double) * (Nb+1));
-        cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
-        cudaCheckLastError("Could not allocate enough memory on GPU");
+    cublasCreate(&cublasHandle);
+    cudaCheckLastError("Could not create cublasHandle");

-        cublasSetStream(cublasHandle, stream);
-        cudaCheckLastError("Could not set stream to cublas");
-        cusparseSetStream(cusparseHandle, stream);
-        cudaCheckLastError("Could not set stream to cusparse");
+    cusparseCreate(&cusparseHandle);
+    cudaCheckLastError("Could not create cusparseHandle");
+
+    cudaMalloc((void**)&d_x, sizeof(double) * N);
+    cudaMalloc((void**)&d_b, sizeof(double) * N);
+    cudaMalloc((void**)&d_r, sizeof(double) * N);
+    cudaMalloc((void**)&d_rw, sizeof(double) * N);
+    cudaMalloc((void**)&d_p, sizeof(double) * N);
+    cudaMalloc((void**)&d_pw, sizeof(double) * N);
+    cudaMalloc((void**)&d_s, sizeof(double) * N);
+    cudaMalloc((void**)&d_t, sizeof(double) * N);
+    cudaMalloc((void**)&d_v, sizeof(double) * N);
+    cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
+    cudaMalloc((void**)&d_bCols, sizeof(double) * nnz);
+    cudaMalloc((void**)&d_bRows, sizeof(double) * (Nb + 1));
+    cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
+    cudaCheckLastError("Could not allocate enough memory on GPU");
+
+    cublasSetStream(cublasHandle, stream);
+    cudaCheckLastError("Could not set stream to cublas");
+    cusparseSetStream(cusparseHandle, stream);
+    cudaCheckLastError("Could not set stream to cusparse");

 #if COPY_ROW_BY_ROW
-        cudaMallocHost((void**)&vals_contiguous, sizeof(double) * nnz);
-        cudaCheckLastError("Could not allocate pinned memory");
+    cudaMallocHost((void**)&vals_contiguous, sizeof(double) * nnz);
+    cudaCheckLastError("Could not allocate pinned memory");
 #endif

-        initialized = true;
-    } // end initialize()
+    initialized = true;
+} // end initialize()

-    void cusparseSolverBackend::finalize() {
-        if (initialized) {
-            cudaFree(d_x);
-            cudaFree(d_b);
-            cudaFree(d_r);
-            cudaFree(d_rw);
-            cudaFree(d_p);
-            cudaFree(d_pw);
-            cudaFree(d_s);
-            cudaFree(d_t);
-            cudaFree(d_v);
-            cudaFree(d_mVals);
-            cudaFree(d_bVals);
-            cudaFree(d_bCols);
-            cudaFree(d_bRows);
-            cudaFree(d_buffer);
-            cusparseDestroyBsrilu02Info(info_M);
-            cusparseDestroyBsrsv2Info(info_L);
-            cusparseDestroyBsrsv2Info(info_U);
-            cusparseDestroyMatDescr(descr_B);
-            cusparseDestroyMatDescr(descr_M);
-            cusparseDestroyMatDescr(descr_L);
-            cusparseDestroyMatDescr(descr_U);
-            cusparseDestroy(cusparseHandle);
-            cublasDestroy(cublasHandle);
+void cusparseSolverBackend::finalize() {
+    if (initialized) {
+        cudaFree(d_x);
+        cudaFree(d_b);
+        cudaFree(d_r);
+        cudaFree(d_rw);
+        cudaFree(d_p);
+        cudaFree(d_pw);
+        cudaFree(d_s);
+        cudaFree(d_t);
+        cudaFree(d_v);
+        cudaFree(d_mVals);
+        cudaFree(d_bVals);
+        cudaFree(d_bCols);
+        cudaFree(d_bRows);
+        cudaFree(d_buffer);
+        cusparseDestroyBsrilu02Info(info_M);
+        cusparseDestroyBsrsv2Info(info_L);
+        cusparseDestroyBsrsv2Info(info_U);
+        cusparseDestroyMatDescr(descr_B);
+        cusparseDestroyMatDescr(descr_M);
+        cusparseDestroyMatDescr(descr_L);
+        cusparseDestroyMatDescr(descr_U);
+        cusparseDestroy(cusparseHandle);
+        cublasDestroy(cublasHandle);
 #if COPY_ROW_BY_ROW
-            cudaFreeHost(vals_contiguous);
+        cudaFreeHost(vals_contiguous);
 #endif
-            cudaStreamDestroy(stream);
-        }
-    } // end finalize()
+        cudaStreamDestroy(stream);
+    }
+} // end finalize()


-    void cusparseSolverBackend::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b) {
+void cusparseSolverBackend::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b) {

-        double t1, t2;
-        if (verbosity > 2) {
-            t1 = second();
-        }
-
-#if COPY_ROW_BY_ROW
-        int sum = 0;
-        for(int i = 0; i < Nb; ++i){
-            int size_row = rows[i+1] - rows[i];
-            memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
-            sum += size_row * block_size * block_size;
-        }
-        cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
-#else
-        cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
-#endif
-
-        cudaMemcpyAsync(d_bCols, cols, nnz * sizeof(int), cudaMemcpyHostToDevice, stream);
-        cudaMemcpyAsync(d_bRows, rows, (Nb+1) * sizeof(int), cudaMemcpyHostToDevice, stream);
-        cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
-        cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
-
-        if (verbosity > 2) {
-            cudaStreamSynchronize(stream);
-            t2 = second();
-            std::ostringstream out;
-            out << "cusparseSolver::copy_system_to_gpu(): " << t2-t1 << " s";
-            OpmLog::info(out.str());
-        }
-    } // end copy_system_to_gpu()
-
-
-    // don't copy rowpointers and colindices, they stay the same
-    void cusparseSolverBackend::update_system_on_gpu(double *vals, int *rows, double *b) {
-
-        double t1, t2;
-        if (verbosity > 2) {
-            t1 = second();
-        }
-
-#if COPY_ROW_BY_ROW
-        int sum = 0;
-        for(int i = 0; i < Nb; ++i){
-            int size_row = rows[i+1] - rows[i];
-            memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
-            sum += size_row * block_size * block_size;
-        }
-        cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
-#else
-        cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
-#endif
-
-        cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
-        cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
-
-        if (verbosity > 2) {
-            cudaStreamSynchronize(stream);
-            t2 = second();
-            std::ostringstream out;
-            out << "cusparseSolver::update_system_on_gpu(): " << t2-t1 << " s";
-            OpmLog::info(out.str());
-        }
-    } // end update_system_on_gpu()
-
-
-    void cusparseSolverBackend::reset_prec_on_gpu() {
-        cudaMemcpyAsync(d_mVals, d_bVals, nnz  * sizeof(double), cudaMemcpyDeviceToDevice, stream);
+    double t1, t2;
+    if (verbosity > 2) {
+        t1 = second();
    }

-
-    bool cusparseSolverBackend::analyse_matrix() {
-
-        int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
-        double t1, t2;
-
-        if (verbosity > 2) {
-            t1 = second();
-        }
-
-        cusparseCreateMatDescr(&descr_B);
-        cusparseCreateMatDescr(&descr_M);
-        cusparseSetMatType(descr_B, CUSPARSE_MATRIX_TYPE_GENERAL);
-        cusparseSetMatType(descr_M, CUSPARSE_MATRIX_TYPE_GENERAL);
-        const cusparseIndexBase_t base_type = CUSPARSE_INDEX_BASE_ZERO;     // matrices from Flow are base0
-
-        cusparseSetMatIndexBase(descr_B, base_type);
-        cusparseSetMatIndexBase(descr_M, base_type);
-
-        cusparseCreateMatDescr(&descr_L);
-        cusparseSetMatIndexBase(descr_L, base_type);
-        cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL);
-        cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER);
-        cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_UNIT);
-
-        cusparseCreateMatDescr(&descr_U);
-        cusparseSetMatIndexBase(descr_U, base_type);
-        cusparseSetMatType(descr_U, CUSPARSE_MATRIX_TYPE_GENERAL);
-        cusparseSetMatFillMode(descr_U, CUSPARSE_FILL_MODE_UPPER);
-        cusparseSetMatDiagType(descr_U, CUSPARSE_DIAG_TYPE_NON_UNIT);
-        cudaCheckLastError("Could not initialize matrix descriptions");
-
-        cusparseCreateBsrilu02Info(&info_M);
-        cusparseCreateBsrsv2Info(&info_L);
-        cusparseCreateBsrsv2Info(&info_U);
-        cudaCheckLastError("Could not create analysis info");
-
-        cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzb,
-            descr_M, d_bVals, d_bRows, d_bCols, block_size, info_M, &d_bufferSize_M);
-        cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
-            descr_L, d_bVals, d_bRows, d_bCols, block_size, info_L, &d_bufferSize_L);
-        cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
-            descr_U, d_bVals, d_bRows, d_bCols, block_size, info_U, &d_bufferSize_U);
-        cudaCheckLastError();
-        d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
-        
-        cudaMalloc((void**)&d_buffer, d_bufferSize);
-
-        // analysis of ilu LU decomposition
-        cusparseDbsrilu02_analysis(cusparseHandle, order, \
-            Nb, nnzb, descr_B, d_bVals, d_bRows, d_bCols, \
-            block_size, info_M, policy, d_buffer);
-
-        int structural_zero;
-        cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
-        if (CUSPARSE_STATUS_ZERO_PIVOT == status) {
-            return false;
-        }
-
-        // analysis of ilu apply
-        cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
-            Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, \
-            block_size, info_L, policy, d_buffer);
-
-        cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
-            Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, \
-            block_size, info_U, policy, d_buffer);
-        cudaCheckLastError("Could not analyse level information");
-
-        if (verbosity > 2) {
-            cudaStreamSynchronize(stream);
-            t2 = second();
-            std::ostringstream out;
-            out << "cusparseSolver::analyse_matrix(): " << t2-t1 << " s";
-            OpmLog::info(out.str());
-        }
-
-        analysis_done = true;
-
-        return true;
-    } // end analyse_matrix()
-
-    bool cusparseSolverBackend::create_preconditioner() {
-
-        double t1, t2;
-        if (verbosity > 2) {
-            t1 = second();
-        }
-
-        d_mCols = d_bCols;
-        d_mRows = d_bRows;
-        cusparseDbsrilu02(cusparseHandle, order, \
-            Nb, nnzb, descr_M, d_mVals, d_mRows, d_mCols, \
-            block_size, info_M, policy, d_buffer);
-        cudaCheckLastError("Could not perform ilu decomposition");
-
-        int structural_zero;
-        // cusparseXbsrilu02_zeroPivot() calls cudaDeviceSynchronize()
-        cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
-        if (CUSPARSE_STATUS_ZERO_PIVOT == status) {
-            return false;
-        }
-
-        if (verbosity > 2) {
-            cudaStreamSynchronize(stream);
-            t2 = second();
-            std::ostringstream out;
-            out << "cusparseSolver::create_preconditioner(): " << t2-t1 << " s";
-            OpmLog::info(out.str());
-        }
-        return true;
-    } // end create_preconditioner()
-
-
-    void cusparseSolverBackend::solve_system(WellContributions& wellContribs, BdaResult &res) {
-        // actually solve
-        gpu_pbicgstab(wellContribs, res);
-        cudaStreamSynchronize(stream);
-        cudaCheckLastError("Something went wrong during the GPU solve");
-    } // end solve_system()
-
-
-    // copy result to host memory
-    // caller must be sure that x is a valid array
-    void cusparseSolverBackend::post_process(double *x) {
-
-        double t1, t2;
-        if (verbosity > 2) {
-            t1 = second();
-        }
-
-        cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream);
-        cudaStreamSynchronize(stream);
-
-        if (verbosity > 2) {
-            t2 = second();
-            std::ostringstream out;
-            out << "cusparseSolver::post_process(): " << t2-t1 << " s";
-            OpmLog::info(out.str());
-        }
-    } // end post_process()
-
-
-    typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus;
-
-    cusparseSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) { 
-        if (initialized == false) {
-            initialize(N, nnz, dim);
-            copy_system_to_gpu(vals, rows, cols, b);
-        }else{
-            update_system_on_gpu(vals, rows, b);
-        }
-        if (analysis_done == false) {
-            if (!analyse_matrix()) {
-                return cusparseSolverStatus::CUSPARSE_SOLVER_ANALYSIS_FAILED;
-            }
-        }
-        reset_prec_on_gpu();
-        if (create_preconditioner()) {
-            solve_system(wellContribs, res);
-        }else{
-            return cusparseSolverStatus::CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED;
-        }
-        return cusparseSolverStatus::CUSPARSE_SOLVER_SUCCESS;
+#if COPY_ROW_BY_ROW
+    int sum = 0;
+    for (int i = 0; i < Nb; ++i) {
+        int size_row = rows[i + 1] - rows[i];
+        memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
+        sum += size_row * block_size * block_size;
    }
+    cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+#else
+    cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+#endif
+
+    cudaMemcpyAsync(d_bCols, cols, nnz * sizeof(int), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bRows, rows, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
+
+    if (verbosity > 2) {
+        cudaStreamSynchronize(stream);
+        t2 = second();
+        std::ostringstream out;
+        out << "cusparseSolver::copy_system_to_gpu(): " << t2 - t1 << " s";
+        OpmLog::info(out.str());
+    }
+} // end copy_system_to_gpu()
+
+
+// don't copy rowpointers and colindices, they stay the same
+void cusparseSolverBackend::update_system_on_gpu(double *vals, int *rows, double *b) {
+
+    double t1, t2;
+    if (verbosity > 2) {
+        t1 = second();
+    }
+
+#if COPY_ROW_BY_ROW
+    int sum = 0;
+    for (int i = 0; i < Nb; ++i) {
+        int size_row = rows[i + 1] - rows[i];
+        memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
+        sum += size_row * block_size * block_size;
+    }
+    cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+#else
+    cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+#endif
+
+    cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
+
+    if (verbosity > 2) {
+        cudaStreamSynchronize(stream);
+        t2 = second();
+        std::ostringstream out;
+        out << "cusparseSolver::update_system_on_gpu(): " << t2 - t1 << " s";
+        OpmLog::info(out.str());
+    }
+} // end update_system_on_gpu()
+
+
+void cusparseSolverBackend::reset_prec_on_gpu() {
+    cudaMemcpyAsync(d_mVals, d_bVals, nnz  * sizeof(double), cudaMemcpyDeviceToDevice, stream);
+}
+
+
+bool cusparseSolverBackend::analyse_matrix() {
+
+    int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
+    double t1, t2;
+
+    if (verbosity > 2) {
+        t1 = second();
+    }
+
+    cusparseCreateMatDescr(&descr_B);
+    cusparseCreateMatDescr(&descr_M);
+    cusparseSetMatType(descr_B, CUSPARSE_MATRIX_TYPE_GENERAL);
+    cusparseSetMatType(descr_M, CUSPARSE_MATRIX_TYPE_GENERAL);
+    const cusparseIndexBase_t base_type = CUSPARSE_INDEX_BASE_ZERO;     // matrices from Flow are base0
+
+    cusparseSetMatIndexBase(descr_B, base_type);
+    cusparseSetMatIndexBase(descr_M, base_type);
+
+    cusparseCreateMatDescr(&descr_L);
+    cusparseSetMatIndexBase(descr_L, base_type);
+    cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL);
+    cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER);
+    cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_UNIT);
+
+    cusparseCreateMatDescr(&descr_U);
+    cusparseSetMatIndexBase(descr_U, base_type);
+    cusparseSetMatType(descr_U, CUSPARSE_MATRIX_TYPE_GENERAL);
+    cusparseSetMatFillMode(descr_U, CUSPARSE_FILL_MODE_UPPER);
+    cusparseSetMatDiagType(descr_U, CUSPARSE_DIAG_TYPE_NON_UNIT);
+    cudaCheckLastError("Could not initialize matrix descriptions");
+
+    cusparseCreateBsrilu02Info(&info_M);
+    cusparseCreateBsrsv2Info(&info_L);
+    cusparseCreateBsrsv2Info(&info_U);
+    cudaCheckLastError("Could not create analysis info");
+
+    cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzb,
+                                 descr_M, d_bVals, d_bRows, d_bCols, block_size, info_M, &d_bufferSize_M);
+    cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
+                               descr_L, d_bVals, d_bRows, d_bCols, block_size, info_L, &d_bufferSize_L);
+    cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
+                               descr_U, d_bVals, d_bRows, d_bCols, block_size, info_U, &d_bufferSize_U);
+    cudaCheckLastError();
+    d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
+
+    cudaMalloc((void**)&d_buffer, d_bufferSize);
+
+    // analysis of ilu LU decomposition
+    cusparseDbsrilu02_analysis(cusparseHandle, order, \
+                               Nb, nnzb, descr_B, d_bVals, d_bRows, d_bCols, \
+                               block_size, info_M, policy, d_buffer);
+
+    int structural_zero;
+    cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
+    if (CUSPARSE_STATUS_ZERO_PIVOT == status) {
+        return false;
+    }
+
+    // analysis of ilu apply
+    cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
+                             Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, \
+                             block_size, info_L, policy, d_buffer);
+
+    cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
+                             Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, \
+                             block_size, info_U, policy, d_buffer);
+    cudaCheckLastError("Could not analyse level information");
+
+    if (verbosity > 2) {
+        cudaStreamSynchronize(stream);
+        t2 = second();
+        std::ostringstream out;
+        out << "cusparseSolver::analyse_matrix(): " << t2 - t1 << " s";
+        OpmLog::info(out.str());
+    }
+
+    analysis_done = true;
+
+    return true;
+} // end analyse_matrix()
+
+bool cusparseSolverBackend::create_preconditioner() {
+
+    double t1, t2;
+    if (verbosity > 2) {
+        t1 = second();
+    }
+
+    d_mCols = d_bCols;
+    d_mRows = d_bRows;
+    cusparseDbsrilu02(cusparseHandle, order, \
+                      Nb, nnzb, descr_M, d_mVals, d_mRows, d_mCols, \
+                      block_size, info_M, policy, d_buffer);
+    cudaCheckLastError("Could not perform ilu decomposition");
+
+    int structural_zero;
+    // cusparseXbsrilu02_zeroPivot() calls cudaDeviceSynchronize()
+    cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
+    if (CUSPARSE_STATUS_ZERO_PIVOT == status) {
+        return false;
+    }
+
+    if (verbosity > 2) {
+        cudaStreamSynchronize(stream);
+        t2 = second();
+        std::ostringstream out;
+        out << "cusparseSolver::create_preconditioner(): " << t2 - t1 << " s";
+        OpmLog::info(out.str());
+    }
+    return true;
+} // end create_preconditioner()
+
+
+void cusparseSolverBackend::solve_system(WellContributions& wellContribs, BdaResult &res) {
+    // actually solve
+    gpu_pbicgstab(wellContribs, res);
+    cudaStreamSynchronize(stream);
+    cudaCheckLastError("Something went wrong during the GPU solve");
+} // end solve_system()
+
+
+// copy result to host memory
+// caller must be sure that x is a valid array
+void cusparseSolverBackend::get_result(double *x) {
+
+    double t1, t2;
+    if (verbosity > 2) {
+        t1 = second();
+    }
+
+    cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream);
+    cudaStreamSynchronize(stream);
+
+    if (verbosity > 2) {
+        t2 = second();
+        std::ostringstream out;
+        out << "cusparseSolver::post_process(): " << t2 - t1 << " s";
+        OpmLog::info(out.str());
+    }
+} // end post_process()
+
+
+typedef BdaSolver::BdaSolverStatus BdaSolverStatus;
+
+BdaSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
+    if (initialized == false) {
+        initialize(N, nnz, dim);
+        copy_system_to_gpu(vals, rows, cols, b);
+    } else {
+        update_system_on_gpu(vals, rows, b);
+    }
+    if (analysis_done == false) {
+        if (!analyse_matrix()) {
+            return BdaSolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
+        }
+    }
+    reset_prec_on_gpu();
+    if (create_preconditioner()) {
+        solve_system(wellContribs, res);
+    } else {
+        return BdaSolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
+    }
+    return BdaSolverStatus::BDA_SOLVER_SUCCESS;
+}


 }
--- a/opm/simulators/linalg/bda/cusparseSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/cusparseSolverBackend.hpp
@ -24,21 +24,18 @@
 #include "cublas_v2.h"
 #include "cusparse_v2.h"

-#include "opm/simulators/linalg/bda/BdaResult.hpp"
+#include <opm/simulators/linalg/bda/BdaResult.hpp>
+#include <opm/simulators/linalg/bda/BdaSolver.hpp>
 #include <opm/simulators/linalg/bda/WellContributions.hpp>

-namespace Opm
+namespace bda
 {

 /// This class implements a cusparse-based ilu0-bicgstab solver on GPU
-class cusparseSolverBackend{
+class cusparseSolverBackend : public BdaSolver {

 private:

-    int minit;
-    int maxit;
-    double tolerance;
-
    cublasHandle_t cublasHandle;
    cusparseHandle_t cusparseHandle;
    cudaStream_t stream;
@ -49,24 +46,13 @@ private:
    double *d_bVals, *d_mVals;
    int *d_bCols, *d_mCols;
    int *d_bRows, *d_mRows;
-    double *d_x, *d_b, *d_r, *d_rw, *d_p;
+    double *d_x, *d_b, *d_r, *d_rw, *d_p;     // vectors, used during linear solve
    double *d_pw, *d_s, *d_t, *d_v;
    void *d_buffer;
-    int N, Nb, nnz, nnzb;
-    double *vals_contiguous;
+    double *vals_contiguous;                  // only used if COPY_ROW_BY_ROW is true in cusparseSolverBackend.cpp

-    int block_size;
-
-    bool initialized = false;
    bool analysis_done = false;

-    // verbosity
-    // 0: print nothing during solves, only when initializing
-    // 1: print number of iterations and final norm
-    // 2: also print norm each iteration
-    // 3: also print timings of different backend functions
-
-    int verbosity = 0;

    /// Solve linear system using ilu0-bicgstab
    /// \param[in] wellContribs   contains all WellContributions, to apply them separately, instead of adding them to matrix A
@ -113,12 +99,6 @@ private:

 public:

-    enum class cusparseSolverStatus {
-        CUSPARSE_SOLVER_SUCCESS,
-        CUSPARSE_SOLVER_ANALYSIS_FAILED,
-        CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED,
-        CUSPARSE_SOLVER_UNKNOWN_ERROR
-    };

    /// Construct a cusparseSolver
    /// \param[in] linear_solver_verbosity    verbosity of cusparseSolver
@ -140,15 +120,15 @@ public:
    /// \param[in] wellContribs   contains all WellContributions, to apply them separately, instead of adding them to matrix A
    /// \param[inout] res         summary of solver result
    /// \return                   status code
-    cusparseSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res);
+    BdaSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;

-    /// Post processing after linear solve, now only copies resulting x vector back
+    /// Get resulting vector x after linear solve, also includes post processing if necessary
    /// \param[inout] x        resulting x vector, caller must guarantee that x points to a valid array
-    void post_process(double *x);
+    void get_result(double *x) override;

 }; // end class cusparseSolverBackend

-}
+} // namespace bda

 #endif

--- a/opm/simulators/linalg/bda/openclKernels.hpp
+++ b/opm/simulators/linalg/bda/openclKernels.hpp
@ -0,0 +1,401 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef OPENCL_HPP
+#define OPENCL_HPP
+
+namespace bda
+{
+
+    const char* kernel_1  = R"(
+    __kernel void hello1(void)
+    {
+        printf("in the kernel 1\n");
+    }
+    )";
+
+    const char* kernel_2  = R"(
+    __kernel void hello2(
+        __global int *in1,
+        __global int *in2,
+        __global int *out)
+    {
+        printf("in the kernel 2\n");
+        int idx = get_global_id(0);
+        out[idx] = in1[idx];
+        for(int i = 0; i < 20000000; ++i){
+            out[idx] = in1[idx] - in2[idx] * out[idx];
+            if(out[idx] > 100000){
+                out[idx] -= 100000;
+            }
+            if(out[idx] < -100000){
+                out[idx] += 100000;
+            }
+        }
+    }
+    )";
+
+    const char* axpy_s  = R"(
+    __kernel void axpy(
+        __global double *in,
+        const double a,
+        __global double *out,
+        const int N)
+    {
+        unsigned int NUM_THREADS = get_global_size(0);
+        int idx = get_global_id(0);
+
+        while(idx < N){
+            out[idx] += a * in[idx];
+            idx += NUM_THREADS;
+        }
+    }
+    )";
+
+
+    // returns partial sums, instead of the final dot product
+    const char* dot_1_s = R"(
+    __kernel void dot_1(
+        __global double *in1,
+        __global double *in2,
+        __global double *out,
+        const unsigned int N,
+        __local double *tmp)
+    {
+        unsigned int tid = get_local_id(0);
+        unsigned int bsize = get_local_size(0);
+        unsigned int bid = get_global_id(0) / bsize;
+        unsigned int i = get_global_id(0);
+        unsigned int NUM_THREADS = get_global_size(0);
+
+        double sum = 0.0;
+        while(i < N){
+            sum += in1[i] * in2[i];
+            i += NUM_THREADS;
+        }
+        tmp[tid] = sum;
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // do reduction in shared mem
+        for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1)
+        {
+            if (tid < s)
+            {
+                tmp[tid] += tmp[tid + s];
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+
+        // write result for this block to global mem
+        if (tid == 0) out[get_group_id(0)] = tmp[0];
+    }
+    )";
+
+
+    // returns partial sums, instead of the final norm
+    // the square root must be computed on CPU
+    const char* norm_s = R"(
+    __kernel void norm(
+        __global double *in,
+        __global double *out,
+        const unsigned int N,
+        __local double *tmp)
+    {
+        unsigned int tid = get_local_id(0);
+        unsigned int bsize = get_local_size(0);
+        unsigned int bid = get_global_id(0) / bsize;
+        unsigned int i = get_global_id(0);
+        unsigned int NUM_THREADS = get_global_size(0);
+
+        double local_sum = 0.0;
+        while(i < N){
+            local_sum += in[i] * in[i];
+            i += NUM_THREADS;
+        }
+        tmp[tid] = local_sum;
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // do reduction in shared mem
+        for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1)
+        {
+            if (tid < s)
+            {
+                tmp[tid] += tmp[tid + s];
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+
+        // write result for this block to global mem
+        if (tid == 0) out[get_group_id(0)] = tmp[0];
+    }
+    )";
+
+
+    // p = (p - omega * v) * beta + r
+    const char* custom_s = R"(
+    __kernel void custom(
+        __global double *p,
+        __global double *v,
+        __global double *r,
+        const double omega,
+        const double beta,
+        const int N)
+    {
+        const unsigned int NUM_THREADS = get_global_size(0);
+        unsigned int idx = get_global_id(0);
+
+        while(idx < N){
+            double res = p[idx];
+            res -= omega * v[idx];
+            res *= beta;
+            res += r[idx];
+            p[idx] = res;
+            idx += NUM_THREADS;
+        }
+    }
+    )";
+
+
+    // b = mat * x
+    // algorithm based on:
+    // Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
+    // Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
+    const char* spmv_blocked_s = R"(
+    __kernel void spmv_blocked(
+        __global const double *vals,
+        __global const int *cols,
+        __global const int *rows,
+        const int Nb,
+        __global const double *x,
+        __global double *b,
+        const unsigned int block_size,
+        __local double *tmp)
+    {
+        const unsigned int warpsize = 32;
+        const unsigned int bsize = get_local_size(0);
+        const unsigned int idx_b = get_global_id(0) / bsize;
+        const unsigned int idx_t = get_local_id(0);
+        unsigned int idx = idx_b * bsize + idx_t;
+        const unsigned int bs = block_size;
+        const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
+        const unsigned int num_blocks_per_warp = warpsize/bs/bs;
+        const unsigned int NUM_THREADS = get_global_size(0);
+        const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
+        unsigned int target_block_row = idx / warpsize;
+        const unsigned int lane = idx_t % warpsize;
+        const unsigned int c = (lane / bs) % bs;
+        const unsigned int r = lane % bs;
+
+        // for 3x3 blocks:
+        // num_active_threads: 27
+        // num_blocks_per_warp: 3
+
+        while(target_block_row < Nb){
+            unsigned int first_block = rows[target_block_row];
+            unsigned int last_block = rows[target_block_row+1];
+            unsigned int block = first_block + lane / (bs*bs);
+            double local_out = 0.0;
+
+            if(lane < num_active_threads){
+                for(; block < last_block; block += num_blocks_per_warp){
+                    double x_elem = x[cols[block]*bs + c];
+                    double A_elem = vals[block*bs*bs + c + r*bs];
+                    local_out += x_elem * A_elem;
+                }
+            }
+
+            // do reduction in shared mem
+            tmp[lane] = local_out;
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            for(unsigned int offset = 3; offset <= 24; offset <<= 1)
+            {
+                if (lane + offset < warpsize)
+                {
+                    tmp[lane] += tmp[lane + offset];
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+            }
+
+            if(lane < bs){
+                unsigned int row = target_block_row*bs + lane;
+                b[row] = tmp[lane];
+            }
+            target_block_row += num_warps_in_grid;
+        }
+    }
+    )";
+
+
+
+    // ILU apply part 1: forward substitution
+    // solves L*x=y where L is a lower triangular sparse blocked matrix
+    const char* ILU_apply1_s = R"(
+    __kernel void ILU_apply1(
+        __global const double *Lvals,
+        __global const unsigned int *Lcols,
+        __global const unsigned int *Lrows,
+        const unsigned int LrowSize,
+        __global const double *y,
+        __global double *x,
+        __global const unsigned int *nodesPerColorPrefix,
+        const unsigned int color,
+        const unsigned int block_size,
+        __local double *tmp)
+    {
+        const unsigned int warpsize = 32;
+        const unsigned int bs = block_size;
+        const unsigned int idx_t = get_local_id(0);
+        const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
+        const unsigned int num_blocks_per_warp = warpsize/bs/bs;
+        const unsigned int NUM_THREADS = get_global_size(0);
+        const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
+        unsigned int idx = get_global_id(0);
+        unsigned int target_block_row = idx / warpsize;
+        idx += nodesPerColorPrefix[color];
+        target_block_row += nodesPerColorPrefix[color];
+        const unsigned int lane = idx_t % warpsize;
+        const unsigned int c = (lane / bs) % bs;
+        const unsigned int r = lane % bs;
+
+        while(target_block_row < nodesPerColorPrefix[color+1]){
+            const unsigned int first_block = Lrows[target_block_row];
+            const unsigned int last_block = Lrows[target_block_row+1];
+            unsigned int block = first_block + lane / (bs*bs);
+            double local_out = 0.0;
+            if(lane < num_active_threads){
+                if(lane < bs){
+                    local_out = y[target_block_row*bs+lane];
+                }
+                for(; block < last_block; block += num_blocks_per_warp){
+                    const double x_elem = x[Lcols[block]*bs + c];
+                    const double A_elem = Lvals[block*bs*bs + c + r*bs];
+                    local_out -= x_elem * A_elem;
+                }
+            }
+
+            // do reduction in shared mem
+            tmp[lane] = local_out;
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            for(unsigned int offset = 3; offset <= 24; offset <<= 1)
+            {
+                if (lane + offset < warpsize)
+                {
+                    tmp[lane] += tmp[lane + offset];
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+            }
+
+            if(lane < bs){
+                const unsigned int row = target_block_row*bs + lane;
+                x[row] = tmp[lane];
+            }
+
+            target_block_row += num_warps_in_grid;
+        }
+    }
+    )";
+
+
+    // ILU apply part 2: backward substitution
+    // solves U*x=y where L is a lower triangular sparse blocked matrix
+    const char* ILU_apply2_s = R"(
+    __kernel void ILU_apply2(
+        __global const double *Uvals,
+        __global const int *Ucols,
+        __global const int *Urows,
+        const unsigned int UrowSize,
+        __global const double *invDiagVals,
+        __global double *x,
+        __global const unsigned int *nodesPerColorPrefix,
+        const unsigned int color,
+        const unsigned int block_size,
+        __local double *tmp)
+    {
+        const unsigned int warpsize = 32;
+        const unsigned int bs = block_size;
+        const unsigned int idx_t = get_local_id(0);
+        const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
+        const unsigned int num_blocks_per_warp = warpsize/bs/bs;
+        const unsigned int NUM_THREADS = get_global_size(0);
+        const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
+        unsigned int idx = get_global_id(0);
+        unsigned int target_block_row = idx / warpsize;
+        idx += nodesPerColorPrefix[color];
+        target_block_row += nodesPerColorPrefix[color];
+        const unsigned int lane = idx_t % warpsize;
+        const unsigned int c = (lane / bs) % bs;
+        const unsigned int r = lane % bs;
+        const double relaxation = 0.9;
+
+        while(target_block_row < nodesPerColorPrefix[color+1]){
+            const unsigned int first_block = Urows[UrowSize-target_block_row-1];
+            const unsigned int last_block = Urows[UrowSize-target_block_row];
+            unsigned int block = first_block + lane / (bs*bs);
+            double local_out = 0.0;
+            if(lane < num_active_threads){
+                if(lane < bs){
+                    const unsigned int row = target_block_row*bs+lane;
+                    local_out = x[row];
+                }
+                for(; block < last_block; block += num_blocks_per_warp){
+                    const double x_elem = x[Ucols[block]*bs + c];
+                    const double A_elem = Uvals[block*bs*bs + c + r*bs];
+                    local_out -= x_elem * A_elem;
+                }
+            }
+
+            // do reduction in shared mem
+            tmp[lane] = local_out;
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            for(unsigned int offset = 3; offset <= 24; offset <<= 1)
+            {
+                if (lane + offset < warpsize)
+                {
+                    tmp[lane] += tmp[lane + offset];
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+            }
+            local_out = tmp[lane];
+
+            if(lane < bs){
+                tmp[lane + bs*idx_t/warpsize] = local_out;
+                double sum = 0.0;
+                for(int i = 0; i < bs; ++i){
+                    sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
+                }
+
+                const unsigned int row = target_block_row*bs + lane;
+                x[row] = relaxation * sum;
+            }
+
+            target_block_row += num_warps_in_grid;
+        }
+    }
+    )";
+
+
+
+} // end namespace bda
+
+#endif
--- a/opm/simulators/linalg/bda/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp
@ -0,0 +1,763 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+#include <iostream>
+#include <sstream>
+
+#include <config.h>  // CMake
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+
+
+#define __CL_ENABLE_EXCEPTIONS
+#include <CL/cl.hpp>
+#include <opm/simulators/linalg/bda/openclKernels.hpp>
+
+
+#include <opm/simulators/linalg/bda/openclSolverBackend.hpp>
+#include <opm/simulators/linalg/bda/BdaResult.hpp>
+#include <opm/simulators/linalg/bda/Reorder.hpp>
+
+
+// iff true, the nonzeroes of the matrix are copied row-by-row into a contiguous, pinned memory array, then a single GPU memcpy is done
+// otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
+#define COPY_ROW_BY_ROW 0
+
+
+#define LEVEL_SCHEDULING 1
+#define GRAPH_COLORING   0
+
+namespace bda
+{
+
+using Opm::OpmLog;
+
+openclSolverBackend::openclSolverBackend(int verbosity_, int maxit_, double tolerance_) : BdaSolver(verbosity_, maxit_, tolerance_) {
+    prec = new Preconditioner(LEVEL_SCHEDULING, GRAPH_COLORING, verbosity_);
+}
+
+
+openclSolverBackend::~openclSolverBackend() {
+    finalize();
+}
+
+
+// divide A by B, and round up: return (int)ceil(A/B)
+unsigned int openclSolverBackend::ceilDivision(const unsigned int A, const unsigned int B)
+{
+    return A / B + (A % B > 0);
+}
+
+// just for verifying and debugging
+bool equal(float a, float b)
+{
+    const float tol_abs = 1e-2;
+    const float tol_rel = 1e-2;
+    return std::abs(a - b) <= std::max(tol_rel * std::max(std::abs(a), std::abs(b)), tol_abs);
+}
+
+double openclSolverBackend::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out)
+{
+    double t1 = 0.0, t2 = 0.0;
+    const unsigned int work_group_size = 1024;
+    const unsigned int num_work_groups = ceilDivision(N, work_group_size);
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+    const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
+    if (verbosity >= 4) {
+        t1 = second();
+    }
+
+    cl::Event event = (*dot_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in1, in2, out, N, cl::Local(lmem_per_work_group));
+
+    queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp);
+
+    double gpu_sum = 0.0;
+    for (unsigned int i = 0; i < num_work_groups; ++i) {
+        gpu_sum += tmp[i];
+    }
+
+    if (verbosity >= 4) {
+        event.wait();
+        t2 = second();
+        std::ostringstream oss;
+        oss << "openclSolver dot_w time: " << t2 - t1;
+        OpmLog::info(oss.str());
+    }
+
+    return gpu_sum;
+}
+
+double openclSolverBackend::norm_w(cl::Buffer in, cl::Buffer out)
+{
+    double t1 = 0.0, t2 = 0.0;
+    const unsigned int work_group_size = 1024;
+    const unsigned int num_work_groups = ceilDivision(N, work_group_size);
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+    const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
+    if (verbosity >= 4) {
+        t1 = second();
+    }
+
+    cl::Event event = (*norm_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, N, cl::Local(lmem_per_work_group));
+
+    queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp);
+
+    double gpu_norm = 0.0;
+    for (unsigned int i = 0; i < num_work_groups; ++i) {
+        gpu_norm += tmp[i];
+    }
+    gpu_norm = sqrt(gpu_norm);
+
+    if (verbosity >= 4) {
+        event.wait();
+        t2 = second();
+        std::ostringstream oss;
+        oss << "openclSolver norm_w time: " << t2 - t1;
+        OpmLog::info(oss.str());
+    }
+
+    return gpu_norm;
+}
+
+void openclSolverBackend::axpy_w(cl::Buffer in, const double a, cl::Buffer out)
+{
+    double t1 = 0.0, t2 = 0.0;
+    const unsigned int work_group_size = 32;
+    const unsigned int num_work_groups = ceilDivision(N, work_group_size);
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+    if (verbosity >= 4) {
+        t1 = second();
+    }
+
+    cl::Event event = (*axpy_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, a, out, N);
+
+    if (verbosity >= 4) {
+        event.wait();
+        t2 = second();
+        std::ostringstream oss;
+        oss << "openclSolver axpy_w time: " << t2 - t1;
+        OpmLog::info(oss.str());
+    }
+}
+
+void openclSolverBackend::custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, const double omega, const double beta)
+{
+    double t1 = 0.0, t2 = 0.0;
+    const unsigned int work_group_size = 32;
+    const unsigned int num_work_groups = ceilDivision(N, work_group_size);
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+    if (verbosity >= 4) {
+        t1 = second();
+    }
+
+    cl::Event event = (*custom_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), p, v, r, omega, beta, N);
+
+    if (verbosity >= 4) {
+        event.wait();
+        t2 = second();
+        std::ostringstream oss;
+        oss << "openclSolver custom_w time: " << t2 - t1;
+        OpmLog::info(oss.str());
+    }
+}
+
+void openclSolverBackend::spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b)
+{
+    double t1 = 0.0, t2 = 0.0;
+    const unsigned int work_group_size = 32;
+    const unsigned int num_work_groups = ceilDivision(N, work_group_size);
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+    const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
+    if (verbosity >= 4) {
+        t1 = second();
+    }
+
+    cl::Event event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
+
+    if (verbosity >= 4) {
+        event.wait();
+        t2 = second();
+        std::ostringstream oss;
+        oss << "openclSolver spmv_blocked_w time: " << t2 - t1;
+        OpmLog::info(oss.str());
+    }
+}
+
+
+void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
+
+    float it;
+    double rho, rhop, beta, alpha, omega, tmp1, tmp2;
+    double norm, norm_0;
+
+    double t_total1, t_total2, t1 = 0.0, t2 = 0.0;
+    double prec_time = 0.0, spmv_time = 0.0, well_time = 0.0, rest_time = 0.0;
+    t_total1 = second();
+
+    wellContribs.setOpenCLQueue(queue.get());
+    wellContribs.setReordering(toOrder, true);
+
+    // set r to the initial residual
+    // if initial x guess is not 0, must call applyblockedscaleadd(), not implemented
+    //applyblockedscaleadd(-1.0, mat, x, r);
+
+    // set initial values
+    cl::Event event;
+    queue->enqueueFillBuffer(d_p, 0, 0, sizeof(double) * N);
+    queue->enqueueFillBuffer(d_v, 0, 0, sizeof(double) * N, nullptr, &event);
+    event.wait();
+    rho = 1.0;
+    alpha = 1.0;
+    omega = 1.0;
+
+    queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(double) * N, nullptr, &event);
+    event.wait();
+    queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(double) * N, nullptr, &event);
+    event.wait();
+    queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &event);
+    event.wait();
+
+    norm = norm_w(d_r, d_tmp);
+    norm_0 = norm;
+
+    if (verbosity > 1) {
+        std::ostringstream out;
+        out << std::scientific << "openclSolver initial norm: " << norm_0;
+        OpmLog::info(out.str());
+    }
+
+    t1 = second();
+    for (it = 0.5; it < maxit; it += 0.5) {
+        rhop = rho;
+        rho = dot_w(d_rw, d_r, d_tmp);
+
+        if (it > 1) {
+            beta = (rho / rhop) * (alpha / omega);
+            custom_w(d_p, d_v, d_r, omega, beta);
+        }
+        t2 = second();
+        rest_time += t2 - t1;
+
+        // pw = prec(p)
+        t1 = second();
+        prec->apply(d_p, d_pw);
+        t2 = second();
+        prec_time += t2 - t1;
+
+        // v = A * pw
+        t1 = second();
+        spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_pw, d_v);
+        t2 = second();
+        spmv_time += t2 - t1;
+
+        // apply wellContributions
+        if (wellContribs.getNumWells() > 0) {
+            t1 = second();
+            wellContribs.apply(d_pw, d_v);
+            t2 = second();
+            well_time += t2 - t1;
+        }
+
+        t1 = second();
+        tmp1 = dot_w(d_rw, d_v, d_tmp);
+        alpha = rho / tmp1;
+        axpy_w(d_v, -alpha, d_r);      // r = r - alpha * v
+        axpy_w(d_pw, alpha, d_x);      // x = x + alpha * pw
+        norm = norm_w(d_r, d_tmp);
+        t2 = second();
+        rest_time += t2 - t1;
+
+        if (norm < tolerance * norm_0) {
+            break;
+        }
+
+        it += 0.5;
+
+        // s = prec(r)
+        t1 = second();
+        prec->apply(d_r, d_s);
+        t2 = second();
+        prec_time += t2 - t1;
+
+        // t = A * s
+        t1 = second();
+        spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_s, d_t);
+        t2 = second();
+        spmv_time += t2 - t1;
+
+        // apply wellContributions
+        if (wellContribs.getNumWells() > 0) {
+            t1 = second();
+            wellContribs.apply(d_s, d_t);
+            t2 = second();
+            well_time += t2 - t1;
+        }
+
+        t1 = second();
+        tmp1 = dot_w(d_t, d_r, d_tmp);
+        tmp2 = dot_w(d_t, d_t, d_tmp);
+        omega = tmp1 / tmp2;
+        axpy_w(d_s, omega, d_x);     // x = x + omega * s
+        axpy_w(d_t, -omega, d_r);    // r = r - omega * t
+        norm = norm_w(d_r, d_tmp);
+        t2 = second();
+        rest_time += t2 - t1;
+
+        if (norm < tolerance * norm_0) {
+            break;
+        }
+
+        if (verbosity > 1) {
+            std::ostringstream out;
+            out << "it: " << it << std::scientific << ", norm: " << norm;
+            OpmLog::info(out.str());
+        }
+    }
+
+    t2 = second();
+    t_total2 = second();
+    rest_time += t2 - t1;
+
+    res.iterations = std::min(it, (float)maxit);
+    res.reduction = norm / norm_0;
+    res.conv_rate  = static_cast<double>(pow(res.reduction, 1.0 / it));
+    res.elapsed = t_total2 - t_total1;
+    res.converged = (it != (maxit + 0.5));
+
+    if (verbosity > 0) {
+        std::ostringstream out;
+        out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
+            ", time per iteration: " << res.elapsed / it << ", iterations: " << it;
+        OpmLog::info(out.str());
+    }
+    if (verbosity >= 4) {
+        std::ostringstream out;
+        out << "openclSolver::ily_apply:   " << prec_time << "s\n";
+        out << "openclSolver::spmv:        " << spmv_time << "s\n";
+        out << "openclSolver::rest:        " << rest_time << "s\n";
+        out << "openclSolver::total_solve: " << res.elapsed << "s\n";
+        OpmLog::info(out.str());
+    }
+}
+
+
+void openclSolverBackend::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) {
+    this->N = N_;
+    this->nnz = nnz_;
+    this->block_size = dim;
+    this->nnzb = nnz_ / block_size / block_size;
+
+    Nb = (N + dim - 1) / dim;
+    std::ostringstream out;
+    out << "Initializing GPU, matrix size: " << N << " blocks, nnzb: " << nnzb << "\n";
+    out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
+    OpmLog::info(out.str());
+    out.str("");
+    out.clear();
+
+    int deviceID = 0;
+
+    cl_int err = CL_SUCCESS;
+    try {
+        std::vector<cl::Platform> platforms;
+        cl::Platform::get(&platforms);
+        if (platforms.size() == 0)
+        {
+            OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL platforms are found");
+        }
+        out << "Found " << platforms.size() << " OpenCL platforms" << "\n\n";
+
+        if (verbosity >= 1) {
+            std::string platform_info;
+            for (unsigned int platformID = 0; platformID < platforms.size(); ++platformID) {
+                platforms[platformID].getInfo(CL_PLATFORM_NAME, &platform_info);
+                out << "Platform name      : " << platform_info << "\n";
+                platforms[platformID].getInfo(CL_PLATFORM_VENDOR, &platform_info);
+                out << "Platform vendor    : " << platform_info << "\n";
+                platforms[platformID].getInfo(CL_PLATFORM_VERSION, &platform_info);
+                out << "Platform version   : " << platform_info << "\n";
+                platforms[platformID].getInfo(CL_PLATFORM_PROFILE, &platform_info);
+                out << "Platform profile   : " << platform_info << "\n";
+                platforms[platformID].getInfo(CL_PLATFORM_EXTENSIONS, &platform_info);
+                out << "Platform extensions: " << platform_info << "\n\n";
+            }
+        }
+        OpmLog::info(out.str());
+        out.str("");
+        out.clear();
+
+        cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[deviceID])(), 0};
+        context.reset(new cl::Context(CL_DEVICE_TYPE_GPU, properties));
+
+        std::vector<cl::Device> devices = context->getInfo<CL_CONTEXT_DEVICES>();
+        if (devices.size() == 0){
+            OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL devices are found");
+        }
+        out << "Found " << devices.size() << " OpenCL devices" << "\n\n";
+
+        if (verbosity >= 1) {
+            for (unsigned int i = 0; i < devices.size(); ++i) {
+                std::string device_info;
+                std::vector<size_t> work_sizes;
+                std::vector<cl_device_partition_property> partitions;
+
+                devices[i].getInfo(CL_DEVICE_NAME, &device_info);
+                out << "CL_DEVICE_NAME            : " << device_info << "\n";
+                devices[i].getInfo(CL_DEVICE_VENDOR, &device_info);
+                out << "CL_DEVICE_VENDOR          : " << device_info << "\n";
+                devices[i].getInfo(CL_DRIVER_VERSION, &device_info);
+                out << "CL_DRIVER_VERSION         : " << device_info << "\n";
+                devices[i].getInfo(CL_DEVICE_BUILT_IN_KERNELS, &device_info);
+                out << "CL_DEVICE_BUILT_IN_KERNELS: " << device_info << "\n";
+                devices[i].getInfo(CL_DEVICE_VERSION, &device_info);
+                out << "CL_DEVICE_VERSION         : " << device_info << "\n";
+                devices[i].getInfo(CL_DEVICE_PROFILE, &device_info);
+                out << "CL_DEVICE_PROFILE         : " << device_info << "\n";
+                devices[i].getInfo(CL_DEVICE_OPENCL_C_VERSION, &device_info);
+                out << "CL_DEVICE_OPENCL_C_VERSION: " << device_info << "\n";
+                devices[i].getInfo(CL_DEVICE_EXTENSIONS, &device_info);
+                out << "CL_DEVICE_EXTENSIONS      : " << device_info << "\n";
+
+                devices[i].getInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES, &work_sizes);
+                for (unsigned int j = 0; j < work_sizes.size(); ++j) {
+                    out << "CL_DEVICE_MAX_WORK_ITEM_SIZES[" << j << "]: " << work_sizes[j] << "\n";
+                }
+                devices[i].getInfo(CL_DEVICE_PARTITION_PROPERTIES, &partitions);
+                for (unsigned int j = 0; j < partitions.size(); ++j) {
+                    out << "CL_DEVICE_PARTITION_PROPERTIES[" << j << "]: " << partitions[j] << "\n";
+                }
+                partitions.clear();
+                devices[i].getInfo(CL_DEVICE_PARTITION_TYPE, &partitions);
+                for (unsigned int j = 0; j < partitions.size(); ++j) {
+                    out << "CL_DEVICE_PARTITION_PROPERTIES[" << j << "]: " << partitions[j] << "\n";
+                }
+
+                // C-style properties
+                cl_device_id tmp_id = devices[i]();
+                cl_ulong size;
+                clGetDeviceInfo(tmp_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &size, 0);
+                out << "CL_DEVICE_LOCAL_MEM_SIZE       : " << size / 1024 << " KB\n";
+                clGetDeviceInfo(tmp_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &size, 0);
+                out << "CL_DEVICE_GLOBAL_MEM_SIZE      : " << size / 1024 / 1024 / 1024 << " GB\n";
+                clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_ulong), &size, 0);
+                out << "CL_DEVICE_MAX_COMPUTE_UNITS    : " << size << "\n";
+                clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &size, 0);
+                out << "CL_DEVICE_MAX_MEM_ALLOC_SIZE   : " << size / 1024 / 1024 << " MB\n";
+                clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(cl_ulong), &size, 0);
+                out << "CL_DEVICE_MAX_WORK_GROUP_SIZE  : " << size << "\n";
+                clGetDeviceInfo(tmp_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &size, 0);
+                out << "CL_DEVICE_GLOBAL_MEM_SIZE      : " << size / 1024 / 1024 / 1024 << " GB\n\n";
+            }
+        }
+        OpmLog::info(out.str());
+
+        cl::Program::Sources source(1, std::make_pair(kernel_1, strlen(kernel_1)));  // what does this '1' mean? cl::Program::Sources is of type 'std::vector<std::pair<const char*, long unsigned int> >'
+        source.emplace_back(std::make_pair(kernel_2, strlen(kernel_2)));
+        source.emplace_back(std::make_pair(axpy_s, strlen(axpy_s)));
+        source.emplace_back(std::make_pair(dot_1_s, strlen(dot_1_s)));
+        source.emplace_back(std::make_pair(norm_s, strlen(norm_s)));
+        source.emplace_back(std::make_pair(custom_s, strlen(custom_s)));
+        source.emplace_back(std::make_pair(spmv_blocked_s, strlen(spmv_blocked_s)));
+        source.emplace_back(std::make_pair(ILU_apply1_s, strlen(ILU_apply1_s)));
+        source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
+        cl::Program program_ = cl::Program(*context, source);
+
+        program_.build(devices);
+
+        cl::Event event;
+        queue.reset(new cl::CommandQueue(*context, devices[0], 0, &err));
+
+        prec->setOpenCLContext(context.get());
+        prec->setOpenCLQueue(queue.get());
+
+        rb = new double[N];
+        tmp = new double[N];
+#if COPY_ROW_BY_ROW
+        vals_contiguous = new double[N];
+#endif
+
+        mat = (BlockedMatrix *)malloc(sizeof(BlockedMatrix));
+        mat->Nb = Nb;
+        mat->nnzbs = nnzb;
+        mat->nnzValues = (Block*)vals;
+        mat->colIndices = cols;
+        mat->rowPointers = rows;
+
+        d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_rb = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_r = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_rw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_p = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_pw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_s = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_t = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_v = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_tmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+
+        d_Avals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * nnz);
+        d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
+        d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
+
+        // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
+        // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
+
+        // actually creating the kernels
+        dot_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "dot_1")));
+        norm_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "norm")));
+        axpy_k.reset(new cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program_, "axpy")));
+        custom_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program_, "custom")));
+        spmv_blocked_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "spmv_blocked")));
+        ILU_apply1_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "ILU_apply1")));
+        ILU_apply2_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "ILU_apply2")));
+
+        prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());
+
+    } catch (cl::Error error) {
+        std::ostringstream oss;
+        oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")";
+        OpmLog::error(oss.str());
+    }
+
+
+    initialized = true;
+} // end initialize()
+
+void openclSolverBackend::finalize() {
+    delete[] rb;
+    delete[] tmp;
+#if COPY_ROW_BY_ROW
+    delete[] vals_contiguous;
+#endif
+} // end finalize()
+
+
+void openclSolverBackend::copy_system_to_gpu() {
+
+    double t1 = 0.0, t2 = 0.0;
+    if (verbosity > 2) {
+        t1 = second();
+    }
+
+    cl::Event event;
+
+#if COPY_ROW_BY_ROW
+    int sum = 0;
+    for (int i = 0; i < Nb; ++i) {
+        int size_row = rmat->rowPointers[i + 1] - rmat->rowPointers[i];
+        memcpy(vals_contiguous + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
+        sum += size_row * block_size * block_size;
+    }
+    queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous);
+#else
+    queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues);
+#endif
+
+    queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices);
+    queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers);
+    queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb);
+    queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
+    event.wait();
+
+    if (verbosity > 2) {
+        t2 = second();
+        std::ostringstream out;
+        out << "openclSolver::copy_system_to_gpu(): " << t2 - t1 << " s";
+        OpmLog::info(out.str());
+    }
+} // end copy_system_to_gpu()
+
+
+// don't copy rowpointers and colindices, they stay the same
+void openclSolverBackend::update_system_on_gpu() {
+
+    double t1 = 0.0, t2 = 0.0;
+    if (verbosity > 2) {
+        t1 = second();
+    }
+
+    cl::Event event;
+
+#if COPY_ROW_BY_ROW
+    int sum = 0;
+    for (int i = 0; i < Nb; ++i) {
+        int size_row = rmat->rowPointers[i + 1] - rmat->rowPointers[i];
+        memcpy(vals_contiguous + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
+        sum += size_row * block_size * block_size;
+    }
+    queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous);
+#else
+    queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues);
+#endif
+
+    queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb);
+    queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
+    event.wait();
+
+    if (verbosity > 2) {
+        t2 = second();
+        std::ostringstream out;
+        out << "openclSolver::update_system_on_gpu(): " << t2 - t1 << " s";
+        OpmLog::info(out.str());
+    }
+} // end update_system_on_gpu()
+
+
+bool openclSolverBackend::analyse_matrix() {
+
+    double t1 = 0.0, t2 = 0.0;
+
+    if (verbosity > 2) {
+        t1 = second();
+    }
+
+    bool success = prec->init(mat, block_size);
+    int work_group_size = 32;
+    int num_work_groups = ceilDivision(N, work_group_size);
+    int total_work_items = num_work_groups * work_group_size;
+    int lmem_per_work_group = work_group_size * sizeof(double);
+    prec->setKernelParameters(work_group_size, total_work_items, lmem_per_work_group);
+
+    toOrder = prec->getToOrder();
+    fromOrder = prec->getFromOrder();
+    rmat = prec->getRMat();
+
+    if (verbosity > 2) {
+        t2 = second();
+        std::ostringstream out;
+        out << "openclSolver::analyse_matrix(): " << t2 - t1 << " s";
+        OpmLog::info(out.str());
+    }
+
+    analysis_done = true;
+
+    return success;
+} // end analyse_matrix()
+
+
+void openclSolverBackend::update_system(double *vals, double *b) {
+    double t1 = 0.0, t2 = 0.0;
+    if (verbosity > 2) {
+        t1 = second();
+    }
+
+    mat->nnzValues = (Block*)vals;
+    //mat->nnzValues = static_cast<Block*>(vals);
+    blocked_reorder_vector_by_pattern(mat->Nb, b, fromOrder, rb);
+
+    if (verbosity > 2) {
+        t2 = second();
+        std::ostringstream out;
+        out << "openclSolver::update_system(): " << t2 - t1 << " s";
+        OpmLog::info(out.str());
+    }
+} // end update_system()
+
+
+bool openclSolverBackend::create_preconditioner() {
+
+    double t1 = 0.0, t2 = 0.0;
+    if (verbosity > 2) {
+        t1 = second();
+    }
+
+    bool result = prec->create_preconditioner(mat);
+
+    if (verbosity > 2) {
+        t2 = second();
+        std::ostringstream out;
+        out << "openclSolver::create_preconditioner(): " << t2 - t1 << " s";
+        OpmLog::info(out.str());
+    }
+    return result;
+} // end create_preconditioner()
+
+
+void openclSolverBackend::solve_system(WellContributions& wellContribs, BdaResult &res) {
+    // actually solve
+    double t1 = 0.0, t2 = 0.0;
+    if (verbosity > 2) {
+        t1 = second();
+    }
+
+    gpu_pbicgstab(wellContribs, res);
+
+    if (verbosity > 2) {
+        t2 = second();
+        std::ostringstream out;
+        out << "openclSolver::solve_system(): " << t2 - t1 << " s";
+        OpmLog::info(out.str());
+    }
+
+} // end solve_system()
+
+
+// copy result to host memory
+// caller must be sure that x is a valid array
+void openclSolverBackend::get_result(double *x) {
+
+    double t1 = 0.0, t2 = 0.0;
+    if (verbosity > 2) {
+        t1 = second();
+    }
+
+    queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, rb);
+    blocked_reorder_vector_by_pattern(mat->Nb, rb, toOrder, x);
+
+    if (verbosity > 2) {
+        t2 = second();
+        std::ostringstream out;
+        out << "openclSolver::get_result(): " << t2 - t1 << " s";
+        OpmLog::info(out.str());
+    }
+} // end get_result()
+
+
+
+typedef BdaSolver::BdaSolverStatus BdaSolverStatus;
+
+BdaSolverStatus openclSolverBackend::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
+    if (initialized == false) {
+        initialize(N_, nnz_,  dim, vals, rows, cols);
+        if (analysis_done == false) {
+            if (!analyse_matrix()) {
+                return BdaSolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
+            }
+        }
+        update_system(vals, b);
+        if (!create_preconditioner()) {
+            return BdaSolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
+        }
+        copy_system_to_gpu();
+    } else {
+        update_system(vals, b);
+        if (!create_preconditioner()) {
+            return BdaSolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
+        }
+        update_system_on_gpu();
+    }
+    solve_system(wellContribs, res);
+    return BdaSolverStatus::BDA_SOLVER_SUCCESS;
+}
+
+}
+
--- a/opm/simulators/linalg/bda/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp
@ -0,0 +1,197 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef OPM_OPENCLSOLVER_BACKEND_HEADER_INCLUDED
+#define OPM_OPENCLSOLVER_BACKEND_HEADER_INCLUDED
+
+#if HAVE_OPENCL
+#define __CL_ENABLE_EXCEPTIONS
+#include <CL/cl.hpp>            // up to OpenCL 1.2
+#endif
+
+
+#include <opm/simulators/linalg/bda/BdaResult.hpp>
+#include <opm/simulators/linalg/bda/BdaSolver.hpp>
+#include <opm/simulators/linalg/bda/WellContributions.hpp>
+
+#include <opm/simulators/linalg/bda/BILU0.hpp>
+typedef bda::BILU0 Preconditioner;
+
+namespace bda
+{
+
+/// This class implements a opencl-based ilu0-bicgstab solver on GPU
+class openclSolverBackend : public BdaSolver
+{
+
+private:
+
+    double *rb;                 // reordered b vector, the matrix is reordered, so b must also be
+    double *vals_contiguous;    // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
+
+    bool analysis_done = false;
+
+    // OpenCL variables must be reusable, they are initialized in initialize()
+    cl::Buffer d_Avals, d_Acols, d_Arows;        // (reordered) matrix in BSR format on GPU
+    cl::Buffer d_x, d_b, d_rb, d_r, d_rw, d_p;   // vectors, used during linear solve
+    cl::Buffer d_pw, d_s, d_t, d_v;              // vectors, used during linear solve
+    cl::Buffer d_tmp;                            // used as tmp GPU buffer for dot() and norm()
+    double *tmp;                                 // used as tmp CPU buffer for dot() and norm()
+
+    // shared pointers are also passed to BILU0
+    std::shared_ptr<cl::Context> context;
+    std::shared_ptr<cl::CommandQueue> queue;
+    std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
+    std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
+    std::unique_ptr<cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
+    std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
+    std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > spmv_blocked_k;
+    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply1_k;
+    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply2_k;
+
+    Preconditioner *prec;        // only supported preconditioner is BILU0
+    int *toOrder, *fromOrder;    // BILU0 reorders rows of the matrix via these mappings
+    BlockedMatrix *mat, *rmat;   // normal and reordered matrix
+
+
+    /// Divide A by B, and round up: return (int)ceil(A/B)
+    /// \param[in] A    dividend
+    /// \param[in] B    divisor
+    /// \return         rounded division result
+    unsigned int ceilDivision(const unsigned int A, const unsigned int B);
+
+    /// Calculate dot product between in1 and in2, partial sums are stored in out, which are summed on CPU
+    /// \param[in] in1           input vector 1
+    /// \param[in] in2           input vector 2
+    /// \param[out] out          output vector containing partial sums
+    /// \return                  dot product
+    double dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out);
+
+    /// Calculate the norm of in, partial sums are stored in out, which are summed on the CPU
+    /// Equal to Dune::DenseVector::two_norm()
+    /// \param[in] in          input vector
+    /// \param[out] out        output vector containing partial sums
+    /// \return                norm
+    double norm_w(cl::Buffer in, cl::Buffer out);
+
+    /// Perform axpy: out += a * in
+    /// \param[in] in         input vector
+    /// \param[in] a          scalar value to multiply input vector
+    /// \param[inout] out     output vector
+    void axpy_w(cl::Buffer in, const double a, cl::Buffer out);
+
+    /// Custom function that combines scale, axpy and add functions in bicgstab
+    /// p = (p - omega * v) * beta + r
+    /// \param[inout] p      output vector
+    /// \param[in] v         input vector
+    /// \param[in] r         input vector
+    /// \param[in] omega     scalar value
+    /// \param[in] beta      scalar value
+    void custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, const double omega, const double beta);
+
+    /// Sparse matrix-vector multiply, spmv
+    /// b = A * x
+    /// Matrix A, must be in BCRS format
+    /// \param[in] vals     nnzs of matrix A
+    /// \param[in] cols     columnindices of matrix A
+    /// \param[in] rows     rowpointers of matrix A
+    /// \param[in] x        input vector
+    /// \param[out] b       output vector
+    void spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b);
+
+    /// Solve linear system using ilu0-bicgstab
+    /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
+    /// \param[inout] res         summary of solver result
+    void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
+
+    /// Initialize GPU and allocate memory
+    /// \param[in] N              number of nonzeroes, divide by dim*dim to get number of blocks
+    /// \param[in] nnz            number of nonzeroes, divide by dim*dim to get number of blocks
+    /// \param[in] dim            size of block
+    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
+    /// \param[in] rows           array of rowPointers, contains N/dim+1 values
+    /// \param[in] cols           array of columnIndices, contains nnz values
+    void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
+
+    /// Clean memory
+    void finalize();
+
+    /// Copy linear system to GPU
+    void copy_system_to_gpu();
+
+    /// Reorder the linear system so it corresponds with the coloring
+    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
+    /// \param[in] b              input vectors, contains N values
+    void update_system(double *vals, double *b);
+
+    /// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
+    void update_system_on_gpu();
+
+    /// Analyse sparsity pattern to extract parallelism
+    /// \return true iff analysis was successful
+    bool analyse_matrix();
+
+    /// Perform ilu0-decomposition
+    /// \return true iff decomposition was successful
+    bool create_preconditioner();
+
+    /// Solve linear system
+    /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
+    /// \param[inout] res         summary of solver result
+    void solve_system(WellContributions& wellContribs, BdaResult &res);
+
+public:
+
+    /// Construct a openclSolver
+    /// \param[in] linear_solver_verbosity    verbosity of openclSolver
+    /// \param[in] maxit                      maximum number of iterations for openclSolver
+    /// \param[in] tolerance                  required relative tolerance for openclSolver
+    openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance);
+
+    /// Destroy a openclSolver, and free memory
+    ~openclSolverBackend();
+
+    /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
+    /// \param[in] N              number of rows, divide by dim to get number of blockrows
+    /// \param[in] nnz            number of nonzeroes, divide by dim*dim to get number of blocks
+    /// \param[in] nnz_prec       number of nonzeroes of matrix for ILU0, divide by dim*dim to get number of blocks
+    /// \param[in] dim            size of block
+    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
+    /// \param[in] rows           array of rowPointers, contains N/dim+1 values
+    /// \param[in] cols           array of columnIndices, contains nnz values
+    /// \param[in] vals_prec      array of nonzeroes for preconditioner
+    /// \param[in] rows_prec      array of rowPointers for preconditioner
+    /// \param[in] cols_prec      array of columnIndices for preconditioner
+    /// \param[in] b              input vector, contains N values
+    /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
+    /// \param[inout] res         summary of solver result
+    /// \return                   status code
+    BdaSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
+
+    /// Get result after linear solve, and peform postprocessing if necessary
+    /// \param[inout] x          resulting x vector, caller must guarantee that x points to a valid array
+    void get_result(double *x) override;
+
+}; // end class openclSolverBackend
+
+} // namespace bda
+
+#endif
+
+