Added block_size template to BdaSolvers and BILU0

2025-02-25 18:55:30 -06:00 · 2020-06-24 20:09:14 +02:00 · 2020-06-24 20:09:14 +02:00 · 98ddf47b44
commit 98ddf47b44
parent 2a48f5f63f
11 changed files with 232 additions and 91 deletions
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@ -150,6 +150,7 @@ list (APPEND PUBLIC_HEADER_FILES
  opm/simulators/linalg/bda/BdaBridge.hpp
  opm/simulators/linalg/bda/BdaResult.hpp
  opm/simulators/linalg/bda/BdaSolver.hpp
+  opm/simulators/linalg/bda/BdaSolverStatus.hpp
  opm/simulators/linalg/bda/BILU0.hpp
  opm/simulators/linalg/bda/BlockedMatrix.hpp
  opm/simulators/linalg/bda/cuda_header.hpp
--- a/opm/simulators/linalg/bda/BILU0.cpp
+++ b/opm/simulators/linalg/bda/BILU0.cpp
@ -35,12 +35,12 @@ namespace bda

    using Opm::OpmLog;

-    BILU0::BILU0(bool level_scheduling_, bool graph_coloring_, int verbosity_) :
-
    // define 'second' as 'BdaSolver<>::second', this allows usage of the second() function for timing
    // typedefs cannot handle templates
    const auto second = BdaSolver<>::second;

+    template <unsigned int block_size>
+    BILU0<block_size>::BILU0(bool level_scheduling_, bool graph_coloring_, int verbosity_) :
        level_scheduling(level_scheduling_), graph_coloring(graph_coloring_), verbosity(verbosity_)
    {
        if (level_scheduling == graph_coloring) {
@ -49,7 +49,8 @@ namespace bda
        double t1 = second();
    }

-    BILU0::~BILU0()
+    template <unsigned int block_size>
+    BILU0<block_size>::~BILU0()
    {
        delete[] invDiagVals;
        delete[] diagIndex;
@ -62,7 +63,8 @@ namespace bda
        freeBlockedMatrix(&rMat);
    }

-    bool BILU0::init(BlockedMatrix *mat, unsigned int block_size)
+    template <unsigned int block_size>
+    bool BILU0<block_size>::init(BlockedMatrix *mat)
    {
        double t1 = 0.0, t2 = 0.0;
        BlockedMatrix *CSCmat = nullptr;
@ -71,7 +73,6 @@ namespace bda
        this->Nb = mat->Nb;
        this->nnz = mat->nnzbs * block_size * block_size;
        this->nnzbs = mat->nnzbs;
-        this->block_size = block_size;

        toOrder = new int[N];
        fromOrder = new int[N];
@ -160,7 +161,8 @@ namespace bda
    } // end init()


-    bool BILU0::create_preconditioner(BlockedMatrix *mat)
+    template <unsigned int block_size>
+    bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
    {
        double t1 = 0.0, t2 = 0.0;
        if (verbosity >= 3){
@ -306,13 +308,13 @@ namespace bda
    // kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
    // however, if individual kernel calls are timed, waiting for events is needed
    // behavior on other GPUs is untested
-    void BILU0::apply(cl::Buffer& x, cl::Buffer& y)
+    template <unsigned int block_size>
+    void BILU0<block_size>::apply(cl::Buffer& x, cl::Buffer& y)
    {
        double t1 = 0.0, t2 = 0.0;
        if (verbosity >= 3) {
            t1 = second();
        }
-        const unsigned int block_size = 3;
        cl::Event event;

        for(unsigned int color = 0; color < numColors; ++color){
@ -334,18 +336,22 @@ namespace bda
    }


-    void BILU0::setOpenCLContext(cl::Context *context){
+    template <unsigned int block_size>
+    void BILU0<block_size>::setOpenCLContext(cl::Context *context){
        this->context = context;
    }
-    void BILU0::setOpenCLQueue(cl::CommandQueue *queue){
+    template <unsigned int block_size>
+    void BILU0<block_size>::setOpenCLQueue(cl::CommandQueue *queue){
        this->queue = queue;
    }
-    void BILU0::setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group){
+    template <unsigned int block_size>
+    void BILU0<block_size>::setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group){
        this->work_group_size = work_group_size;
        this->total_work_items = total_work_items;
        this->lmem_per_work_group = lmem_per_work_group;
    }
-    void BILU0::setKernels(
+    template <unsigned int block_size>
+    void BILU0<block_size>::setKernels(
        cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1_,
        cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2_
    ){
@ -353,6 +359,27 @@ namespace bda
        this->ILU_apply2 = ILU_apply2_;
    }

+
+#define INSTANTIATE_BDA_FUNCTIONS(n)                                                     \
+template BILU0<n>::BILU0(bool, bool, int);                                               \
+template bool BILU0<n>::init(BlockedMatrix*);                                            \
+template bool BILU0<n>::create_preconditioner(BlockedMatrix*);                           \
+template void BILU0<n>::apply(cl::Buffer& x, cl::Buffer& y);                             \
+template void BILU0<n>::setOpenCLContext(cl::Context*);                                  \
+template void BILU0<n>::setOpenCLQueue(cl::CommandQueue*);                               \
+template void BILU0<n>::setKernelParameters(unsigned int, unsigned int, unsigned int);   \
+template void BILU0<n>::setKernels(                                                      \
+    cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
+    cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *  \
+    );
+
+INSTANTIATE_BDA_FUNCTIONS(1);
+INSTANTIATE_BDA_FUNCTIONS(2);
+INSTANTIATE_BDA_FUNCTIONS(3);
+INSTANTIATE_BDA_FUNCTIONS(4);
+
+#undef INSTANTIATE_BDA_FUNCTIONS
+
 } // end namespace bda


--- a/opm/simulators/linalg/bda/BILU0.hpp
+++ b/opm/simulators/linalg/bda/BILU0.hpp
@ -36,6 +36,7 @@ namespace bda

    /// This class implementa a Blocked ILU0 preconditioner
    /// The decomposition is done on CPU, and reorders the rows of the matrix
+    template <unsigned int block_size>
    class BILU0
    {

@ -44,7 +45,6 @@ namespace bda
        int Nb;      // number of blockrows of the matrix
        int nnz;     // number of nonzeroes of the matrix (scalar)
        int nnzbs;   // number of blocks of the matrix
-        unsigned int block_size;
        BlockedMatrix *LMat, *UMat, *LUMat;
        BlockedMatrix *rMat = nullptr; // only used with PAR_SIM
        Block *invDiagVals;
@ -79,7 +79,7 @@ namespace bda
        ~BILU0();

        // analysis
-        bool init(BlockedMatrix *mat, unsigned int block_size);
+        bool init(BlockedMatrix *mat);

        // ilu_decomposition
        bool create_preconditioner(BlockedMatrix *mat);
--- a/opm/simulators/linalg/bda/BdaBridge.cpp
+++ b/opm/simulators/linalg/bda/BdaBridge.cpp
@ -26,6 +26,7 @@
 #include <opm/material/common/Unused.hpp>

 #include <opm/simulators/linalg/bda/BdaBridge.hpp>
+#include <opm/simulators/linalg/bda/BdaSolverStatus.hpp>
 #include <opm/simulators/linalg/bda/BdaResult.hpp>

 #define PRINT_TIMERS_BRIDGE 0
@ -37,6 +38,7 @@ namespace Opm

    using bda::BdaResult;
    using bda::BdaSolver;
+    using bda::BdaSolverStatus;

 template <class BridgeMatrix, class BridgeVector, int block_size>
 BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string gpu_mode, int linear_solver_verbosity, int maxit, double tolerance)
@ -44,14 +46,14 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string gpu_mod
    if (gpu_mode.compare("cusparse") == 0) {
 #if HAVE_CUDA
        use_gpu = true;
-        backend.reset(new bda::cusparseSolverBackend(linear_solver_verbosity, maxit, tolerance));
+        backend.reset(new bda::cusparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance));
 #else
        OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake");
 #endif
    } else if (gpu_mode.compare("opencl") == 0) {
 #if HAVE_OPENCL
        use_gpu = true;
-        backend.reset(new bda::openclSolverBackend(linear_solver_verbosity, maxit, tolerance));
+        backend.reset(new bda::openclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance));
 #else
        OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
 #endif
@ -178,17 +180,17 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
        /////////////////////////
        // actually solve

-        typedef BdaSolver::BdaSolverStatus BdaSolverStatus;
+        typedef BdaSolverStatus::Status Status;
        // assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, cusparseSolver is expected to perform undefined behaviour
-        BdaSolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), wellContribs, result);
+        Status status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), wellContribs, result);
        switch(status) {
-        case BdaSolverStatus::BDA_SOLVER_SUCCESS:
+        case Status::BDA_SOLVER_SUCCESS:
            //OpmLog::info("BdaSolver converged");
            break;
-        case BdaSolverStatus::BDA_SOLVER_ANALYSIS_FAILED:
+        case Status::BDA_SOLVER_ANALYSIS_FAILED:
            OpmLog::warning("BdaSolver could not analyse level information of matrix, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
            break;
-        case BdaSolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED:
+        case Status::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED:
            OpmLog::warning("BdaSolver could not create preconditioner, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
            break;
        default:
@ -238,6 +240,6 @@ INSTANTIATE_BDA_FUNCTIONS(4);

 #undef INSTANTIATE_BDA_FUNCTIONS

-}
+} // namespace Opm


--- a/opm/simulators/linalg/bda/BdaBridge.hpp
+++ b/opm/simulators/linalg/bda/BdaBridge.hpp
@ -47,7 +47,7 @@ template <class BridgeMatrix, class BridgeVector, int block_size>
 class BdaBridge
 {
 private:
-    std::unique_ptr<bda::BdaSolver> backend;
+    std::unique_ptr<bda::BdaSolver<block_size> > backend;
    bool use_gpu = false;

 public:
--- a/opm/simulators/linalg/bda/BdaSolver.hpp
+++ b/opm/simulators/linalg/bda/BdaSolver.hpp
@ -26,15 +26,19 @@
 #include <sys/time.h>

 #include <opm/simulators/linalg/bda/BdaResult.hpp>
+#include <opm/simulators/linalg/bda/BdaSolverStatus.hpp>
 #include <opm/simulators/linalg/bda/WellContributions.hpp>

 namespace bda
 {

    using Opm::WellContributions;
+    typedef BdaSolverStatus::Status Status;

    /// This class serves to simplify choosing between different backend solvers, such as cusparseSolver and openclSolver
    /// This class is abstract, no instantiations can of it can be made, only of its children
+    /// Without a default block_size value, the BILU0 class cannot use BdaSolver::second()
+    template <unsigned int block_size = 3>
    class BdaSolver
    {

@ -55,26 +59,18 @@ namespace bda
        int Nb;          // number of blocked rows (Nb*block_size == N)
        int nnz;         // number of nonzeroes (scalars)
        int nnzb;        // number of nonzero blocks (nnzb*block_size*block_size == nnz)
-        int block_size;  // size of block

        bool initialized = false;

    public:

-        enum class BdaSolverStatus {
-            BDA_SOLVER_SUCCESS,
-            BDA_SOLVER_ANALYSIS_FAILED,
-            BDA_SOLVER_CREATE_PRECONDITIONER_FAILED,
-            BDA_SOLVER_UNKNOWN_ERROR
-        };
-
        BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_) {};

        /// Define virtual destructor, so that the derivedclass destructor will be called
        virtual ~BdaSolver() {};

        /// Define as pure virtual functions, so derivedclass must implement them
-        virtual BdaSolverStatus solve_system(int N, int nnz, int dim,
+        virtual Status solve_system(int N, int nnz, int dim,
            double *vals, int *rows, int *cols,
            double *b, WellContributions& wellContribs, BdaResult &res) = 0;

--- a/opm/simulators/linalg/bda/BdaSolverStatus.hpp
+++ b/opm/simulators/linalg/bda/BdaSolverStatus.hpp
@ -0,0 +1,42 @@
+/*
+  Copyright 2019 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef OPM_BDASOLVERSTATUS_HEADER_INCLUDED
+#define OPM_BDASOLVERSTATUS_HEADER_INCLUDED
+
+namespace bda
+{
+
+    class BdaSolverStatus
+    {
+
+    public:
+
+        enum class Status {
+            BDA_SOLVER_SUCCESS,
+            BDA_SOLVER_ANALYSIS_FAILED,
+            BDA_SOLVER_CREATE_PRECONDITIONER_FAILED,
+            BDA_SOLVER_UNKNOWN_ERROR
+        };
+
+    }; // end class BdaSolverStatus
+
+} // end namespace bda
+
+#endif
--- a/opm/simulators/linalg/bda/cusparseSolverBackend.cu
+++ b/opm/simulators/linalg/bda/cusparseSolverBackend.cu
@ -51,13 +51,17 @@ const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
 const cusparseOperation_t operation  = CUSPARSE_OPERATION_NON_TRANSPOSE;
 const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;

-cusparseSolverBackend::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_) : BdaSolver(verbosity_, maxit_, tolerance_) {}

-cusparseSolverBackend::~cusparseSolverBackend() {
+template <unsigned int block_size>
+cusparseSolverBackend<block_size>::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_) {}
+
+template <unsigned int block_size>
+cusparseSolverBackend<block_size>::~cusparseSolverBackend() {
    finalize();
 }

-void cusparseSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
+template <unsigned int block_size>
+void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
    double t_total1, t_total2;
    int n = N;
    double rho = 1.0, rhop;
@ -188,10 +192,10 @@ void cusparseSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaRe
 }


-void cusparseSolverBackend::initialize(int N, int nnz, int dim) {
+template <unsigned int block_size>
+void cusparseSolverBackend<block_size>::initialize(int N, int nnz, int dim) {
    this->N = N;
    this->nnz = nnz;
-    this->block_size = dim;
    this->nnzb = nnz / block_size / block_size;
    Nb = (N + dim - 1) / dim;
    std::ostringstream out;
@ -250,7 +254,8 @@ void cusparseSolverBackend::initialize(int N, int nnz, int dim) {
    initialized = true;
 } // end initialize()

-void cusparseSolverBackend::finalize() {
+template <unsigned int block_size>
+void cusparseSolverBackend<block_size>::finalize() {
    if (initialized) {
        cudaFree(d_x);
        cudaFree(d_b);
@ -283,7 +288,8 @@ void cusparseSolverBackend::finalize() {
 } // end finalize()


-void cusparseSolverBackend::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b) {
+template <unsigned int block_size>
+void cusparseSolverBackend<block_size>::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b) {

    double t1, t2;
    if (verbosity > 2) {
@ -318,7 +324,8 @@ void cusparseSolverBackend::copy_system_to_gpu(double *vals, int *rows, int *col


 // don't copy rowpointers and colindices, they stay the same
-void cusparseSolverBackend::update_system_on_gpu(double *vals, int *rows, double *b) {
+template <unsigned int block_size>
+void cusparseSolverBackend<block_size>::update_system_on_gpu(double *vals, int *rows, double *b) {

    double t1, t2;
    if (verbosity > 2) {
@ -350,12 +357,14 @@ void cusparseSolverBackend::update_system_on_gpu(double *vals, int *rows, double
 } // end update_system_on_gpu()


-void cusparseSolverBackend::reset_prec_on_gpu() {
+template <unsigned int block_size>
+void cusparseSolverBackend<block_size>::reset_prec_on_gpu() {
    cudaMemcpyAsync(d_mVals, d_bVals, nnz  * sizeof(double), cudaMemcpyDeviceToDevice, stream);
 }


-bool cusparseSolverBackend::analyse_matrix() {
+template <unsigned int block_size>
+bool cusparseSolverBackend<block_size>::analyse_matrix() {

    int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
    double t1, t2;
@ -436,7 +445,8 @@ bool cusparseSolverBackend::analyse_matrix() {
    return true;
 } // end analyse_matrix()

-bool cusparseSolverBackend::create_preconditioner() {
+template <unsigned int block_size>
+bool cusparseSolverBackend<block_size>::create_preconditioner() {

    double t1, t2;
    if (verbosity > 2) {
@ -468,7 +478,8 @@ bool cusparseSolverBackend::create_preconditioner() {
 } // end create_preconditioner()


-void cusparseSolverBackend::solve_system(WellContributions& wellContribs, BdaResult &res) {
+template <unsigned int block_size>
+void cusparseSolverBackend<block_size>::solve_system(WellContributions& wellContribs, BdaResult &res) {
    // actually solve
    gpu_pbicgstab(wellContribs, res);
    cudaStreamSynchronize(stream);
@ -478,7 +489,8 @@ void cusparseSolverBackend::solve_system(WellContributions& wellContribs, BdaRes

 // copy result to host memory
 // caller must be sure that x is a valid array
-void cusparseSolverBackend::get_result(double *x) {
+template <unsigned int block_size>
+void cusparseSolverBackend<block_size>::get_result(double *x) {

    double t1, t2;
    if (verbosity > 2) {
@ -497,9 +509,10 @@ void cusparseSolverBackend::get_result(double *x) {
 } // end get_result()


-typedef BdaSolver::BdaSolverStatus BdaSolverStatus;
+typedef BdaSolverStatus::Status Status;

-BdaSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
+template <unsigned int block_size>
+Status cusparseSolverBackend<block_size>::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
    if (initialized == false) {
        initialize(N, nnz, dim);
        copy_system_to_gpu(vals, rows, cols, b);
@ -508,19 +521,29 @@ BdaSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, dou
    }
    if (analysis_done == false) {
        if (!analyse_matrix()) {
-            return BdaSolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
+            return Status::BDA_SOLVER_ANALYSIS_FAILED;
        }
    }
    reset_prec_on_gpu();
    if (create_preconditioner()) {
        solve_system(wellContribs, res);
    } else {
-        return BdaSolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
+        return Status::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
    }
-    return BdaSolverStatus::BDA_SOLVER_SUCCESS;
+    return Status::BDA_SOLVER_SUCCESS;
 }


-}
+#define INSTANTIATE_BDA_FUNCTIONS(n)                                         \
+template cusparseSolverBackend<n>::cusparseSolverBackend(int, int, double);  \
+
+INSTANTIATE_BDA_FUNCTIONS(1);
+INSTANTIATE_BDA_FUNCTIONS(2);
+INSTANTIATE_BDA_FUNCTIONS(3);
+INSTANTIATE_BDA_FUNCTIONS(4);
+
+#undef INSTANTIATE_BDA_FUNCTIONS
+
+} // namespace bda


--- a/opm/simulators/linalg/bda/cusparseSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/cusparseSolverBackend.hpp
@ -32,7 +32,21 @@ namespace bda
 {

 /// This class implements a cusparse-based ilu0-bicgstab solver on GPU
-class cusparseSolverBackend : public BdaSolver {
+template <unsigned int block_size>
+class cusparseSolverBackend : public BdaSolver<block_size> {
+
+    typedef BdaSolver<block_size> Base;
+
+    using Base::N;
+    using Base::Nb;
+    using Base::nnz;
+    using Base::nnzb;
+    using Base::verbosity;
+    using Base::maxit;
+    using Base::tolerance;
+    using Base::second;
+    using Base::initialized;
+    typedef BdaSolverStatus::Status Status;

 private:

@ -120,7 +134,7 @@ public:
    /// \param[in] wellContribs   contains all WellContributions, to apply them separately, instead of adding them to matrix A
    /// \param[inout] res         summary of solver result
    /// \return                   status code
-    BdaSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
+    Status solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;

    /// Get resulting vector x after linear solve, also includes post processing if necessary
    /// \param[inout] x        resulting x vector, caller must guarantee that x points to a valid array
--- a/opm/simulators/linalg/bda/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp
@ -51,31 +51,28 @@ namespace bda

 using Opm::OpmLog;

-openclSolverBackend::openclSolverBackend(int verbosity_, int maxit_, double tolerance_) : BdaSolver(verbosity_, maxit_, tolerance_) {
+template <unsigned int block_size>
+openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_) {
    prec = new Preconditioner(LEVEL_SCHEDULING, GRAPH_COLORING, verbosity_);
 }


-openclSolverBackend::~openclSolverBackend() {
+template <unsigned int block_size>
+openclSolverBackend<block_size>::~openclSolverBackend() {
    finalize();
 }


 // divide A by B, and round up: return (int)ceil(A/B)
-unsigned int openclSolverBackend::ceilDivision(const unsigned int A, const unsigned int B)
+template <unsigned int block_size>
+unsigned int openclSolverBackend<block_size>::ceilDivision(const unsigned int A, const unsigned int B)
 {
    return A / B + (A % B > 0);
 }

-// just for verifying and debugging
-bool equal(float a, float b)
-{
-    const float tol_abs = 1e-2;
-    const float tol_rel = 1e-2;
-    return std::abs(a - b) <= std::max(tol_rel * std::max(std::abs(a), std::abs(b)), tol_abs);
-}

-double openclSolverBackend::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out)
+template <unsigned int block_size>
+double openclSolverBackend<block_size>::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out)
 {
    double t1 = 0.0, t2 = 0.0;
    const unsigned int work_group_size = 1024;
@ -106,7 +103,8 @@ double openclSolverBackend::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out
    return gpu_sum;
 }

-double openclSolverBackend::norm_w(cl::Buffer in, cl::Buffer out)
+template <unsigned int block_size>
+double openclSolverBackend<block_size>::norm_w(cl::Buffer in, cl::Buffer out)
 {
    double t1 = 0.0, t2 = 0.0;
    const unsigned int work_group_size = 1024;
@ -138,7 +136,8 @@ double openclSolverBackend::norm_w(cl::Buffer in, cl::Buffer out)
    return gpu_norm;
 }

-void openclSolverBackend::axpy_w(cl::Buffer in, const double a, cl::Buffer out)
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::axpy_w(cl::Buffer in, const double a, cl::Buffer out)
 {
    double t1 = 0.0, t2 = 0.0;
    const unsigned int work_group_size = 32;
@ -159,7 +158,8 @@ void openclSolverBackend::axpy_w(cl::Buffer in, const double a, cl::Buffer out)
    }
 }

-void openclSolverBackend::custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, const double omega, const double beta)
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, const double omega, const double beta)
 {
    double t1 = 0.0, t2 = 0.0;
    const unsigned int work_group_size = 32;
@ -180,7 +180,8 @@ void openclSolverBackend::custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, con
    }
 }

-void openclSolverBackend::spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b)
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b)
 {
    double t1 = 0.0, t2 = 0.0;
    const unsigned int work_group_size = 32;
@ -203,7 +204,8 @@ void openclSolverBackend::spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::B
 }


-void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {

    float it;
    double rho, rhop, beta, alpha, omega, tmp1, tmp2;
@ -360,10 +362,10 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResu
 }


-void openclSolverBackend::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) {
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) {
    this->N = N_;
    this->nnz = nnz_;
-    this->block_size = dim;
    this->nnzb = nnz_ / block_size / block_size;

    Nb = (N + dim - 1) / dim;
@ -542,7 +544,9 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, double *vals, in
    initialized = true;
 } // end initialize()

-void openclSolverBackend::finalize() {
+
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::finalize() {
    delete[] rb;
    delete[] tmp;
 #if COPY_ROW_BY_ROW
@ -551,7 +555,8 @@ void openclSolverBackend::finalize() {
 } // end finalize()


-void openclSolverBackend::copy_system_to_gpu() {
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::copy_system_to_gpu() {

    double t1 = 0.0, t2 = 0.0;
    if (verbosity > 2) {
@ -588,7 +593,8 @@ void openclSolverBackend::copy_system_to_gpu() {


 // don't copy rowpointers and colindices, they stay the same
-void openclSolverBackend::update_system_on_gpu() {
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::update_system_on_gpu() {

    double t1 = 0.0, t2 = 0.0;
    if (verbosity > 2) {
@ -622,7 +628,8 @@ void openclSolverBackend::update_system_on_gpu() {
 } // end update_system_on_gpu()


-bool openclSolverBackend::analyse_matrix() {
+template <unsigned int block_size>
+bool openclSolverBackend<block_size>::analyse_matrix() {

    double t1 = 0.0, t2 = 0.0;

@ -630,7 +637,7 @@ bool openclSolverBackend::analyse_matrix() {
        t1 = second();
    }

-    bool success = prec->init(mat, block_size);
+    bool success = prec->init(mat);
    int work_group_size = 32;
    int num_work_groups = ceilDivision(N, work_group_size);
    int total_work_items = num_work_groups * work_group_size;
@ -654,7 +661,8 @@ bool openclSolverBackend::analyse_matrix() {
 } // end analyse_matrix()


-void openclSolverBackend::update_system(double *vals, double *b) {
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
    double t1 = 0.0, t2 = 0.0;
    if (verbosity > 2) {
        t1 = second();
@ -673,7 +681,8 @@ void openclSolverBackend::update_system(double *vals, double *b) {
 } // end update_system()


-bool openclSolverBackend::create_preconditioner() {
+template <unsigned int block_size>
+bool openclSolverBackend<block_size>::create_preconditioner() {

    double t1 = 0.0, t2 = 0.0;
    if (verbosity > 2) {
@ -692,7 +701,8 @@ bool openclSolverBackend::create_preconditioner() {
 } // end create_preconditioner()


-void openclSolverBackend::solve_system(WellContributions& wellContribs, BdaResult &res) {
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::solve_system(WellContributions& wellContribs, BdaResult &res) {
    // actually solve
    double t1 = 0.0, t2 = 0.0;
    if (verbosity > 2) {
@ -713,7 +723,8 @@ void openclSolverBackend::solve_system(WellContributions& wellContribs, BdaResul

 // copy result to host memory
 // caller must be sure that x is a valid array
-void openclSolverBackend::get_result(double *x) {
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::get_result(double *x) {

    double t1 = 0.0, t2 = 0.0;
    if (verbosity > 2) {
@ -732,32 +743,43 @@ void openclSolverBackend::get_result(double *x) {
 } // end get_result()


+typedef BdaSolverStatus::Status Status;

-typedef BdaSolver::BdaSolverStatus BdaSolverStatus;
-
-BdaSolverStatus openclSolverBackend::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
+template <unsigned int block_size>
+Status openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
    if (initialized == false) {
        initialize(N_, nnz_,  dim, vals, rows, cols);
        if (analysis_done == false) {
            if (!analyse_matrix()) {
-                return BdaSolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
+                return Status::BDA_SOLVER_ANALYSIS_FAILED;
            }
        }
        update_system(vals, b);
        if (!create_preconditioner()) {
-            return BdaSolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
+            return Status::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
        }
        copy_system_to_gpu();
    } else {
        update_system(vals, b);
        if (!create_preconditioner()) {
-            return BdaSolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
+            return Status::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
        }
        update_system_on_gpu();
    }
    solve_system(wellContribs, res);
-    return BdaSolverStatus::BDA_SOLVER_SUCCESS;
+    return Status::BDA_SOLVER_SUCCESS;
 }

-}
+
+#define INSTANTIATE_BDA_FUNCTIONS(n)                                     \
+template openclSolverBackend<n>::openclSolverBackend(int, int, double);  \
+
+INSTANTIATE_BDA_FUNCTIONS(1);
+INSTANTIATE_BDA_FUNCTIONS(2);
+INSTANTIATE_BDA_FUNCTIONS(3);
+INSTANTIATE_BDA_FUNCTIONS(4);
+
+#undef INSTANTIATE_BDA_FUNCTIONS
+
+} // namespace bda

--- a/opm/simulators/linalg/bda/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp
@ -31,15 +31,29 @@
 #include <opm/simulators/linalg/bda/WellContributions.hpp>

 #include <opm/simulators/linalg/bda/BILU0.hpp>
-typedef bda::BILU0 Preconditioner;

 namespace bda
 {

 /// This class implements a opencl-based ilu0-bicgstab solver on GPU
-class openclSolverBackend : public BdaSolver
+template <unsigned int block_size>
+class openclSolverBackend : public BdaSolver<block_size>
 {

+    typedef BdaSolver<block_size> Base;
+    typedef BILU0<block_size> Preconditioner;
+
+    using Base::N;
+    using Base::Nb;
+    using Base::nnz;
+    using Base::nnzb;
+    using Base::verbosity;
+    using Base::maxit;
+    using Base::tolerance;
+    using Base::second;
+    using Base::initialized;
+    typedef BdaSolverStatus::Status Status;
+
 private:

    double *rb;                 // reordered b vector, the matrix is reordered, so b must also be
@ -182,7 +196,7 @@ public:
    /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
    /// \param[inout] res         summary of solver result
    /// \return                   status code
-    BdaSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
+    Status solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;

    /// Get result after linear solve, and peform postprocessing if necessary
    /// \param[inout] x          resulting x vector, caller must guarantee that x points to a valid array