openclSolverBackend: template Scalar type

2025-02-25 18:55:30 -06:00 · 2024-04-16 10:29:33 +02:00 · 2024-04-16 10:29:33 +02:00 · 18f42b51b2
commit 18f42b51b2
parent d0773ef4f7
4 changed files with 182 additions and 165 deletions
--- a/opm/simulators/linalg/bda/BdaBridge.cpp
+++ b/opm/simulators/linalg/bda/BdaBridge.cpp
@ -80,7 +80,14 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
    } else if (accelerator_mode.compare("opencl") == 0) {
 #if HAVE_OPENCL
        use_gpu = true;
-        backend.reset(new Opm::Accelerator::openclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_parallel, linsolver));
+        using OCL = Accelerator::openclSolverBackend<double,block_size>;
+        backend = std::make_unique<OCL>(linear_solver_verbosity,
+                                        maxit,
+                                        tolerance,
+                                        platformID,
+                                        deviceID,
+                                        opencl_ilu_parallel,
+                                        linsolver);
 #else
        OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
 #endif
@ -307,11 +314,13 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::
 initWellContributions([[maybe_unused]] WellContributions<double>& wellContribs,
                      [[maybe_unused]] unsigned N)
 {
-    if(accelerator_mode.compare("opencl") == 0){
+    if (accelerator_mode.compare("opencl") == 0) {
 #if HAVE_OPENCL
-        const auto openclBackend = static_cast<const Opm::Accelerator::openclSolverBackend<block_size>*>(backend.get());
-        static_cast<WellContributionsOCL<double>&>(wellContribs).setOpenCLEnv(openclBackend->context.get(),
-                                                                              openclBackend->queue.get());
+        using OCL = Accelerator::openclSolverBackend<double,block_size>;
+        const auto openclBackend = static_cast<const OCL*>(backend.get());
+        using WCOCL = WellContributionsOCL<double>;
+        static_cast<WCOCL&>(wellContribs).setOpenCLEnv(openclBackend->context.get(),
+                                                       openclBackend->queue.get());
 #else
        OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
 #endif
--- a/opm/simulators/linalg/bda/opencl/CPR.hpp
+++ b/opm/simulators/linalg/bda/opencl/CPR.hpp
@ -88,7 +88,7 @@ private:
    unsigned num_pre_smooth_steps;              // number of Jacobi smooth steps before restriction
    unsigned num_post_smooth_steps;             // number of Jacobi smooth steps after prolongation

-    std::unique_ptr<openclSolverBackend<1> > coarse_solver; // coarse solver is scalar
+    std::unique_ptr<openclSolverBackend<Scalar,1>> coarse_solver; // coarse solver is scalar
    bool opencl_ilu_parallel;                   // whether ILU0 operation should be parallelized

    // Analyze the AMG hierarchy build by Dune
--- a/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp
@ -37,19 +37,15 @@
 // otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
 #define COPY_ROW_BY_ROW 0

-namespace Opm
-{
-namespace Accelerator
-{
+namespace Opm::Accelerator {

-using Opm::OpmLog;
 using Dune::Timer;

-template <unsigned int block_size>
-openclSolverBackend<block_size>::
+template<class Scalar, unsigned int block_size>
+openclSolverBackend<Scalar,block_size>::
 openclSolverBackend(int verbosity_,
                    int maxit_,
-                    double tolerance_,
+                    Scalar tolerance_,
                    unsigned int platformID_,
                    unsigned int deviceID_,
                    bool opencl_ilu_parallel_,
@ -59,22 +55,23 @@ openclSolverBackend(int verbosity_,
 {
    bool use_cpr, use_isai;

-    if (linsolver.compare("ilu0") == 0) {
+    if (linsolver == "ilu0") {
        use_cpr = false;
        use_isai = false;
-    } else if (linsolver.compare("cpr_quasiimpes") == 0) {
+    } else if (linsolver == "cpr_quasiimpes") {
        use_cpr = true;
        use_isai = false;
-    } else if (linsolver.compare("isai") == 0) {
+    } else if (linsolver == "isai") {
        use_cpr = false;
        use_isai = true;
-    } else if (linsolver.compare("cpr_trueimpes") == 0) {
-        OPM_THROW(std::logic_error, "Error openclSolver does not support --linerar-solver=cpr_trueimpes");
+    } else if (linsolver == "cpr_trueimpes") {
+        OPM_THROW(std::logic_error, "Error openclSolver does not support "
+                                    "--linear-solver=cpr_trueimpes");
    } else {
        OPM_THROW(std::logic_error, "Error unknown value for argument --linear-solver, " + linsolver);
    }

-    using PreconditionerType = Preconditioner<double,block_size>;
+    using PreconditionerType = Preconditioner<Scalar,block_size>;
    if (use_cpr) {
        prec = PreconditionerType::create(PreconditionerType::Type::CPR,
                                          opencl_ilu_parallel, verbosity);
@ -115,7 +112,7 @@ openclSolverBackend(int verbosity_,
        out.clear();

        if (platforms.size() <= platformID) {
-            OPM_THROW(std::logic_error, "Error chosen too high OpenCL platform ID");
+            OPM_THROW(std::logic_error, "Error: Invalid OpenCL platform ID selected");
        } else {
            std::string platform_info;
            out << "Chosen:\n";
@ -131,7 +128,8 @@ openclSolverBackend(int verbosity_,
        platforms[platformID].getDevices(CL_DEVICE_TYPE_ALL, &devices);

        if (devices.empty()) {
-            OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL devices are found");
+            OPM_THROW(std::logic_error, "Error openclSolver is selected but "
+                                        "no OpenCL devices are found");
        }
        out << "Found " << devices.size() << " OpenCL devices" << "\n";

@ -215,8 +213,7 @@ openclSolverBackend(int verbosity_,
        context = std::make_shared<cl::Context>(devices[0]);
        queue.reset(new cl::CommandQueue(*context, devices[0], 0, &err));

-        OpenclKernels<double>::init(context.get(), queue.get(), devices, verbosity);
-
+        OpenclKernels<Scalar>::init(context.get(), queue.get(), devices, verbosity);
    } catch (const cl::Error& error) {
        std::ostringstream oss;
        oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
@ -229,10 +226,10 @@ openclSolverBackend(int verbosity_,
    }
 }

-template <unsigned int block_size>
-openclSolverBackend<block_size>::
+template<class Scalar, unsigned int block_size>
+openclSolverBackend<Scalar,block_size>::
 openclSolverBackend(int verbosity_, int maxit_,
-                    double tolerance_, bool opencl_ilu_parallel_)
+                    Scalar tolerance_, bool opencl_ilu_parallel_)
    : Base(verbosity_, maxit_, tolerance_)
    , opencl_ilu_parallel(opencl_ilu_parallel_)
 {
@ -240,20 +237,22 @@ openclSolverBackend(int verbosity_, int maxit_,
    // cpr = std::make_unique<CPR<block_size> >(verbosity_, opencl_ilu_parallel, /*use_amg=*/false);
 }

-template <unsigned int block_size>
-void openclSolverBackend<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
+template<class Scalar, unsigned int block_size>
+void openclSolverBackend<Scalar,block_size>::
+setOpencl(std::shared_ptr<cl::Context>& context_,
+          std::shared_ptr<cl::CommandQueue>& queue_)
+{
    context = context_;
    queue = queue_;
 }

-
-template <unsigned int block_size>
-void openclSolverBackend<block_size>::
-gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
+template<class Scalar, unsigned int block_size>
+void openclSolverBackend<Scalar,block_size>::
+gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res)
 {
    float it;
-    double rho, rhop, beta, alpha, omega, tmp1, tmp2;
-    double norm, norm_0;
+    Scalar rho, rhop, beta, alpha, omega, tmp1, tmp2;
+    Scalar norm, norm_0;

    Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);

@ -263,15 +262,15 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)

    // set initial values
    events.resize(5);
-    queue->enqueueFillBuffer(d_p, 0, 0, sizeof(double) * N, nullptr, &events[0]);
-    queue->enqueueFillBuffer(d_v, 0, 0, sizeof(double) * N, nullptr, &events[1]);
+    queue->enqueueFillBuffer(d_p, 0, 0, sizeof(Scalar) * N, nullptr, &events[0]);
+    queue->enqueueFillBuffer(d_v, 0, 0, sizeof(Scalar) * N, nullptr, &events[1]);
    rho = 1.0;
    alpha = 1.0;
    omega = 1.0;

-    queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(double) * N, nullptr, &events[2]);
-    queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(double) * N, nullptr, &events[3]);
-    queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &events[4]);
+    queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(Scalar) * N, nullptr, &events[2]);
+    queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(Scalar) * N, nullptr, &events[3]);
+    queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(Scalar) * N, nullptr, &events[4]);

    cl::WaitForEvents(events);
    events.clear();
@ -280,7 +279,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
        OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueue[Fill|Copy]Buffer error");
    }

-    norm = OpenclKernels<double>::norm(d_r, d_tmp, N);
+    norm = OpenclKernels<Scalar>::norm(d_r, d_tmp, N);
    norm_0 = norm;

    if (verbosity > 1) {
@ -294,11 +293,11 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
    }
    for (it = 0.5; it < maxit; it += 0.5) {
        rhop = rho;
-        rho = OpenclKernels<double>::dot(d_rw, d_r, d_tmp, N);
+        rho = OpenclKernels<Scalar>::dot(d_rw, d_r, d_tmp, N);

        if (it > 1) {
            beta = (rho / rhop) * (alpha / omega);
-            OpenclKernels<double>::custom(d_p, d_v, d_r, omega, beta, N);
+            OpenclKernels<Scalar>::custom(d_p, d_v, d_r, omega, beta, N);
        }
        if (verbosity >= 3) {
            queue->finish();
@ -315,7 +314,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
        }

        // v = A * pw
-        OpenclKernels<double>::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
+        OpenclKernels<Scalar>::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
        if (verbosity >= 3) {
            queue->finish();
            t_spmv.stop();
@ -324,7 +323,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)

        // apply wellContributions
        if (wellContribs.getNumWells() > 0) {
-            static_cast<WellContributionsOCL<double>&>(wellContribs).apply(d_pw, d_v);
+            static_cast<WellContributionsOCL<Scalar>&>(wellContribs).apply(d_pw, d_v);
        }
        if (verbosity >= 3) {
            queue->finish();
@ -332,11 +331,11 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
            t_rest.start();
        }

-        tmp1 = OpenclKernels<double>::dot(d_rw, d_v, d_tmp, N);
+        tmp1 = OpenclKernels<Scalar>::dot(d_rw, d_v, d_tmp, N);
        alpha = rho / tmp1;
-        OpenclKernels<double>::axpy(d_v, -alpha, d_r, N);      // r = r - alpha * v
-        OpenclKernels<double>::axpy(d_pw, alpha, d_x, N);      // x = x + alpha * pw
-        norm = OpenclKernels<double>::norm(d_r, d_tmp, N);
+        OpenclKernels<Scalar>::axpy(d_v, -alpha, d_r, N);      // r = r - alpha * v
+        OpenclKernels<Scalar>::axpy(d_pw, alpha, d_x, N);      // x = x + alpha * pw
+        norm = OpenclKernels<Scalar>::norm(d_r, d_tmp, N);
        if (verbosity >= 3) {
            queue->finish();
            t_rest.stop();
@ -360,8 +359,8 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
        }

        // t = A * s
-        OpenclKernels<double>::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
-        if(verbosity >= 3){
+        OpenclKernels<Scalar>::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
+        if (verbosity >= 3) {
            queue->finish();
            t_spmv.stop();
            t_well.start();
@ -369,7 +368,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)

        // apply wellContributions
        if(wellContribs.getNumWells() > 0){
-            static_cast<WellContributionsOCL<double>&>(wellContribs).apply(d_s, d_t);
+            static_cast<WellContributionsOCL<Scalar>&>(wellContribs).apply(d_s, d_t);
        }
        if (verbosity >= 3) {
            queue->finish();
@ -377,12 +376,12 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
            t_rest.start();
        }

-        tmp1 = OpenclKernels<double>::dot(d_t, d_r, d_tmp, N);
-        tmp2 = OpenclKernels<double>::dot(d_t, d_t, d_tmp, N);
+        tmp1 = OpenclKernels<Scalar>::dot(d_t, d_r, d_tmp, N);
+        tmp2 = OpenclKernels<Scalar>::dot(d_t, d_t, d_tmp, N);
        omega = tmp1 / tmp2;
-        OpenclKernels<double>::axpy(d_s, omega, d_x, N);     // x = x + omega * s
-        OpenclKernels<double>::axpy(d_t, -omega, d_r, N);    // r = r - omega * t
-        norm = OpenclKernels<double>::norm(d_r, d_tmp, N);
+        OpenclKernels<Scalar>::axpy(d_s, omega, d_x, N);     // x = x + omega * s
+        OpenclKernels<Scalar>::axpy(d_t, -omega, d_r, N);    // r = r - omega * t
+        norm = OpenclKernels<Scalar>::norm(d_r, d_tmp, N);
        if (verbosity >= 3) {
            queue->finish();
            t_rest.stop();
@ -399,7 +398,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
        }
    }

-    res.iterations = std::min(it, (float)maxit);
+    res.iterations = std::min(it, static_cast<float>(maxit));
    res.reduction = norm / norm_0;
    res.conv_rate  = static_cast<double>(pow(res.reduction, 1.0 / it));
    res.elapsed = t_total.stop();
@ -407,7 +406,8 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)

    if (verbosity > 0) {
        std::ostringstream out;
-        out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
+        out << "=== converged: " << res.converged << ", conv_rate: "
+            << res.conv_rate << ", time: " << res.elapsed <<
            ", time per iteration: " << res.elapsed / it << ", iterations: " << it;
        OpmLog::info(out.str());
    }
@ -422,10 +422,10 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
    }
 }

-template <unsigned int block_size>
-void openclSolverBackend<block_size>::
-initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
-           std::shared_ptr<BlockedMatrix<double>> jacMatrix)
+template<class Scalar, unsigned int block_size>
+void openclSolverBackend<Scalar,block_size>::
+initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+           std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
 {
    this->Nb = matrix->Nb;
    this->N = Nb * block_size;
@ -456,22 +456,21 @@ initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
        mat = matrix;
        jacMat = jacMatrix;

-        d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
-        d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
-        d_rb = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
-        d_r = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
-        d_rw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
-        d_p = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
-        d_pw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
-        d_s = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
-        d_t = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
-        d_v = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
-        d_tmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
+        d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
+        d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
+        d_rb = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
+        d_r = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
+        d_rw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
+        d_p = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
+        d_pw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
+        d_s = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
+        d_t = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
+        d_v = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
+        d_tmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);

-        d_Avals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * nnz);
+        d_Avals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * nnz);
        d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
        d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
-
    } catch (const cl::Error& error) {
        std::ostringstream oss;
        oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
@ -486,8 +485,10 @@ initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
    initialized = true;
 } // end initialize()

-template <unsigned int block_size>
-void openclSolverBackend<block_size>::copy_system_to_gpu() {
+template<class Scalar, unsigned int block_size>
+void openclSolverBackend<Scalar,block_size>::
+copy_system_to_gpu()
+{
    Timer t;
    events.resize(5);

@ -495,18 +496,25 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
    int sum = 0;
    for (int i = 0; i < Nb; ++i) {
        int size_row = mat->rowPointers[i + 1] - mat->rowPointers[i];
-        memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
+        memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum,
+               size_row * sizeof(Scalar) * block_size * block_size);
        sum += size_row * block_size * block_size;
    }
-    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous.data(), nullptr, &events[0]);
+    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0,
+                                    sizeof(Scalar) * nnz, vals_contiguous.data(),
+                                    nullptr, &events[0]);
 #else
-    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, mat->nnzValues, nullptr, &events[0]);
+    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0,
+                                    sizeof(Scalar) * nnz, mat->nnzValues, nullptr, &events[0]);
 #endif

-    err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, mat->colIndices, nullptr, &events[1]);
-    err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), mat->rowPointers, nullptr, &events[2]);
-    err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, h_b, nullptr, &events[3]);
-    err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[4]);
+    err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0,
+                                     sizeof(int) * nnzb, mat->colIndices, nullptr, &events[1]);
+    err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0,
+                                     sizeof(int) * (Nb + 1), mat->rowPointers, nullptr, &events[2]);
+    err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0,
+                                     sizeof(Scalar) * N, h_b, nullptr, &events[3]);
+    err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(Scalar) * N, nullptr, &events[4]);

    cl::WaitForEvents(events);
    events.clear();
@ -523,8 +531,10 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
 } // end copy_system_to_gpu()

 // don't copy rowpointers and colindices, they stay the same
-template <unsigned int block_size>
-void openclSolverBackend<block_size>::update_system_on_gpu() {
+template<class Scalar, unsigned int block_size>
+void openclSolverBackend<Scalar,block_size>::
+update_system_on_gpu()
+{
    Timer t;
    events.resize(3);

@ -532,16 +542,21 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
    int sum = 0;
    for (int i = 0; i < Nb; ++i) {
        int size_row = mat->rowPointers[i + 1] - mat->rowPointers[i];
-        memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
+        memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum,
+               size_row * sizeof(Scalar) * block_size * block_size);
        sum += size_row * block_size * block_size;
    }
-    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous.data(), nullptr, &events[0]);
+    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0,
+                                    sizeof(Scalar) * nnz, vals_contiguous.data(),
+                                    nullptr, &events[0]);
 #else
-    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, mat->nnzValues, nullptr, &events[0]);
+    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0,
+                                    sizeof(Scalar) * nnz, mat->nnzValues, nullptr, &events[0]);
 #endif

-    err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, h_b, nullptr, &events[1]);
-    err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[2]);
+    err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0,
+                                     sizeof(Scalar) * N, h_b, nullptr, &events[1]);
+    err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(Scalar) * N, nullptr, &events[2]);

    cl::WaitForEvents(events);
    events.clear();
@ -557,9 +572,10 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
    }
 } // end update_system_on_gpu()

-
-template <unsigned int block_size>
-bool openclSolverBackend<block_size>::analyze_matrix() {
+template<class Scalar, unsigned int block_size>
+bool openclSolverBackend<Scalar,block_size>::
+analyze_matrix()
+{
    Timer t;

    bool success;
@ -579,9 +595,10 @@ bool openclSolverBackend<block_size>::analyze_matrix() {
    return success;
 } // end analyze_matrix()

-
-template <unsigned int block_size>
-void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
+template<class Scalar, unsigned int block_size>
+void openclSolverBackend<Scalar,block_size>::
+update_system(Scalar* vals, Scalar* b)
+{
    Timer t;

    mat->nnzValues = vals;
@ -594,9 +611,10 @@ void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
    }
 } // end update_system()

-
-template <unsigned int block_size>
-bool openclSolverBackend<block_size>::create_preconditioner() {
+template<class Scalar, unsigned int block_size>
+bool openclSolverBackend<Scalar,block_size>::
+create_preconditioner()
+{
    Timer t;

    bool result;
@ -613,10 +631,9 @@ bool openclSolverBackend<block_size>::create_preconditioner() {
    return result;
 } // end create_preconditioner()

-
-template <unsigned int block_size>
-void openclSolverBackend<block_size>::
-solve_system(WellContributions<double>& wellContribs, BdaResult& res)
+template<class Scalar, unsigned int block_size>
+void openclSolverBackend<Scalar,block_size>::
+solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res)
 {
    Timer t;

@ -625,7 +642,8 @@ solve_system(WellContributions<double>& wellContribs, BdaResult& res)
        gpu_pbicgstab(wellContribs, res);
    } catch (const cl::Error& error) {
        std::ostringstream oss;
-        oss << "openclSolverBackend::solve_system error: " << error.what() << "(" << error.err() << ")\n";
+        oss << "openclSolverBackend::solve_system error: " << error.what()
+            << "(" << error.err() << ")\n";
        oss << getErrorString(error.err());
        // rethrow exception
        OPM_THROW(std::logic_error, oss.str());
@ -641,14 +659,15 @@ solve_system(WellContributions<double>& wellContribs, BdaResult& res)
    }
 } // end solve_system()

-
 // copy result to host memory
 // caller must be sure that x is a valid array
-template <unsigned int block_size>
-void openclSolverBackend<block_size>::get_result(double *x) {
+template<class Scalar, unsigned int block_size>
+void openclSolverBackend<Scalar,block_size>::
+get_result(Scalar* x)
+{
    Timer t;

-    queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, x);
+    queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(Scalar) * N, x);

    if (verbosity > 2) {
        std::ostringstream out;
@ -657,12 +676,12 @@ void openclSolverBackend<block_size>::get_result(double *x) {
    }
 } // end get_result()

-template <unsigned int block_size>
-SolverStatus openclSolverBackend<block_size>::
-solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
-             double *b,
-             std::shared_ptr<BlockedMatrix<double>> jacMatrix,
-             WellContributions<double>& wellContribs,
+template<class Scalar, unsigned int block_size>
+SolverStatus openclSolverBackend<Scalar,block_size>::
+solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+             Scalar* b,
+             std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
+             WellContributions<Scalar>& wellContribs,
             BdaResult& res)
 {
    if (initialized == false) {
@ -688,21 +707,14 @@ solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
    return SolverStatus::BDA_SOLVER_SUCCESS;
 }

+#define INSTANTIATE_TYPE(T)                  \
+    template class openclSolverBackend<T,1>; \
+    template class openclSolverBackend<T,2>; \
+    template class openclSolverBackend<T,3>; \
+    template class openclSolverBackend<T,4>; \
+    template class openclSolverBackend<T,5>; \
+    template class openclSolverBackend<T,6>;

-#define INSTANTIATE_BDA_FUNCTIONS(n)                                        \
-template openclSolverBackend<n>::openclSolverBackend(                       \
-    int, int, double, unsigned int, unsigned int, bool, std::string); \
-template openclSolverBackend<n>::openclSolverBackend(int, int, double, bool); \
-template void openclSolverBackend<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&);
+INSTANTIATE_TYPE(double)

-INSTANTIATE_BDA_FUNCTIONS(1);
-INSTANTIATE_BDA_FUNCTIONS(2);
-INSTANTIATE_BDA_FUNCTIONS(3);
-INSTANTIATE_BDA_FUNCTIONS(4);
-INSTANTIATE_BDA_FUNCTIONS(5);
-INSTANTIATE_BDA_FUNCTIONS(6);
-
-#undef INSTANTIATE_BDA_FUNCTIONS
-
-} // namespace Accelerator
-} // namespace Opm
+} // namespace Opm::Accelerator
--- a/opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp
@ -27,16 +27,13 @@

 #include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>

-namespace Opm
-{
-namespace Accelerator
-{
+namespace Opm::Accelerator {

 /// This class implements a opencl-based ilu0-bicgstab solver on GPU
-template <unsigned int block_size>
-class openclSolverBackend : public BdaSolver<double,block_size>
+template<class Scalar, unsigned int block_size>
+class openclSolverBackend : public BdaSolver<Scalar,block_size>
 {
-    using Base = BdaSolver<double,block_size>;
+    using Base = BdaSolver<Scalar,block_size>;

    using Base::N;
    using Base::Nb;
@ -50,8 +47,8 @@ class openclSolverBackend : public BdaSolver<double,block_size>
    using Base::initialized;

 private:
-    double *h_b = nullptr;                // b vector, on host
-    std::vector<double> vals_contiguous;  // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
+    Scalar* h_b = nullptr;                // b vector, on host
+    std::vector<Scalar> vals_contiguous;  // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp

    // OpenCL variables must be reusable, they are initialized in initialize()
    cl::Buffer d_Avals, d_Acols, d_Arows;        // matrix in BSR format on GPU
@ -63,12 +60,12 @@ private:

    bool useJacMatrix = false;

-    std::unique_ptr<Preconditioner<double,block_size> > prec;
+    std::unique_ptr<Preconditioner<Scalar,block_size>> prec;
                                                                  // can perform blocked ILU0 and AMG on pressure component
    bool is_root;                                                 // allow for nested solvers, the root solver is called by BdaBridge
    bool analysis_done = false;
-    std::shared_ptr<BlockedMatrix<double>> mat{};                 // original matrix
-    std::shared_ptr<BlockedMatrix<double>> jacMat{};              // matrix for preconditioner
+    std::shared_ptr<BlockedMatrix<Scalar>> mat{};                 // original matrix
+    std::shared_ptr<BlockedMatrix<Scalar>> jacMat{};              // matrix for preconditioner
    bool opencl_ilu_parallel;                                     // parallelize ILU operations (with level_scheduling)
    std::vector<cl::Event> events;
    cl_int err;
@ -76,13 +73,13 @@ private:
    /// Solve linear system using ilu0-bicgstab
    /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
    /// \param[inout] res         summary of solver result
-    void gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res);
+    void gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res);

    /// Initialize GPU and allocate memory
    /// \param[in] matrix     matrix A
    /// \param[in] jacMatrix  matrix for preconditioner
-    void initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
-                    std::shared_ptr<BlockedMatrix<double>> jacMatrix);
+    void initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                    std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);

    /// Copy linear system to GPU
    void copy_system_to_gpu();
@ -90,7 +87,7 @@ private:
    /// Reassign pointers, in case the addresses of the Dune variables have changed
    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
    /// \param[in] b              input vector b, contains N values
-    void update_system(double *vals, double *b);
+    void update_system(Scalar* vals, Scalar* b);

    /// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
    void update_system_on_gpu();
@ -107,11 +104,11 @@ private:
    /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
    ///                           could be empty
    /// \param[inout] res         summary of solver result
-    void solve_system(WellContributions<double>& wellContribs, BdaResult& res);
+    void solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res);

 public:
-    std::shared_ptr<cl::Context> context;
-    std::shared_ptr<cl::CommandQueue> queue;
+    std::shared_ptr<cl::Context> context{};
+    std::shared_ptr<cl::CommandQueue> queue{};

    /// Construct a openclSolver
    /// \param[in] linear_solver_verbosity    verbosity of openclSolver
@ -122,11 +119,13 @@ public:
    /// \param[in] opencl_ilu_parallel        whether to parallelize the ILU decomposition and application in OpenCL with level_scheduling
    /// \param[in] linsolver                  indicating the preconditioner, equal to the --linear-solver cmdline argument
    ///                                       only ilu0, cpr_quasiimpes and isai are supported
-    openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID,
-        bool opencl_ilu_parallel, std::string linsolver);
+    openclSolverBackend(int linear_solver_verbosity, int maxit, Scalar tolerance,
+                        unsigned int platformID, unsigned int deviceID,
+                        bool opencl_ilu_parallel, std::string linsolver);

    /// For the CPR coarse solver
-    openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, bool opencl_ilu_parallel);
+    openclSolverBackend(int linear_solver_verbosity, int maxit,
+                        Scalar tolerance, bool opencl_ilu_parallel);

    /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
    /// \param[in] matrix         matrix A
@ -135,10 +134,10 @@ public:
    /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
    /// \param[inout] res         summary of solver result
    /// \return                   status code
-    SolverStatus solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
-                              double *b,
-                              std::shared_ptr<BlockedMatrix<double>> jacMatrix,
-                              WellContributions<double>& wellContribs,
+    SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                              Scalar* b,
+                              std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
+                              WellContributions<Scalar>& wellContribs,
                              BdaResult& res) override;

    /// Solve scalar linear system, for example a coarse system of an AMG preconditioner
@ -147,19 +146,16 @@ public:

    /// Get result after linear solve, and peform postprocessing if necessary
    /// \param[inout] x          resulting x vector, caller must guarantee that x points to a valid array
-    void get_result(double *x) override;
+    void get_result(Scalar* x) override;

    /// Set OpenCL objects
    /// This class either creates them based on platformID and deviceID or receives them through this function
    /// \param[in] context   the opencl context to be used
    /// \param[in] queue     the opencl queue to be used
-    void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue);
-    
+    void setOpencl(std::shared_ptr<cl::Context>& context,
+                   std::shared_ptr<cl::CommandQueue>& queue);
 }; // end class openclSolverBackend

-} // namespace Accelerator
-} // namespace Opm
+} // namespace Opm::Accelerator

 #endif
-
-