Mirror of https://github.com/OPM/opm-simulators.git (synced 2025-02-25 18:55:30 -06:00)
CPR: template Scalar type

parent 1f39e6a9a9
commit 7e1f4629ed
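This commit templates the OpenCL CPR preconditioner on its scalar value type: CPR<block_size> becomes CPR<Scalar,block_size>, the free helpers get_absmax and solve_transposed_3x3 become function templates, hard-coded double buffer sizes and kernel calls are replaced by sizeof(Scalar) and OpenclKernels<Scalar>, dependent type names gain the now-required typename/template keywords, and the per-block-size INSTANTIATE_BDA_FUNCTIONS macro gives way to INSTANCE_TYPE(double), which instantiates CPR<double,1> through CPR<double,6>.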
@@ -34,33 +34,31 @@
 #include <opm/simulators/linalg/bda/opencl/OpenclMatrix.hpp>
 #include <opm/simulators/linalg/bda/opencl/openclKernels.hpp>
 
 namespace Opm::Accelerator {
 
 using Opm::OpmLog;
 using Dune::Timer;
 
-template <unsigned int block_size>
-CPR<block_size>::CPR(bool opencl_ilu_parallel_, int verbosity_)
+template<class Scalar, unsigned int block_size>
+CPR<Scalar,block_size>::CPR(bool opencl_ilu_parallel_, int verbosity_)
     : Base(verbosity_)
     , opencl_ilu_parallel(opencl_ilu_parallel_)
 {
-    bilu0 = std::make_unique<BILU0<double,block_size> >(opencl_ilu_parallel, verbosity_);
+    bilu0 = std::make_unique<BILU0<Scalar,block_size> >(opencl_ilu_parallel, verbosity_);
     diagIndices.resize(1);
 }
 
-template <unsigned int block_size>
-void CPR<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
+template<class Scalar, unsigned int block_size>
+void CPR<Scalar,block_size>::
+setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_)
+{
     context = context_;
     queue = queue_;
 
     bilu0->setOpencl(context, queue);
 }
 
-template <unsigned int block_size>
-bool CPR<block_size>::analyze_matrix(BlockedMatrix<double>* mat_)
+template<class Scalar, unsigned int block_size>
+bool CPR<Scalar,block_size>::analyze_matrix(BlockedMatrix<Scalar>* mat_)
 {
     this->Nb = mat_->Nb;
     this->nnzb = mat_->nnzbs;
@@ -72,9 +70,9 @@ bool CPR<block_size>::analyze_matrix(BlockedMatrix<double>* mat_)
     return success;
 }
 
-template <unsigned int block_size>
-bool CPR<block_size>::analyze_matrix(BlockedMatrix<double>* mat_,
-                                     BlockedMatrix<double>* jacMat)
+template<class Scalar, unsigned int block_size>
+bool CPR<Scalar,block_size>::
+analyze_matrix(BlockedMatrix<Scalar>* mat_, BlockedMatrix<Scalar>* jacMat)
 {
     this->Nb = mat_->Nb;
     this->nnzb = mat_->nnzbs;
@@ -87,10 +85,9 @@ bool CPR<block_size>::analyze_matrix(BlockedMatrix<double>* mat_,
     return success;
 }
 
-template <unsigned int block_size>
-bool CPR<block_size>::
-create_preconditioner(BlockedMatrix<double>* mat_,
-                      BlockedMatrix<double>* jacMat)
+template<class Scalar, unsigned int block_size>
+bool CPR<Scalar,block_size>::
+create_preconditioner(BlockedMatrix<Scalar>* mat_, BlockedMatrix<Scalar>* jacMat)
 {
     Dune::Timer t_bilu0;
     bool result = bilu0->create_preconditioner(mat_, jacMat);
@@ -110,8 +107,9 @@ bool CPR<block_size>::
     return result;
 }
 
-template <unsigned int block_size>
-bool CPR<block_size>::create_preconditioner(BlockedMatrix<double>* mat_)
+template<class Scalar, unsigned int block_size>
+bool CPR<Scalar,block_size>::
+create_preconditioner(BlockedMatrix<Scalar>* mat_)
 {
     Dune::Timer t_bilu0;
     bool result = bilu0->create_preconditioner(mat_);
@@ -131,25 +129,29 @@ bool CPR<block_size>::create_preconditioner(BlockedMatrix<double>* mat_)
     return result;
 }
 
 // return the absolute value of the N elements for which the absolute value is highest
-double get_absmax(const double *data, const int N) {
-    return std::abs(*std::max_element(data, data + N, [](double a, double b){return std::fabs(a) < std::fabs(b);}));
+template<class Scalar>
+Scalar get_absmax(const Scalar* data, const int N)
+{
+    return std::abs(*std::max_element(data, data + N,
+                                      [](Scalar a, Scalar b)
+                                      { return std::fabs(a) < std::fabs(b); }));
 }
 
 // solve A^T * x = b
-void solve_transposed_3x3(const double *A, const double *b, double *x) {
+template<class Scalar>
+void solve_transposed_3x3(const Scalar* A, const Scalar* b, Scalar* x)
+{
     const int B = 3;
     // from dune-common/densematrix.hh, but transposed, so replace [r*B+c] with [r+c*B]
-    double t4 = A[0+0*B] * A[1+1*B];
-    double t6 = A[0+0*B] * A[1+2*B];
-    double t8 = A[0+1*B] * A[1+0*B];
-    double t10 = A[0+2*B] * A[1+0*B];
-    double t12 = A[0+1*B] * A[2+0*B];
-    double t14 = A[0+2*B] * A[2+0*B];
+    Scalar t4 = A[0+0*B] * A[1+1*B];
+    Scalar t6 = A[0+0*B] * A[1+2*B];
+    Scalar t8 = A[0+1*B] * A[1+0*B];
+    Scalar t10 = A[0+2*B] * A[1+0*B];
+    Scalar t12 = A[0+1*B] * A[2+0*B];
+    Scalar t14 = A[0+2*B] * A[2+0*B];
 
-    double d = (t4*A[2+2*B]-t6*A[2+1*B]-t8*A[2+2*B]+
+    Scalar d = (t4*A[2+2*B]-t6*A[2+1*B]-t8*A[2+2*B]+
                 t10*A[2+1*B]+t12*A[1+2*B]-t14*A[1+1*B]); // determinant
 
     x[0] = (b[0]*A[1+1*B]*A[2+2*B] - b[0]*A[2+1*B]*A[1+2*B]
@@ -165,44 +167,49 @@ void solve_transposed_3x3(const double *A, const double *b, double *x) {
             + A[2+0*B] *A[0+1*B]*b[1] - A[2+0*B]*A[1+1*B]*b[0]) / d;
 }
 
-template <unsigned int block_size>
-void CPR<block_size>::init_opencl_buffers() {
+template<class Scalar, unsigned int block_size>
+void CPR<Scalar, block_size>::init_opencl_buffers()
+{
     d_Amatrices.reserve(num_levels);
     d_Rmatrices.reserve(num_levels - 1);
     d_invDiags.reserve(num_levels - 1);
-    for (Matrix<double>& m : Amatrices) {
+    for (Matrix<Scalar>& m : Amatrices) {
         d_Amatrices.emplace_back(context.get(), m.N, m.M, m.nnzs, 1);
     }
-    for (Matrix<double>& m : Rmatrices) {
+    for (Matrix<Scalar>& m : Rmatrices) {
         d_Rmatrices.emplace_back(context.get(), m.N, m.M, m.nnzs, 1);
-        d_f.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.N);
-        d_u.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.N);
+        d_f.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.N);
+        d_u.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.N);
 
         d_PcolIndices.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(int) * m.M);
-        d_invDiags.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.M); // create a cl::Buffer
-        d_t.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.M);
+        d_invDiags.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.M); // create a cl::Buffer
+        d_t.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.M);
     }
-    d_weights = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
-    d_rs = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
-    d_mat = std::make_unique<OpenclMatrix<double>>(context.get(), Nb, Nb, nnzb, block_size);
-    d_coarse_y = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * Nb);
-    d_coarse_x = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * Nb);
+    d_weights = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
+    d_rs = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
+    d_mat = std::make_unique<OpenclMatrix<Scalar>>(context.get(), Nb, Nb, nnzb, block_size);
+    d_coarse_y = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * Nb);
+    d_coarse_x = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * Nb);
 }
 
-template <unsigned int block_size>
-void CPR<block_size>::opencl_upload() {
+template<class Scalar, unsigned int block_size>
+void CPR<Scalar,block_size>::opencl_upload()
+{
     d_mat->upload(queue.get(), mat);
 
     err = CL_SUCCESS;
     events.resize(2 * Rmatrices.size() + 1);
-    err |= queue->enqueueWriteBuffer(*d_weights, CL_FALSE, 0, sizeof(double) * N, weights.data(), nullptr, &events[0]);
+    err |= queue->enqueueWriteBuffer(*d_weights, CL_FALSE, 0,
+                                     sizeof(Scalar) * N, weights.data(), nullptr, &events[0]);
     for (unsigned int i = 0; i < Rmatrices.size(); ++i) {
         d_Amatrices[i].upload(queue.get(), &Amatrices[i]);
 
-        err |= queue->enqueueWriteBuffer(d_invDiags[i], CL_FALSE, 0, sizeof(double) * Amatrices[i].N, invDiags[i].data(), nullptr, &events[2*i+1]);
-        err |= queue->enqueueWriteBuffer(d_PcolIndices[i], CL_FALSE, 0, sizeof(int) * Amatrices[i].N, PcolIndices[i].data(), nullptr, &events[2*i+2]);
+        err |= queue->enqueueWriteBuffer(d_invDiags[i], CL_FALSE, 0,
                                         sizeof(Scalar) * Amatrices[i].N, invDiags[i].data(),
+                                         nullptr, &events[2*i+1]);
+        err |= queue->enqueueWriteBuffer(d_PcolIndices[i], CL_FALSE, 0,
+                                         sizeof(int) * Amatrices[i].N, PcolIndices[i].data(),
+                                         nullptr, &events[2*i+2]);
    }
     cl::WaitForEvents(events);
     events.clear();
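For reference, the get_absmax helper templated above can be exercised host-side without any OpenCL setup. A minimal sketch — the main() harness and sample values are invented for illustration, while the helper body is copied from the hunk:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    // Copied from the hunk above: largest absolute value among the first N entries.
    template<class Scalar>
    Scalar get_absmax(const Scalar* data, const int N)
    {
        return std::abs(*std::max_element(data, data + N,
                                          [](Scalar a, Scalar b)
                                          { return std::fabs(a) < std::fabs(b); }));
    }

    int main()
    {
        // Hypothetical weight row, as solve_transposed_3x3() might produce.
        double w[3] = {0.5, -2.0, 1.25};

        // Normalize by the largest magnitude, as create_preconditioner_amg() does.
        const double abs_max = get_absmax(w, 3);
        for (double& v : w) { v /= abs_max; }

        std::printf("%g %g %g\n", w[0], w[1], w[2]); // prints: 0.25 -1 0.625
        return 0;
    }

The same source now compiles unchanged for any Scalar the rest of the stack instantiates.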
@@ -215,8 +222,9 @@ void CPR<block_size>::opencl_upload() {
     }
 }
 
-template <unsigned int block_size>
-void CPR<block_size>::create_preconditioner_amg(BlockedMatrix<double>* mat_)
+template<class Scalar, unsigned int block_size>
+void CPR<Scalar,block_size>::
+create_preconditioner_amg(BlockedMatrix<Scalar>* mat_)
 {
     this->mat = mat_;
 
@@ -226,7 +234,7 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix<double>* mat_)
     weights.resize(N);
 
     try {
-        double rhs[] = {0, 0, 0};
+        Scalar rhs[] = {0, 0, 0};
         rhs[pressure_idx] = 1;
 
         // find diagonal index for each row
@@ -244,11 +252,11 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix<double>* mat_)
         // calculate weights for each row
         for (int row = 0; row < Nb; ++row) {
             // solve to find weights
-            double *row_weights = weights.data() + block_size * row; // weights for this row
+            Scalar* row_weights = weights.data() + block_size * row; // weights for this row
             solve_transposed_3x3(mat->nnzValues + block_size * block_size * diagIndices[0][row], rhs, row_weights);
 
             // normalize weights for this row
-            double abs_max = get_absmax(row_weights, block_size);
+            Scalar abs_max = get_absmax(row_weights, block_size);
             for (unsigned int i = 0; i < block_size; i++) {
                 row_weights[i] /= abs_max;
             }
@@ -260,9 +268,9 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix<double>* mat_)
             int start = mat->rowPointers[row];
             int end = mat->rowPointers[row + 1];
             for (int idx = start; idx < end; ++idx) {
-                double *block = mat->nnzValues + idx * block_size * block_size;
-                double *row_weights = weights.data() + block_size * row;
-                double value = 0.0;
+                Scalar* block = mat->nnzValues + idx * block_size * block_size;
+                Scalar* row_weights = weights.data() + block_size * row;
+                Scalar value = 0.0;
                 for (unsigned int i = 0; i < block_size; ++i) {
                     value += block[block_size * i + pressure_idx] * row_weights[i];
                 }
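In the two hunks above, the quasi-IMPES weight construction runs entirely in the Scalar type. Per row r with diagonal block D_r it computes

    w_r = (D_r^T)^{-1} e_p,    w_r <- w_r / max_i |(w_r)_i|,

via solve_transposed_3x3 and get_absmax (e_p is the unit vector selecting pressure_idx), and each entry of the scalar pressure matrix is then the weighted pressure column of the corresponding block, value = sum_i block[i][pressure_idx] * w_r[i], exactly as the inner loop shows.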
@@ -279,7 +287,7 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix<double>* mat_)
         if (recalculate_aggregates) {
             dune_coarse = std::make_unique<DuneMat>(Nb, Nb, nnzb, DuneMat::row_wise);
 
-            typedef DuneMat::CreateIterator Iter;
+            using Iter = typename DuneMat::CreateIterator;
 
             // setup sparsity pattern
             for (Iter row = dune_coarse->createbegin(); row != dune_coarse->createend(); ++row) {
@@ -305,7 +313,7 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix<double>* mat_)
         Dune::Amg::SequentialInformation seqinfo;
         dune_amg = std::make_unique<DuneAmg>(dune_op, Dune::stackobject_to_shared_ptr(seqinfo));
 
-        Opm::PropertyTree property_tree;
+        PropertyTree property_tree;
         property_tree.put("alpha", 0.333333333333);
 
         // The matrix has a symmetric sparsity pattern, but the values are not symmetric
@@ -318,7 +326,7 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix<double>* mat_)
         num_pre_smooth_steps = c.getNoPreSmoothSteps();
         num_post_smooth_steps = c.getNoPostSmoothSteps();
 
-        dune_amg->build<OverlapFlags>(c);
+        dune_amg->template build<OverlapFlags>(c);
 
         analyzeHierarchy();
         analyzeAggregateMaps();
@@ -354,10 +362,10 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix<double>* mat_)
     }
 }
 
-template <unsigned int block_size>
-void CPR<block_size>::analyzeHierarchy() {
-    const DuneAmg::ParallelMatrixHierarchy& matrixHierarchy = dune_amg->matrices();
+template<class Scalar, unsigned int block_size>
+void CPR<Scalar,block_size>::analyzeHierarchy()
+{
+    const typename DuneAmg::ParallelMatrixHierarchy& matrixHierarchy = dune_amg->matrices();
 
     // store coarsest AMG level in umfpack format, also performs LU decomposition
     umfpack.setMatrix((*matrixHierarchy.coarsest()).getmat());
@@ -375,7 +383,7 @@ void CPR<block_size>::analyzeHierarchy() {
 
     // matrixIter.dereference() returns MatrixAdapter
     // matrixIter.dereference().getmat() returns BCRSMat
-    DuneAmg::ParallelMatrixHierarchy::ConstIterator matrixIter = matrixHierarchy.finest();
+    typename DuneAmg::ParallelMatrixHierarchy::ConstIterator matrixIter = matrixHierarchy.finest();
     for (int level = 0; level < num_levels; ++matrixIter, ++level) {
         const auto& A = matrixIter.dereference().getmat();
         level_sizes[level] = A.N();
@@ -398,28 +406,28 @@ void CPR<block_size>::analyzeHierarchy() {
             }
         }
 
-        Opm::BdaBridge<DuneMat, DuneVec, 1>::copySparsityPatternFromISTL(A, Amatrices.back().rowPointers, Amatrices.back().colIndices);
+        BdaBridge<DuneMat, DuneVec, 1>::copySparsityPatternFromISTL(A, Amatrices.back().rowPointers,
+                                                                    Amatrices.back().colIndices);
 
         // compute inverse diagonal values for current level
         invDiags.emplace_back(A.N());
         for (unsigned int row = 0; row < A.N(); ++row) {
-            invDiags.back()[row] = 1 / Amatrices.back().nnzValues[diagIndices[level][row]];
+            invDiags.back()[row] = 1.0 / Amatrices.back().nnzValues[diagIndices[level][row]];
        }
     }
 }
 
-template <unsigned int block_size>
-void CPR<block_size>::analyzeAggregateMaps() {
-
+template<class Scalar, unsigned int block_size>
+void CPR<Scalar,block_size>::analyzeAggregateMaps()
+{
     PcolIndices.resize(num_levels - 1);
     Rmatrices.clear();
 
-    const DuneAmg::AggregatesMapList& aggregatesMaps = dune_amg->aggregatesMaps();
+    const typename DuneAmg::AggregatesMapList& aggregatesMaps = dune_amg->aggregatesMaps();
 
-    DuneAmg::AggregatesMapList::const_iterator mapIter = aggregatesMaps.begin();
+    typename DuneAmg::AggregatesMapList::const_iterator mapIter = aggregatesMaps.begin();
     for (int level = 0; level < num_levels - 1; ++mapIter, ++level) {
-        DuneAmg::AggregatesMap *map = *mapIter;
+        typename DuneAmg::AggregatesMap* map = *mapIter;
 
         Rmatrices.emplace_back(level_sizes[level+1], level_sizes[level], level_sizes[level]);
         std::fill(Rmatrices.back().nnzValues.begin(), Rmatrices.back().nnzValues.end(), 1.0);
@@ -428,7 +436,7 @@ void CPR<block_size>::analyzeAggregateMaps() {
         std::vector<std::vector<unsigned>> indicesR(level_sizes[level+1]);
         PcolIndices[level].resize(level_sizes[level]);
 
-        using AggregateIterator = DuneAmg::AggregatesMap::const_iterator;
+        using AggregateIterator = typename DuneAmg::AggregatesMap::const_iterator;
         for (AggregateIterator ai = map->begin(); ai != map->end(); ++ai) {
             if (*ai != DuneAmg::AggregatesMap::ISOLATED) {
                 const long int diff = ai - map->begin();
@@ -449,20 +457,20 @@ void CPR<block_size>::analyzeAggregateMaps() {
         }
     }
 }
 
-template <unsigned int block_size>
-void CPR<block_size>::amg_cycle_gpu(const int level, cl::Buffer& y, cl::Buffer& x)
+template<class Scalar, unsigned int block_size>
+void CPR<Scalar,block_size>::amg_cycle_gpu(const int level, cl::Buffer& y, cl::Buffer& x)
 {
-    OpenclMatrix<double>* A = &d_Amatrices[level];
-    OpenclMatrix<double>* R = &d_Rmatrices[level];
+    OpenclMatrix<Scalar>* A = &d_Amatrices[level];
+    OpenclMatrix<Scalar>* R = &d_Rmatrices[level];
     int Ncur = A->Nb;
 
     if (level == num_levels - 1) {
         // solve coarsest level
-        std::vector<double> h_y(Ncur), h_x(Ncur, 0);
+        std::vector<Scalar> h_y(Ncur), h_x(Ncur, 0);
 
         events.resize(1);
-        err = queue->enqueueReadBuffer(y, CL_FALSE, 0, sizeof(double) * Ncur, h_y.data(), nullptr, &events[0]);
+        err = queue->enqueueReadBuffer(y, CL_FALSE, 0,
+                                       sizeof(Scalar) * Ncur, h_y.data(), nullptr, &events[0]);
         cl::WaitForEvents(events);
         events.clear();
         if (err != CL_SUCCESS) {
@@ -474,7 +482,8 @@ void CPR<block_size>::amg_cycle_gpu(const int level, cl::Buffer& y, cl::Buffer&
         umfpack.apply(h_x.data(), h_y.data());
 
         events.resize(1);
-        err = queue->enqueueWriteBuffer(x, CL_FALSE, 0, sizeof(double) * Ncur, h_x.data(), nullptr, &events[0]);
+        err = queue->enqueueWriteBuffer(x, CL_FALSE, 0,
+                                        sizeof(Scalar) * Ncur, h_x.data(), nullptr, &events[0]);
         cl::WaitForEvents(events);
         events.clear();
         if (err != CL_SUCCESS) {
@@ -490,35 +499,37 @@ void CPR<block_size>::amg_cycle_gpu(const int level, cl::Buffer& y, cl::Buffer&
     cl::Buffer& u = d_u[level]; // u was 0-initialized earlier
 
     // presmooth
-    double jacobi_damping = 0.65; // default value in amgcl: 0.72
+    Scalar jacobi_damping = 0.65; // default value in amgcl: 0.72
     for (unsigned i = 0; i < num_pre_smooth_steps; ++i) {
-        OpenclKernels<double>::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
-        OpenclKernels<double>::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
+        OpenclKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
+        OpenclKernels<Scalar>::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
     }
 
     // move to coarser level
-    OpenclKernels<double>::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
-    OpenclKernels<double>::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true);
+    OpenclKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
+    OpenclKernels<Scalar>::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true);
     amg_cycle_gpu(level + 1, f, u);
-    OpenclKernels<double>::prolongate_vector(u, x, d_PcolIndices[level], Ncur);
+    OpenclKernels<Scalar>::prolongate_vector(u, x, d_PcolIndices[level], Ncur);
 
     // postsmooth
     for (unsigned i = 0; i < num_post_smooth_steps; ++i) {
-        OpenclKernels<double>::residual(A->nnzValues, A->colIndices, A->rowPointers,
+        OpenclKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers,
                                         x, y, t, Ncur, 1);
-        OpenclKernels<double>::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
+        OpenclKernels<Scalar>::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
     }
 }
 
 // x = prec(y)
-template <unsigned int block_size>
-void CPR<block_size>::apply_amg(const cl::Buffer& y, cl::Buffer& x) {
+template<class Scalar, unsigned int block_size>
+void CPR<Scalar,block_size>::apply_amg(const cl::Buffer& y, cl::Buffer& x)
+{
     // 0-initialize u and x vectors
     events.resize(d_u.size() + 1);
-    err = queue->enqueueFillBuffer(*d_coarse_x, 0, 0, sizeof(double) * Nb, nullptr, &events[0]);
+    err = queue->enqueueFillBuffer(*d_coarse_x, 0, 0,
+                                   sizeof(Scalar) * Nb, nullptr, &events[0]);
     for (unsigned int i = 0; i < d_u.size(); ++i) {
-        err |= queue->enqueueFillBuffer(d_u[i], 0, 0, sizeof(double) * Rmatrices[i].N, nullptr, &events[i + 1]);
+        err |= queue->enqueueFillBuffer(d_u[i], 0, 0,
+                                        sizeof(Scalar) * Rmatrices[i].N, nullptr, &events[i + 1]);
     }
     cl::WaitForEvents(events);
     events.clear();
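The hunk above is the core of the templated AMG V-cycle: damped-Jacobi pre- and post-smoothing around a recursive coarse-grid correction, with the smoother expressed as a residual kernel followed by vmul. Below is a self-contained host-side sketch of one such smoothing sweep; the CSR storage, toy system, and all names are illustrative, and it assumes vmul performs x += damping * invDiag .* t, which matches its use here:

    #include <cstdio>
    #include <vector>

    // One damped-Jacobi sweep: t = y - A*x (the "residual" kernel),
    // then x += damping * invDiag .* t (the "vmul" kernel).
    template<class Scalar>
    void jacobi_sweep(const std::vector<Scalar>& vals,
                      const std::vector<int>& cols,
                      const std::vector<int>& rows,
                      const std::vector<Scalar>& invDiag,
                      const std::vector<Scalar>& y,
                      std::vector<Scalar>& x,
                      Scalar damping)
    {
        const int N = static_cast<int>(invDiag.size());
        std::vector<Scalar> t(N);
        for (int r = 0; r < N; ++r) {      // t = y - A*x
            Scalar sum = 0;
            for (int j = rows[r]; j < rows[r + 1]; ++j)
                sum += vals[j] * x[cols[j]];
            t[r] = y[r] - sum;
        }
        for (int r = 0; r < N; ++r)        // damped diagonal correction
            x[r] += damping * invDiag[r] * t[r];
    }

    int main()
    {
        // Toy 2x2 diagonally dominant system: A = [[4,1],[1,3]], y = (1,2).
        std::vector<double> vals{4, 1, 1, 3};
        std::vector<int>    cols{0, 1, 0, 1};
        std::vector<int>    rows{0, 2, 4};
        std::vector<double> invDiag{0.25, 1.0 / 3.0};
        std::vector<double> y{1, 2}, x{0, 0};

        for (int i = 0; i < 50; ++i)
            jacobi_sweep(vals, cols, rows, invDiag, y, x, 0.65); // same damping as the diff
        std::printf("x = (%f, %f)\n", x[0], x[1]); // approaches (1/11, 7/11)
        return 0;
    }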
@@ -527,17 +538,18 @@ void CPR<block_size>::apply_amg(const cl::Buffer& y, cl::Buffer& x) {
         OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
     }
 
-    OpenclKernels<double>::residual(d_mat->nnzValues, d_mat->colIndices,
+    OpenclKernels<Scalar>::residual(d_mat->nnzValues, d_mat->colIndices,
                                     d_mat->rowPointers, x, y, *d_rs, Nb, block_size);
-    OpenclKernels<double>::full_to_pressure_restriction(*d_rs, *d_weights, *d_coarse_y, Nb);
+    OpenclKernels<Scalar>::full_to_pressure_restriction(*d_rs, *d_weights, *d_coarse_y, Nb);
 
     amg_cycle_gpu(0, *d_coarse_y, *d_coarse_x);
 
-    OpenclKernels<double>::add_coarse_pressure_correction(*d_coarse_x, x, pressure_idx, Nb);
+    OpenclKernels<Scalar>::add_coarse_pressure_correction(*d_coarse_x, x, pressure_idx, Nb);
 }
 
-template <unsigned int block_size>
-void CPR<block_size>::apply(const cl::Buffer& y, cl::Buffer& x) {
+template<class Scalar, unsigned int block_size>
+void CPR<Scalar,block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
+{
     Dune::Timer t_bilu0;
     bilu0->apply(y, x);
     if (verbosity >= 4) {
@@ -555,19 +567,14 @@ void CPR<block_size>::apply(const cl::Buffer& y, cl::Buffer& x) {
     }
 }
 
-#define INSTANTIATE_BDA_FUNCTIONS(n) \
-template class CPR<n>;
-
-INSTANTIATE_BDA_FUNCTIONS(1);
-INSTANTIATE_BDA_FUNCTIONS(2);
-INSTANTIATE_BDA_FUNCTIONS(3);
-INSTANTIATE_BDA_FUNCTIONS(4);
-INSTANTIATE_BDA_FUNCTIONS(5);
-INSTANTIATE_BDA_FUNCTIONS(6);
-
-#undef INSTANTIATE_BDA_FUNCTIONS
+#define INSTANCE_TYPE(T)     \
+    template class CPR<T,1>; \
+    template class CPR<T,2>; \
+    template class CPR<T,3>; \
+    template class CPR<T,4>; \
+    template class CPR<T,5>; \
+    template class CPR<T,6>;
+
+INSTANCE_TYPE(double)
 
 } // namespace Opm::Accelerator
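The new INSTANCE_TYPE macro groups the explicit instantiations by scalar type rather than by block size, so INSTANCE_TYPE(double) expands to template class CPR<double,1>; through template class CPR<double,6>;. Adding another precision would then be a single extra line — hypothetical here, since it only links once the rest of the BDA stack (BILU0<float,n>, OpenclKernels<float>, ...) is also instantiated for float:

    // Hypothetical single-precision instantiation alongside the double one:
    INSTANCE_TYPE(float)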
@@ -38,10 +38,10 @@ namespace Opm::Accelerator {
 template<class Scalar> class BlockedMatrix;
 
 /// This class implements a Constrained Pressure Residual (CPR) preconditioner
-template <unsigned int block_size>
-class CPR : public Preconditioner<double,block_size>
+template<class Scalar, unsigned int block_size>
+class CPR : public Preconditioner<Scalar,block_size>
 {
-    using Base = Preconditioner<double,block_size>;
+    using Base = Preconditioner<Scalar,block_size>;
 
     using Base::N;
     using Base::Nb;
@@ -55,25 +55,25 @@ class CPR : public Preconditioner<double,block_size>
 
 private:
     int num_levels;
-    std::vector<double> weights, coarse_vals, coarse_x, coarse_y;
-    std::vector<Matrix<double>> Amatrices, Rmatrices; // scalar matrices that represent the AMG hierarchy
-    std::vector<OpenclMatrix<double>> d_Amatrices, d_Rmatrices; // scalar matrices that represent the AMG hierarchy
+    std::vector<Scalar> weights, coarse_vals, coarse_x, coarse_y;
+    std::vector<Matrix<Scalar>> Amatrices, Rmatrices; // scalar matrices that represent the AMG hierarchy
+    std::vector<OpenclMatrix<Scalar>> d_Amatrices, d_Rmatrices; // scalar matrices that represent the AMG hierarchy
     std::vector<std::vector<int> > PcolIndices; // prolongation does not need a full matrix, only store colIndices
     std::vector<cl::Buffer> d_PcolIndices;
-    std::vector<std::vector<double> > invDiags; // inverse of diagonal of Amatrices
+    std::vector<std::vector<Scalar>> invDiags; // inverse of diagonal of Amatrices
     std::vector<cl::Buffer> d_invDiags;
     std::vector<cl::Buffer> d_t, d_f, d_u; // intermediate vectors used during amg cycle
     std::unique_ptr<cl::Buffer> d_rs; // use before extracting the pressure
     std::unique_ptr<cl::Buffer> d_weights; // the quasiimpes weights, used to extract pressure
-    std::unique_ptr<OpenclMatrix<double>> d_mat; // stores blocked matrix
+    std::unique_ptr<OpenclMatrix<Scalar>> d_mat; // stores blocked matrix
     std::unique_ptr<cl::Buffer> d_coarse_y, d_coarse_x; // stores the scalar vectors
     std::once_flag opencl_buffers_allocated; // only allocate OpenCL Buffers once
 
-    std::unique_ptr<BILU0<double,block_size>> bilu0; // Blocked ILU0 preconditioner
-    BlockedMatrix<double>* mat = nullptr; // input matrix, blocked
+    std::unique_ptr<BILU0<Scalar,block_size>> bilu0; // Blocked ILU0 preconditioner
+    BlockedMatrix<Scalar>* mat = nullptr; // input matrix, blocked
 
-    using DuneMat = Dune::BCRSMatrix<Dune::FieldMatrix<double, 1, 1> >;
-    using DuneVec = Dune::BlockVector<Dune::FieldVector<double, 1> >;
+    using DuneMat = Dune::BCRSMatrix<Dune::FieldMatrix<Scalar, 1, 1> >;
+    using DuneVec = Dune::BlockVector<Dune::FieldVector<Scalar, 1> >;
     using MatrixOperator = Dune::MatrixAdapter<DuneMat, DuneVec, DuneVec>;
     using DuneAmg = Dune::Amg::MatrixHierarchy<MatrixOperator, Dune::Amg::SequentialInformation>;
     std::unique_ptr<DuneAmg> dune_amg;
@@ -109,31 +109,33 @@ private:
 
     void amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &x);
 
-    void create_preconditioner_amg(BlockedMatrix<double>* mat);
+    void create_preconditioner_amg(BlockedMatrix<Scalar>* mat);
 
 public:
     CPR(bool opencl_ilu_parallel, int verbosity);
 
-    bool analyze_matrix(BlockedMatrix<double>* mat) override;
-    bool analyze_matrix(BlockedMatrix<double>* mat,
-                        BlockedMatrix<double>* jacMat) override;
+    bool analyze_matrix(BlockedMatrix<Scalar>* mat) override;
+    bool analyze_matrix(BlockedMatrix<Scalar>* mat,
+                        BlockedMatrix<Scalar>* jacMat) override;
 
     // set own Opencl variables, but also that of the bilu0 preconditioner
-    void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue) override;
+    void setOpencl(std::shared_ptr<cl::Context>& context,
+                   std::shared_ptr<cl::CommandQueue>& queue) override;
 
     // applies blocked ilu0
     // also applies amg for pressure component
     void apply(const cl::Buffer& y, cl::Buffer& x) override;
 
-    bool create_preconditioner(BlockedMatrix<double>* mat) override;
-    bool create_preconditioner(BlockedMatrix<double>* mat,
-                               BlockedMatrix<double>* jacMat) override;
+    bool create_preconditioner(BlockedMatrix<Scalar>* mat) override;
+    bool create_preconditioner(BlockedMatrix<Scalar>* mat,
+                               BlockedMatrix<Scalar>* jacMat) override;
 };
 
 // solve A^T * x = b
 // A should represent a 3x3 matrix
 // x and b are vectors with 3 elements
-void solve_transposed_3x3(const double *A, const double *b, double *x);
+template<class Scalar>
+void solve_transposed_3x3(const Scalar* A, const Scalar* b, Scalar* x);
 
 } // namespace Opm::Accelerator
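For orientation, the public interface above implies a call sequence like the following from a driving solver. A minimal sketch, not taken from the repository: it assumes a valid context/queue pair, an assembled BlockedMatrix<double> A, and device buffers d_y/d_x; the order shown is one plausible sequence, error handling omitted:

    using namespace Opm::Accelerator;

    CPR<double, 3> cpr(/*opencl_ilu_parallel=*/true, /*verbosity=*/1);

    cpr.analyze_matrix(&A);        // record sizes and sparsity (Nb, nnzb)
    cpr.setOpencl(context, queue); // share context/queue, forwarded to bilu0
    cpr.create_preconditioner(&A); // BILU0 factorization + AMG hierarchy
    cpr.apply(d_y, d_x);           // x = M^-1 y: blocked ILU0 plus pressure AMG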
@@ -49,7 +49,7 @@ Preconditioner<Scalar,block_size>::create(Type type, bool opencl_ilu_parallel, i
     case Type::BILU0:
         return std::make_unique<BILU0<Scalar,block_size>>(opencl_ilu_parallel, verbosity);
     case Type::CPR:
-        return std::make_unique<CPR<block_size> >(opencl_ilu_parallel, verbosity);
+        return std::make_unique<CPR<Scalar,block_size>>(opencl_ilu_parallel, verbosity);
     case Type::BISAI:
         return std::make_unique<BISAI<Scalar,block_size>>(opencl_ilu_parallel, verbosity);
     }
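With the factory forwarding the scalar type, callers now pick precision and block size through the Preconditioner template parameters. A sketch, assuming Type is the nested enum used in the switch above and that create returns std::unique_ptr<Preconditioner<Scalar,block_size>>:

    // Illustrative only: obtain the templated CPR through the factory.
    using P = Opm::Accelerator::Preconditioner<double, 3>;
    auto prec = P::create(P::Type::CPR,
                          /*opencl_ilu_parallel=*/true,
                          /*verbosity=*/1);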