BILU0: template Scalar type

2024-07-07 04:53:03 -05:00 · 2024-04-15 22:38:04 +02:00 · 2024-04-15 22:38:04 +02:00 · 8ea523fa68
commit 8ea523fa68
parent b75ea188ee
7 changed files with 95 additions and 84 deletions
--- a/opm/simulators/linalg/bda/opencl/BILU0.cpp
+++ b/opm/simulators/linalg/bda/opencl/BILU0.cpp
@ -35,8 +35,8 @@ namespace Opm::Accelerator {

 using Dune::Timer;

-template <unsigned int block_size>
-BILU0<block_size>::BILU0(bool opencl_ilu_parallel_, int verbosity_)
+template<class Scalar, unsigned int block_size>
+BILU0<Scalar,block_size>::BILU0(bool opencl_ilu_parallel_, int verbosity_)
    : Base(verbosity_)
    , opencl_ilu_parallel(opencl_ilu_parallel_)
 {
@ -45,17 +45,15 @@ BILU0<block_size>::BILU0(bool opencl_ilu_parallel_, int verbosity_)
 #endif
 }

-
-template <unsigned int block_size>
-bool BILU0<block_size>::analyze_matrix(BlockedMatrix<double>* mat)
+template<class Scalar, unsigned int block_size>
+bool BILU0<Scalar,block_size>::analyze_matrix(BlockedMatrix<Scalar>* mat)
 {
    return analyze_matrix(mat, nullptr);
 }

-
-template <unsigned int block_size>
-bool BILU0<block_size>::analyze_matrix(BlockedMatrix<double>* mat,
-                                       BlockedMatrix<double>* jacMat)
+template<class Scalar, unsigned int block_size>
+bool BILU0<Scalar,block_size>::
+analyze_matrix(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
 {
    const unsigned int bs = block_size;

@ -75,30 +73,33 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix<double>* mat,
        CSCRowIndices.resize(matToDecompose->nnzbs);
        CSCColPointers.resize(Nb + 1);

-        LUmat = std::make_unique<BlockedMatrix<double>>(*matToDecompose);
+        LUmat = std::make_unique<BlockedMatrix<Scalar>>(*matToDecompose);

        Timer t_convert;
-        csrPatternToCsc(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb);
+        csrPatternToCsc(matToDecompose->colIndices, matToDecompose->rowPointers,
+                        CSCRowIndices.data(), CSCColPointers.data(), Nb);
        if(verbosity >= 3){
            std::ostringstream out;
            out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
            OpmLog::info(out.str());
        }
    } else {
-        LUmat = std::make_unique<BlockedMatrix<double>>(*matToDecompose);
+        LUmat = std::make_unique<BlockedMatrix<Scalar>>(*matToDecompose);
    }

    Timer t_analysis;
    std::ostringstream out;
    if (opencl_ilu_parallel) {
        out << "opencl_ilu_parallel: true (level_scheduling)\n";
-        findLevelScheduling(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
+        findLevelScheduling(matToDecompose->colIndices, matToDecompose->rowPointers,
+                            CSCRowIndices.data(), CSCColPointers.data(), Nb,
+                            &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
    } else {
        out << "opencl_ilu_parallel: false\n";
        // numColors = 1;
        // rowsPerColor.emplace_back(Nb);
        numColors = Nb;
-        for(int i = 0; i < Nb; ++i){
+        for (int i = 0; i < Nb; ++i) {
            rowsPerColor.emplace_back(1);
        }
    }
@ -116,44 +117,52 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix<double>* mat,
    invDiagVals.resize(mat->Nb * bs * bs);

 #if CHOW_PATEL
-    Lmat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
-    Umat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
+    Lmat = std::make_unique<BlockedMatrix<Scalar>>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
+    Umat = std::make_unique<BlockedMatrix<Scalar>>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
 #endif

-    s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
+    s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * bs * bs * mat->Nb);
    s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1));
    s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb);
    s.rowIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned) * LUmat->Nb);
 #if CHOW_PATEL
-    s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
+    s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * bs * bs * Lmat->nnzbs);
    s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
    s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
-    s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
+    s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * bs * bs * Lmat->nnzbs);
    s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
    s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
 #else
-    s.LUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * LUmat->nnzbs);
+    s.LUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * bs * bs * LUmat->nnzbs);
    s.LUcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->nnzbs);
    s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1));
 #endif

    events.resize(3);
-    err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals.data(), nullptr, &events[0]);
+    err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0,
+                                    mat->Nb * sizeof(Scalar) * bs * bs,
+                                    invDiagVals.data(), nullptr, &events[0]);

    rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0
    for (int i = 0; i < numColors; ++i) {
        rowsPerColorPrefix[i + 1] = rowsPerColorPrefix[i] + rowsPerColor[i];
    }

-    err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]);
+    err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0,
+                                     (numColors + 1) * sizeof(int),
+                                     rowsPerColorPrefix.data(), nullptr, &events[1]);

    if (opencl_ilu_parallel) {
-        err |= queue->enqueueWriteBuffer(s.rowIndices, CL_FALSE, 0, Nb * sizeof(unsigned), fromOrder.data(), nullptr, &events[2]);
+        err |= queue->enqueueWriteBuffer(s.rowIndices, CL_FALSE, 0,
+                                         Nb * sizeof(unsigned), fromOrder.data(),
+                                         nullptr, &events[2]);
    } else {
        // fromOrder is not initialized, so use something else to fill s.rowIndices
        // s.rowIndices[i] == i must hold, since every rowidx is mapped to itself (i.e. no actual mapping)
        // rowsPerColorPrefix is misused here, it contains an increasing sequence (0, 1, 2, ...)
-        err |= queue->enqueueWriteBuffer(s.rowIndices, CL_FALSE, 0, Nb * sizeof(unsigned), rowsPerColorPrefix.data(), nullptr, &events[2]);
+        err |= queue->enqueueWriteBuffer(s.rowIndices, CL_FALSE, 0,
+                                         Nb * sizeof(unsigned),
+                                         rowsPerColorPrefix.data(), nullptr, &events[2]);
    }

    cl::WaitForEvents(events);
@ -166,16 +175,15 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix<double>* mat,
    return true;
 }

-template <unsigned int block_size>
-bool BILU0<block_size>::create_preconditioner(BlockedMatrix<double>* mat)
+template<class Scalar, unsigned int block_size>
+bool BILU0<Scalar,block_size>::create_preconditioner(BlockedMatrix<Scalar>* mat)
 {
    return create_preconditioner(mat, nullptr);
 }

-template <unsigned int block_size>
-bool BILU0<block_size>::
-create_preconditioner(BlockedMatrix<double>* mat,
-                      BlockedMatrix<double>* jacMat)
+template<class Scalar, unsigned int block_size>
+bool BILU0<Scalar,block_size>::
+create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
 {
    const unsigned int bs = block_size;

@ -183,7 +191,8 @@ create_preconditioner(BlockedMatrix<double>* mat,

    // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
    Timer t_copy;
-    memcpy(LUmat->nnzValues, matToDecompose->nnzValues, sizeof(double) * bs * bs * matToDecompose->nnzbs);
+    memcpy(LUmat->nnzValues, matToDecompose->nnzValues,
+           sizeof(Scalar) * bs * bs * matToDecompose->nnzbs);

    if (verbosity >= 3){
        std::ostringstream out;
@ -202,7 +211,9 @@ create_preconditioner(BlockedMatrix<double>* mat,
    Timer t_copyToGpu;

    events.resize(1);
-    queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]);
+    queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0,
+                              LUmat->nnzbs * bs * bs * sizeof(Scalar),
+                              LUmat->nnzValues, nullptr, &events[0]);

    std::call_once(pattern_uploaded, [&](){
        // find the positions of each diagonal block
@ -210,14 +221,18 @@ create_preconditioner(BlockedMatrix<double>* mat,
            int rowStart = LUmat->rowPointers[row];
            int rowEnd = LUmat->rowPointers[row+1];

-            auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row);
+            auto candidate = std::find(LUmat->colIndices + rowStart,
+                                       LUmat->colIndices + rowEnd, row);
            assert(candidate != LUmat->colIndices + rowEnd);
            diagIndex[row] = candidate - LUmat->colIndices;
        }
        events.resize(4);
-        queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
-        queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
-        queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
+        queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int),
+                                  diagIndex.data(), nullptr, &events[1]);
+        queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int),
+                                  LUmat->colIndices, nullptr, &events[2]);
+        queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int),
+                                  LUmat->rowPointers, nullptr, &events[3]);
    });

    cl::WaitForEvents(events);
@ -239,9 +254,10 @@ create_preconditioner(BlockedMatrix<double>* mat,
        const unsigned int firstRow = rowsPerColorPrefix[color];
        const unsigned int lastRow = rowsPerColorPrefix[color + 1];
        if (verbosity >= 5) {
-            out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
+            out << "color " << color << ": " << firstRow << " - " << lastRow
+                << " = " << lastRow - firstRow << "\n";
        }
-        OpenclKernels<double>::ILU_decomp(firstRow, lastRow, s.rowIndices,
+        OpenclKernels<Scalar>::ILU_decomp(firstRow, lastRow, s.rowIndices,
                                          s.LUvals, s.LUcols, s.LUrows, s.diagIndex,
                                          s.invDiagVals, rowsPerColor[color], block_size);
    }
@ -256,24 +272,23 @@ create_preconditioner(BlockedMatrix<double>* mat,
    return true;
 } // end create_preconditioner()

-
 // kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
 // however, if individual kernel calls are timed, waiting for events is needed
 // behavior on other GPUs is untested
-template <unsigned int block_size>
-void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
+template<class Scalar, unsigned int block_size>
+void BILU0<Scalar,block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
 {
-    const double relaxation = 0.9;
+    const Scalar relaxation = 0.9;
    cl::Event event;
    Timer t_apply;

    for (int color = 0; color < numColors; ++color) {
 #if CHOW_PATEL
-        OpenclKernels<double>::ILU_apply1(s.rowIndices, s.Lvals, s.Lcols, s.Lrows,
+        OpenclKernels<Scalar>::ILU_apply1(s.rowIndices, s.Lvals, s.Lcols, s.Lrows,
                                          s.diagIndex, y, x, s.rowsPerColor,
                                          color, rowsPerColor[color], block_size);
 #else
-        OpenclKernels<double>::ILU_apply1(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
+        OpenclKernels<Scalar>::ILU_apply1(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
                                          s.diagIndex, y, x, s.rowsPerColor,
                                          color, rowsPerColor[color], block_size);
 #endif
@ -281,18 +296,18 @@ void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)

    for (int color = numColors - 1; color >= 0; --color) {
 #if CHOW_PATEL
-        OpenclKernels<double>::ILU_apply2(s.rowIndices, s.Uvals, s.Ucols, s.Urows,
-                                         s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
-                                         color, rowsPerColor[color], block_size);
+        OpenclKernels<Scalar>::ILU_apply2(s.rowIndices, s.Uvals, s.Ucols, s.Urows,
+                                          s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
+                                          color, rowsPerColor[color], block_size);
 #else
-        OpenclKernels<double>::ILU_apply2(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
+        OpenclKernels<Scalar>::ILU_apply2(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
                                          s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
                                          color, rowsPerColor[color], block_size);
 #endif
    }

    // apply relaxation
-    OpenclKernels<double>::scale(x, relaxation, N);
+    OpenclKernels<Scalar>::scale(x, relaxation, N);

    if (verbosity >= 4) {
        std::ostringstream out;
@ -301,19 +316,14 @@ void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
    }
 }

+#define INSTANCE_TYPE(T)       \
+    template class BILU0<T,1>; \
+    template class BILU0<T,2>; \
+    template class BILU0<T,3>; \
+    template class BILU0<T,4>; \
+    template class BILU0<T,5>; \
+    template class BILU0<T,6>;

-
-#define INSTANTIATE_BDA_FUNCTIONS(n) \
-template class BILU0<n>;
-
-
-INSTANTIATE_BDA_FUNCTIONS(1);
-INSTANTIATE_BDA_FUNCTIONS(2);
-INSTANTIATE_BDA_FUNCTIONS(3);
-INSTANTIATE_BDA_FUNCTIONS(4);
-INSTANTIATE_BDA_FUNCTIONS(5);
-INSTANTIATE_BDA_FUNCTIONS(6);
-
-#undef INSTANTIATE_BDA_FUNCTIONS
+INSTANCE_TYPE(double)

 } // namespace Opm::Accelerator
--- a/opm/simulators/linalg/bda/opencl/BILU0.hpp
+++ b/opm/simulators/linalg/bda/opencl/BILU0.hpp
@ -34,10 +34,10 @@ namespace Opm::Accelerator {
 /// This class implements a Blocked ILU0 preconditioner
 /// The decomposition is done on GPU, using exact decomposition, or ChowPatel decomposition
 /// The preconditioner is applied via two exact triangular solves
-template <unsigned int block_size>
-class BILU0 : public Preconditioner<double,block_size>
+template<class Scalar, unsigned int block_size>
+class BILU0 : public Preconditioner<Scalar,block_size>
 {
-    using Base = Preconditioner<double,block_size>;
+    using Base = Preconditioner<Scalar,block_size>;

    using Base::N;
    using Base::Nb;
@ -50,11 +50,11 @@ class BILU0 : public Preconditioner<double,block_size>
    using Base::err;

 private:
-    std::unique_ptr<BlockedMatrix<double>> LUmat{};
+    std::unique_ptr<BlockedMatrix<Scalar>> LUmat{};
 #if CHOW_PATEL
-    std::unique_ptr<BlockedMatrix<double>> Lmat{}, Umat{};
+    std::unique_ptr<BlockedMatrix<Scalar>> Lmat{}, Umat{};
 #endif
-    std::vector<double> invDiagVals;
+    std::vector<Scalar> invDiagVals;
    std::vector<int> diagIndex;
    std::vector<int> rowsPerColor;  // color i contains rowsPerColor[i] rows, which are processed in parallel
    std::vector<int> rowsPerColorPrefix;  // the prefix sum of rowsPerColor
@ -64,7 +64,7 @@ private:

    bool opencl_ilu_parallel;

-    typedef struct {
+    struct GPU_storage {
        cl::Buffer invDiagVals;    // nnz values of diagonal blocks of the matrix, inverted
        cl::Buffer diagIndex;      // index of diagonal block of each row, used to differentiate between lower and upper triangular part
        cl::Buffer rowsPerColor;   // number of rows for every color
@ -77,7 +77,7 @@ private:
 #else
        cl::Buffer LUvals, LUcols, LUrows;
 #endif
-    } GPU_storage;
+    };

    GPU_storage s;

@ -90,23 +90,25 @@ public:
    BILU0(bool opencl_ilu_parallel, int verbosity);

    // analysis, extract parallelism if specified
-    bool analyze_matrix(BlockedMatrix<double>* mat) override;
-    bool analyze_matrix(BlockedMatrix<double>* mat,
-                        BlockedMatrix<double>* jacMat) override;
+    bool analyze_matrix(BlockedMatrix<Scalar>* mat) override;
+    bool analyze_matrix(BlockedMatrix<Scalar>* mat,
+                        BlockedMatrix<Scalar>* jacMat) override;

    // ilu_decomposition
-    bool create_preconditioner(BlockedMatrix<double>* mat) override;
-    bool create_preconditioner(BlockedMatrix<double>* mat,
-                               BlockedMatrix<double>* jacMat) override;
+    bool create_preconditioner(BlockedMatrix<Scalar>* mat) override;
+    bool create_preconditioner(BlockedMatrix<Scalar>* mat,
+                               BlockedMatrix<Scalar>* jacMat) override;

    // apply preconditioner, x = prec(y)
    // via Lz = y
    // and Ux = z
    void apply(const cl::Buffer& y, cl::Buffer& x) override;

-    std::tuple<std::vector<int>, std::vector<int>, std::vector<int>> get_preconditioner_structure()
+    std::tuple<std::vector<int>, std::vector<int>, std::vector<int>>
+    get_preconditioner_structure()
    {
-        return {{LUmat->rowPointers, LUmat->rowPointers + (Nb + 1)}, {LUmat->colIndices, LUmat->colIndices + nnzb}, diagIndex};
+        return {{LUmat->rowPointers, LUmat->rowPointers + (Nb + 1)},
+                {LUmat->colIndices, LUmat->colIndices + nnzb}, diagIndex};
    }

    std::pair<cl::Buffer, cl::Buffer> get_preconditioner_data()
@ -122,4 +124,3 @@ public:
 } // namespace Opm::Accelerator

 #endif
-
--- a/opm/simulators/linalg/bda/opencl/BISAI.cpp
+++ b/opm/simulators/linalg/bda/opencl/BISAI.cpp
@ -46,7 +46,7 @@ BISAI<block_size>::BISAI(bool opencl_ilu_parallel_, int verbosity_)
 #if CHOW_PATEL
    OPM_THROW(std::logic_error, "Error --linear-solver=isai cannot be used if ChowPatelIlu is used, probably defined by CMake\n");
 #endif
-    bilu0 = std::make_unique<BILU0<block_size> >(opencl_ilu_parallel_, verbosity_);
+    bilu0 = std::make_unique<BILU0<double,block_size>>(opencl_ilu_parallel_, verbosity_);
 }

 template <unsigned int block_size>
--- a/opm/simulators/linalg/bda/opencl/BISAI.hpp
+++ b/opm/simulators/linalg/bda/opencl/BISAI.hpp
@ -68,7 +68,7 @@ private:
    cl::Buffer d_invL_x;

    bool opencl_ilu_parallel;
-    std::unique_ptr<BILU0<block_size> > bilu0;
+    std::unique_ptr<BILU0<double,block_size>> bilu0;

    /// Struct that holds the structure of the small subsystems for each column
    typedef struct{
--- a/opm/simulators/linalg/bda/opencl/CPR.cpp
+++ b/opm/simulators/linalg/bda/opencl/CPR.cpp
@ -45,7 +45,7 @@ CPR<block_size>::CPR(bool opencl_ilu_parallel_, int verbosity_)
    : Base(verbosity_)
    , opencl_ilu_parallel(opencl_ilu_parallel_)
 {
-    bilu0 = std::make_unique<BILU0<block_size> >(opencl_ilu_parallel, verbosity_);
+    bilu0 = std::make_unique<BILU0<double,block_size> >(opencl_ilu_parallel, verbosity_);
    diagIndices.resize(1);
 }

--- a/opm/simulators/linalg/bda/opencl/CPR.hpp
+++ b/opm/simulators/linalg/bda/opencl/CPR.hpp
@ -69,7 +69,7 @@ private:
    std::unique_ptr<cl::Buffer> d_coarse_y, d_coarse_x; // stores the scalar vectors
    std::once_flag opencl_buffers_allocated;  // only allocate OpenCL Buffers once

-    std::unique_ptr<BILU0<block_size> > bilu0;                    // Blocked ILU0 preconditioner
+    std::unique_ptr<BILU0<double,block_size>> bilu0;                    // Blocked ILU0 preconditioner
    BlockedMatrix<double>* mat = nullptr;    // input matrix, blocked

    using DuneMat = Dune::BCRSMatrix<Dune::FieldMatrix<double, 1, 1> >;
--- a/opm/simulators/linalg/bda/opencl/Preconditioner.cpp
+++ b/opm/simulators/linalg/bda/opencl/Preconditioner.cpp
@ -47,7 +47,7 @@ Preconditioner<Scalar,block_size>::create(Type type, bool opencl_ilu_parallel, i
 {
    switch (type ) {
    case Type::BILU0:
-        return std::make_unique<BILU0<block_size> >(opencl_ilu_parallel, verbosity);
+        return std::make_unique<BILU0<Scalar,block_size> >(opencl_ilu_parallel, verbosity);
    case Type::CPR:
        return std::make_unique<CPR<block_size> >(opencl_ilu_parallel, verbosity);
    case Type::BISAI: