BlockMatrix: template Scalar type

2024-07-07 04:53:03 -05:00 · 2024-04-15 16:13:22 +02:00 · 2024-04-15 16:13:22 +02:00 · 25374b0e54
commit 25374b0e54
parent b9ee637d78
25 changed files with 242 additions and 183 deletions
--- a/opm/simulators/linalg/bda/BdaBridge.cpp
+++ b/opm/simulators/linalg/bda/BdaBridge.cpp
@ -221,12 +221,16 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
            return;
        }

+        using Mat = Accelerator::BlockedMatrix<double>;
        if (!matrix) {
            h_rows.reserve(Nb+1);
            h_cols.reserve(nnzb);
            copySparsityPatternFromISTL(*bridgeMat, h_rows, h_cols);
            checkMemoryContiguous(*bridgeMat);
-            matrix = std::make_unique<Opm::Accelerator::BlockedMatrix>(Nb, nnzb, block_size, static_cast<double*>(&(((*bridgeMat)[0][0][0][0]))), h_cols.data(), h_rows.data());
+            matrix = std::make_unique<Mat>(Nb, nnzb, block_size,
+                                           static_cast<double*>(&(((*bridgeMat)[0][0][0][0]))),
+                                           h_cols.data(),
+                                           h_rows.data());
        }

        Dune::Timer t_zeros;
@ -245,7 +249,10 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
                h_jacCols.reserve(jacNnzb);
                copySparsityPatternFromISTL(*jacMat, h_jacRows, h_jacCols);
                checkMemoryContiguous(*jacMat);
-                jacMatrix = std::make_unique<Opm::Accelerator::BlockedMatrix>(Nb, jacNnzb, block_size, static_cast<double*>(&(((*jacMat)[0][0][0][0]))), h_jacCols.data(), h_jacRows.data());
+                jacMatrix = std::make_unique<Mat>(Nb, jacNnzb, block_size,
+                                                  static_cast<double*>(&(((*jacMat)[0][0][0][0]))),
+                                                  h_jacCols.data(),
+                                                  h_jacRows.data());
            }

            Dune::Timer t_zeros2;
--- a/opm/simulators/linalg/bda/BdaBridge.hpp
+++ b/opm/simulators/linalg/bda/BdaBridge.hpp
@ -39,9 +39,9 @@ private:
    int verbosity = 0;
    bool use_gpu = false;
    std::string accelerator_mode;
-    std::unique_ptr<Opm::Accelerator::BdaSolver<block_size> > backend;
-    std::shared_ptr<Opm::Accelerator::BlockedMatrix> matrix;  // 'stores' matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
-    std::shared_ptr<Opm::Accelerator::BlockedMatrix> jacMatrix;  // 'stores' preconditioner matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
+    std::unique_ptr<Accelerator::BdaSolver<block_size> > backend;
+    std::shared_ptr<Accelerator::BlockedMatrix<double>> matrix;  // 'stores' matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
+    std::shared_ptr<Accelerator::BlockedMatrix<double>> jacMatrix;  // 'stores' preconditioner matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
    std::vector<int> h_rows, h_cols;  // store the sparsity pattern of the matrix
    std::vector<int> h_jacRows, h_jacCols;  // store the sparsity pattern of the jacMatrix
    std::vector<typename BridgeMatrix::size_type> diagIndices;   // contains offsets of the diagonal blocks wrt start of the row, used for replaceZeroDiagonal()
--- a/opm/simulators/linalg/bda/BdaSolver.hpp
+++ b/opm/simulators/linalg/bda/BdaSolver.hpp
@ -83,8 +83,10 @@ namespace Accelerator {
        virtual ~BdaSolver() {};

        /// Define as pure virtual functions, so derivedclass must implement them
-        virtual SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
-            std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) = 0;
+        virtual SolverStatus solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
+                                          double *b,
+                                          std::shared_ptr<BlockedMatrix<double>> jacMatrix,
+                                          WellContributions& wellContribs, BdaResult& res) = 0;

        virtual void get_result(double *x) = 0;

--- a/opm/simulators/linalg/bda/BlockedMatrix.cpp
+++ b/opm/simulators/linalg/bda/BlockedMatrix.cpp
@ -17,9 +17,6 @@
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */

-#include <cstring>
-#include <cmath>
-
 #include <config.h>

 #include <opm/common/OpmLog/OpmLog.hpp>
@ -29,16 +26,10 @@
 #include <opm/simulators/linalg/bda/Matrix.hpp>
 #include <opm/simulators/linalg/bda/Matrix.hpp>

-namespace Opm
+namespace Opm::Accelerator {
+
+void sortRow(int *colIndices, int *data, int left, int right)
 {
-namespace Accelerator
-{
-
-using Opm::OpmLog;
-
-
-
-void sortRow(int *colIndices, int *data, int left, int right) {
    int l = left;
    int r = right;
    int middle = colIndices[(l + r) >> 1];
@ -67,14 +58,14 @@ void sortRow(int *colIndices, int *data, int left, int right) {
        sortRow(colIndices, data, l, right);
 }

-
 // LUMat->nnzValues[ik] = LUMat->nnzValues[ik] - (pivot * LUMat->nnzValues[jk]) in ilu decomposition
 // a = a - (b * c)
-void blockMultSub(double *a, double *b, double *c, unsigned int block_size)
+template<class Scalar>
+void blockMultSub(Scalar* a, Scalar* b, Scalar* c, unsigned int block_size)
 {
    for (unsigned int row = 0; row < block_size; row++) {
        for (unsigned int col = 0; col < block_size; col++) {
-            double temp = 0.0;
+            Scalar temp = 0.0;
            for (unsigned int k = 0; k < block_size; k++) {
                temp += b[block_size * row + k] * c[block_size * k + col];
            }
@ -84,11 +75,12 @@ void blockMultSub(double *a, double *b, double *c, unsigned int block_size)
 }

 /*Perform a 3x3 matrix-matrix multiplicationj on two blocks*/
-
-void blockMult(double *mat1, double *mat2, double *resMat, unsigned int block_size) {
+template<class Scalar>
+void blockMult(Scalar* mat1, Scalar* mat2, Scalar* resMat, unsigned int block_size)
+{
    for (unsigned int row = 0; row < block_size; row++) {
        for (unsigned int col = 0; col < block_size; col++) {
-            double temp = 0;
+            Scalar temp = 0;
            for (unsigned int k = 0; k < block_size; k++) {
                temp += mat1[block_size * row + k] * mat2[block_size * k + col];
            }
@ -97,5 +89,10 @@ void blockMult(double *mat1, double *mat2, double *resMat, unsigned int block_si
    }
 }

-} // namespace Accelerator
-} // namespace Opm
+#define INSTANCE_TYPE(T) \
+    template void blockMultSub(double*, double*, double*, unsigned int); \
+    template void blockMult(double*, double*, double*, unsigned int);
+
+INSTANCE_TYPE(double)
+
+} // namespace Opm::Accelerator
--- a/opm/simulators/linalg/bda/BlockedMatrix.hpp
+++ b/opm/simulators/linalg/bda/BlockedMatrix.hpp
@ -20,44 +20,40 @@
 #ifndef OPM_BLOCKED_MATRIX_HPP
 #define OPM_BLOCKED_MATRIX_HPP

-namespace Opm
-{
-namespace Accelerator
-{
+namespace Opm::Accelerator {

 /// This struct resembles a blocked csr matrix, like Dune::BCRSMatrix.
 /// The data is stored in contiguous memory, such that they can be copied to a device in one transfer.
+template<class Scalar>
 class BlockedMatrix
 {
-
 public:
-
    /// Allocate BlockedMatrix and data arrays with given sizes
    /// \param[in] Nb               number of blockrows
    /// \param[in] nnzbs            number of nonzero blocks
    /// \param[in] block_size       the number of rows and columns for each block
    BlockedMatrix(int Nb_, int nnzbs_, unsigned int block_size_)
-    : nnzValues(new double[nnzbs_*block_size_*block_size_]),
-      colIndices(new int[nnzbs_*block_size_*block_size_]),
-      rowPointers(new int[Nb_+1]),
-      Nb(Nb_),
-      nnzbs(nnzbs_),
-      block_size(block_size_),
-      deleteNnzs(true),
-      deleteSparsity(true)
+        : nnzValues(new Scalar[nnzbs_*block_size_*block_size_])
+        , colIndices(new int[nnzbs_*block_size_*block_size_])
+        , rowPointers(new int[Nb_+1])
+        , Nb(Nb_)
+        , nnzbs(nnzbs_)
+        , block_size(block_size_)
+        , deleteNnzs(true)
+        , deleteSparsity(true)
    {}

    /// Allocate BlockedMatrix, but copy sparsity pattern instead of allocating new memory
    /// \param[in] M              matrix to be copied
    BlockedMatrix(const BlockedMatrix& M)
-    : nnzValues(new double[M.nnzbs*M.block_size*M.block_size]),
-      colIndices(M.colIndices),
-      rowPointers(M.rowPointers),
-      Nb(M.Nb),
-      nnzbs(M.nnzbs),
-      block_size(M.block_size),
-      deleteNnzs(true),
-      deleteSparsity(false)
+        : nnzValues(new Scalar[M.nnzbs*M.block_size*M.block_size])
+        , colIndices(M.colIndices)
+        , rowPointers(M.rowPointers)
+        , Nb(M.Nb)
+        , nnzbs(M.nnzbs)
+        , block_size(M.block_size)
+        , deleteNnzs(true)
+        , deleteSparsity(false)
    {}

    /// Allocate BlockedMatrix, but let data arrays point to existing arrays
@ -67,18 +63,20 @@ public:
    /// \param[in] nnzValues      array of nonzero values, contains nnzb*block_size*block_size scalars
    /// \param[in] colIndices     array of column indices, contains nnzb entries
    /// \param[in] rowPointers    array of row pointers, contains Nb+1 entries
-    BlockedMatrix(int Nb_, int nnzbs_, unsigned int block_size_, double *nnzValues_, int *colIndices_, int *rowPointers_)
-    : nnzValues(nnzValues_),
-      colIndices(colIndices_),
-      rowPointers(rowPointers_),
-      Nb(Nb_),
-      nnzbs(nnzbs_),
-      block_size(block_size_),
-      deleteNnzs(false),
-      deleteSparsity(false)
+    BlockedMatrix(int Nb_, int nnzbs_, unsigned int block_size_,
+                  Scalar* nnzValues_, int *colIndices_, int *rowPointers_)
+        : nnzValues(nnzValues_)
+        , colIndices(colIndices_)
+        , rowPointers(rowPointers_)
+        , Nb(Nb_)
+        , nnzbs(nnzbs_)
+        , block_size(block_size_)
+        , deleteNnzs(false)
+        , deleteSparsity(false)
    {}

-    ~BlockedMatrix(){
+    ~BlockedMatrix()
+    {
        if (deleteNnzs) {
            delete[] nnzValues;
        }
@ -88,8 +86,7 @@ public:
        }
    }

-
-    double *nnzValues;
+    Scalar* nnzValues;
    int *colIndices;
    int *rowPointers;
    int Nb;
@ -99,14 +96,13 @@ public:
    bool deleteSparsity;
 };

-
 /// Sort a row of matrix elements from a CSR-format, where the nonzeroes are ints
 /// These ints aren't actually nonzeroes, but represent a mapping used later
 /// \param[inout] colIndices     represent keys in sorting
 /// \param[inout] data           sorted according to the colIndices
 /// \param[in] left              lower index of data of row
 /// \param[in] right             upper index of data of row
-void sortRow(int *colIndices, int *data, int left, int right);
+void sortRow(int* colIndices, int* data, int left, int right);

 /// Multiply and subtract blocks
 /// a = a - (b * c)
@ -114,7 +110,8 @@ void sortRow(int *colIndices, int *data, int left, int right);
 /// \param[in] b                 input block
 /// \param[in] c                 input block
 /// \param[in] block_size        size of block
-void blockMultSub(double *a, double *b, double *c, unsigned int block_size);
+template<class Scalar>
+void blockMultSub(Scalar* a, Scalar* b, Scalar* c, unsigned int block_size);

 /// Perform a matrix-matrix multiplication on two blocks
 /// resMat = mat1 * mat2
@ -122,9 +119,9 @@ void blockMultSub(double *a, double *b, double *c, unsigned int block_size);
 /// \param[in] mat2              input block 2
 /// \param[out] resMat           output block
 /// \param[in] block_size        size of block
-void blockMult(double *mat1, double *mat2, double *resMat, unsigned int block_size);
+template<class Scalar>
+void blockMult(Scalar* mat1, Scalar* mat2, Scalar* resMat, unsigned int block_size);

-} // namespace Accelerator
-} // namespace Opm
+} // namespace Opm::Accelerator

 #endif
--- a/opm/simulators/linalg/bda/amgclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/amgclSolverBackend.cpp
@ -403,13 +403,13 @@ void amgclSolverBackend<block_size>::get_result(double *x_) {
    }
 } // end get_result()

-
 template <unsigned int block_size>
-SolverStatus amgclSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
-                                                          double *b,
-                                                          [[maybe_unused]] std::shared_ptr<BlockedMatrix> jacMatrix,
-                                                          [[maybe_unused]] WellContributions& wellContribs,
-                                                          BdaResult &res)
+SolverStatus amgclSolverBackend<block_size>::
+solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
+             double *b,
+             [[maybe_unused]] std::shared_ptr<BlockedMatrix<double>> jacMatrix,
+             [[maybe_unused]] WellContributions& wellContribs,
+             BdaResult& res)
 {
    if (initialized == false) {
        initialize(matrix->Nb, matrix->nnzbs);
--- a/opm/simulators/linalg/bda/amgclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/amgclSolverBackend.hpp
@ -140,8 +140,11 @@ public:
    /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
    /// \param[inout] res         summary of solver result
    /// \return                   status code
-    SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
-        std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
+    SolverStatus solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
+                              double *b,
+                              std::shared_ptr<BlockedMatrix<double>> jacMatrix,
+                              WellContributions& wellContribs,
+                              BdaResult& res) override;
    
    /// Get result after linear solve, and peform postprocessing if necessary
    /// \param[inout] x          resulting x vector, caller must guarantee that x points to a valid array
--- a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu
+++ b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu
@ -216,9 +216,11 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
    }
 }

-
 template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix) {
+void cusparseSolverBackend<block_size>::
+initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
+           std::shared_ptr<BlockedMatrix<double>> jacMatrix)
+{
    this->Nb = matrix->Nb;
    this->N = Nb * block_size;
    this->nnzb = matrix->nnzbs;
@ -309,7 +311,11 @@ void cusparseSolverBackend<block_size>::finalize() {


 template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::copy_system_to_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix) {
+void cusparseSolverBackend<block_size>::
+copy_system_to_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
+                   double *b,
+                   std::shared_ptr<BlockedMatrix<double>> jacMatrix)
+{
    Timer t;

    cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream);
@ -356,7 +362,11 @@ void cusparseSolverBackend<block_size>::copy_system_to_gpu(std::shared_ptr<Block

 // don't copy rowpointers and colindices, they stay the same
 template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::update_system_on_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix) {
+void cusparseSolverBackend<block_size>::
+update_system_on_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
+                     double *b,
+                     std::shared_ptr<BlockedMatrix<double>> jacMatrix)
+{
    Timer t;

    cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
@ -523,14 +533,13 @@ void cusparseSolverBackend<block_size>::get_result(double *x) {
    }
 } // end get_result()

-
-
 template <unsigned int block_size>
-SolverStatus cusparseSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
-                                                              double *b,
-                                                              std::shared_ptr<BlockedMatrix> jacMatrix,
-                                                              WellContributions& wellContribs,
-                                                              BdaResult &res)
+SolverStatus cusparseSolverBackend<block_size>::
+solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
+             double *b,
+             std::shared_ptr<BlockedMatrix<double>> jacMatrix,
+             WellContributions& wellContribs,
+             BdaResult& res)
 {
    if (initialized == false) {
        initialize(matrix, jacMatrix);
--- a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp
@ -82,7 +82,8 @@ private:
    /// Initialize GPU and allocate memory
    /// \param[in] matrix         matrix for spmv
    /// \param[in] jacMatrix      matrix for preconditioner
-    void initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix);
+    void initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
+                    std::shared_ptr<BlockedMatrix<double>> jacMatrix);

    /// Clean memory
    void finalize();
@ -92,14 +93,18 @@ private:
    /// \param[in] matrix         matrix for spmv
    /// \param[in] b              input vector, contains N values
    /// \param[in] jacMatrix      matrix for preconditioner
-    void copy_system_to_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix);
+    void copy_system_to_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
+                            double *b,
+                            std::shared_ptr<BlockedMatrix<double>> jacMatrix);

    /// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
    /// also copy matrix for preconditioner if needed
    /// \param[in] matrix         matrix for spmv
    /// \param[in] b              input vector, contains N values
    /// \param[in] jacMatrix      matrix for preconditioner
-    void update_system_on_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix);
+    void update_system_on_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
+                              double *b,
+                              std::shared_ptr<BlockedMatrix<double>> jacMatrix);

    /// Analyse sparsity pattern to extract parallelism
    /// \return true iff analysis was successful
@ -134,8 +139,10 @@ public:
    /// \param[in] wellContribs   contains all WellContributions, to apply them separately, instead of adding them to matrix A
    /// \param[inout] res         summary of solver result
    /// \return                   status code
-    SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
-        std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
+    SolverStatus solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
+                              double *b,
+                              std::shared_ptr<BlockedMatrix<double>> jacMatrix,
+                              WellContributions& wellContribs, BdaResult& res) override;
    
    /// Get resulting vector x after linear solve, also includes post processing if necessary
    /// \param[inout] x        resulting x vector, caller must guarantee that x points to a valid array
--- a/opm/simulators/linalg/bda/opencl/BILU0.cpp
+++ b/opm/simulators/linalg/bda/opencl/BILU0.cpp
@ -50,14 +50,15 @@ BILU0<block_size>::BILU0(bool opencl_ilu_parallel_, int verbosity_) :


 template <unsigned int block_size>
-bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
+bool BILU0<block_size>::analyze_matrix(BlockedMatrix<double>* mat)
 {
    return analyze_matrix(mat, nullptr);
 }


 template <unsigned int block_size>
-bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat)
+bool BILU0<block_size>::analyze_matrix(BlockedMatrix<double>* mat,
+                                       BlockedMatrix<double>* jacMat)
 {
    const unsigned int bs = block_size;

@ -77,7 +78,7 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
        CSCRowIndices.resize(matToDecompose->nnzbs);
        CSCColPointers.resize(Nb + 1);

-        LUmat = std::make_unique<BlockedMatrix>(*matToDecompose);
+        LUmat = std::make_unique<BlockedMatrix<double>>(*matToDecompose);

        Timer t_convert;
        csrPatternToCsc(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb);
@ -87,7 +88,7 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
            OpmLog::info(out.str());
        }
    } else {
-        LUmat = std::make_unique<BlockedMatrix>(*matToDecompose);
+        LUmat = std::make_unique<BlockedMatrix<double>>(*matToDecompose);
    }

    Timer t_analysis;
@ -168,17 +169,16 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
    return true;
 }

-
-
 template <unsigned int block_size>
-bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
+bool BILU0<block_size>::create_preconditioner(BlockedMatrix<double>* mat)
 {
    return create_preconditioner(mat, nullptr);
 }

-
 template <unsigned int block_size>
-bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat)
+bool BILU0<block_size>::
+create_preconditioner(BlockedMatrix<double>* mat,
+                      BlockedMatrix<double>* jacMat)
 {
    const unsigned int bs = block_size;

--- a/opm/simulators/linalg/bda/opencl/BILU0.hpp
+++ b/opm/simulators/linalg/bda/opencl/BILU0.hpp
@ -53,9 +53,9 @@ class BILU0 : public Preconditioner<block_size>
    using Base::err;

 private:
-    std::unique_ptr<BlockedMatrix> LUmat = nullptr;
+    std::unique_ptr<BlockedMatrix<double>> LUmat = nullptr;
 #if CHOW_PATEL
-    std::unique_ptr<BlockedMatrix> Lmat = nullptr, Umat = nullptr;
+    std::unique_ptr<BlockedMatrix<double>> Lmat = nullptr, Umat = nullptr;
 #endif
    std::vector<double> invDiagVals;
    std::vector<int> diagIndex;
@ -93,12 +93,14 @@ public:
    BILU0(bool opencl_ilu_parallel, int verbosity);

    // analysis, extract parallelism if specified
-    bool analyze_matrix(BlockedMatrix *mat) override;
-    bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
+    bool analyze_matrix(BlockedMatrix<double>* mat) override;
+    bool analyze_matrix(BlockedMatrix<double>* mat,
+                        BlockedMatrix<double>* jacMat) override;

    // ilu_decomposition
-    bool create_preconditioner(BlockedMatrix *mat) override;
-    bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
+    bool create_preconditioner(BlockedMatrix<double>* mat) override;
+    bool create_preconditioner(BlockedMatrix<double>* mat,
+                               BlockedMatrix<double>* jacMat) override;

    // apply preconditioner, x = prec(y)
    // via Lz = y
--- a/opm/simulators/linalg/bda/opencl/BISAI.cpp
+++ b/opm/simulators/linalg/bda/opencl/BISAI.cpp
@ -78,13 +78,14 @@ std::vector<int> buildCsrToCscOffsetMap(std::vector<int> colPointers, std::vecto
 }

 template <unsigned int block_size>
-bool BISAI<block_size>::analyze_matrix(BlockedMatrix *mat)
+bool BISAI<block_size>::analyze_matrix(BlockedMatrix<double>* mat)
 {
    return analyze_matrix(mat, nullptr);
 }

 template <unsigned int block_size>
-bool BISAI<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat)
+bool BISAI<block_size>::analyze_matrix(BlockedMatrix<double>* mat,
+                                       BlockedMatrix<double>* jacMat)
 {
    const unsigned int bs = block_size;
    auto *m = mat;
@ -176,7 +177,9 @@ void BISAI<block_size>::buildUpperSubsystemsStructures(){
 }

 template <unsigned int block_size>
-bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat)
+bool BISAI<block_size>::
+create_preconditioner(BlockedMatrix<double>* mat,
+                      BlockedMatrix<double>* jacMat)
 {
    const unsigned int bs = block_size;

@ -274,7 +277,7 @@ bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
 }

 template <unsigned int block_size>
-bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat)
+bool BISAI<block_size>::create_preconditioner(BlockedMatrix<double>* mat)
 {
    return create_preconditioner(mat, nullptr);
 }
--- a/opm/simulators/linalg/bda/opencl/BISAI.hpp
+++ b/opm/simulators/linalg/bda/opencl/BISAI.hpp
@ -31,7 +31,7 @@ namespace Opm
 namespace Accelerator
 {

-class BlockedMatrix;
+template<class Scalar> class BlockedMatrix;

 /// This class implements a Blocked version of the Incomplete Sparse Approximate Inverse (ISAI) preconditioner.
 /// Inspired by the paper "Incomplete Sparse Approximate Inverses for Parallel Preconditioning" by Anzt et. al.
@ -116,12 +116,14 @@ public:
    void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue) override;

    // analysis, extract parallelism
-    bool analyze_matrix(BlockedMatrix *mat) override;
-    bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
+    bool analyze_matrix(BlockedMatrix<double>* mat) override;
+    bool analyze_matrix(BlockedMatrix<double>* mat,
+                        BlockedMatrix<double>* jacMat) override;

    // ilu_decomposition
-    bool create_preconditioner(BlockedMatrix *mat) override;
-    bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
+    bool create_preconditioner(BlockedMatrix<double>* mat) override;
+    bool create_preconditioner(BlockedMatrix<double>* mat,
+                               BlockedMatrix<double>* jacMat) override;

    // apply preconditioner, x = prec(y)
    void apply(const cl::Buffer& y, cl::Buffer& x) override;
--- a/opm/simulators/linalg/bda/opencl/CPR.cpp
+++ b/opm/simulators/linalg/bda/opencl/CPR.cpp
@ -61,10 +61,9 @@ void CPR<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::sha
    bilu0->setOpencl(context, queue);
 }

-
 template <unsigned int block_size>
-bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_) {
-
+bool CPR<block_size>::analyze_matrix(BlockedMatrix<double>* mat_)
+{
    this->Nb = mat_->Nb;
    this->nnzb = mat_->nnzbs;
    this->N = Nb * block_size;
@ -76,7 +75,9 @@ bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_) {
 }

 template <unsigned int block_size>
-bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_, BlockedMatrix *jacMat) {
+bool CPR<block_size>::analyze_matrix(BlockedMatrix<double>* mat_,
+                                     BlockedMatrix<double>* jacMat)
+{
    this->Nb = mat_->Nb;
    this->nnzb = mat_->nnzbs;
    this->N = Nb * block_size;
@ -89,7 +90,10 @@ bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_, BlockedMatrix *jacMat)
 }

 template <unsigned int block_size>
-bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_, BlockedMatrix *jacMat) {
+bool CPR<block_size>::
+    create_preconditioner(BlockedMatrix<double>* mat_,
+                          BlockedMatrix<double>* jacMat)
+{
    Dune::Timer t_bilu0;
    bool result = bilu0->create_preconditioner(mat_, jacMat);
    if (verbosity >= 3) {
@ -109,7 +113,8 @@ bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_, BlockedMatrix *
 }

 template <unsigned int block_size>
-bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_) {
+bool CPR<block_size>::create_preconditioner(BlockedMatrix<double>* mat_)
+{
    Dune::Timer t_bilu0;
    bool result = bilu0->create_preconditioner(mat_);
    if (verbosity >= 3) {
@ -212,9 +217,9 @@ void CPR<block_size>::opencl_upload() {
    }
 }

-
 template <unsigned int block_size>
-void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
+void CPR<block_size>::create_preconditioner_amg(BlockedMatrix<double>* mat_)
+{
    this->mat = mat_;

    coarse_vals.resize(nnzb);
--- a/opm/simulators/linalg/bda/opencl/CPR.hpp
+++ b/opm/simulators/linalg/bda/opencl/CPR.hpp
@ -38,7 +38,7 @@ namespace Opm
 namespace Accelerator
 {

-class BlockedMatrix;
+template<class Scalar> class BlockedMatrix;

 /// This class implements a Constrained Pressure Residual (CPR) preconditioner
 template <unsigned int block_size>
@ -73,7 +73,7 @@ private:
    std::once_flag opencl_buffers_allocated;  // only allocate OpenCL Buffers once

    std::unique_ptr<BILU0<block_size> > bilu0;                    // Blocked ILU0 preconditioner
-    BlockedMatrix *mat = nullptr;    // input matrix, blocked
+    BlockedMatrix<double>* mat = nullptr;    // input matrix, blocked

    using DuneMat = Dune::BCRSMatrix<Dune::FieldMatrix<double, 1, 1> >;
    using DuneVec = Dune::BlockVector<Dune::FieldVector<double, 1> >;
@ -112,13 +112,14 @@ private:

    void amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &x);

-    void create_preconditioner_amg(BlockedMatrix *mat);
+    void create_preconditioner_amg(BlockedMatrix<double>* mat);

 public:
    CPR(bool opencl_ilu_parallel, int verbosity);

-    bool analyze_matrix(BlockedMatrix *mat) override;
-    bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
+    bool analyze_matrix(BlockedMatrix<double>* mat) override;
+    bool analyze_matrix(BlockedMatrix<double>* mat,
+                        BlockedMatrix<double>* jacMat) override;

    // set own Opencl variables, but also that of the bilu0 preconditioner
    void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue) override;
@ -127,8 +128,9 @@ public:
    // also applies amg for pressure component
    void apply(const cl::Buffer& y, cl::Buffer& x) override;

-    bool create_preconditioner(BlockedMatrix *mat) override;
-    bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
+    bool create_preconditioner(BlockedMatrix<double>* mat) override;
+    bool create_preconditioner(BlockedMatrix<double>* mat,
+                               BlockedMatrix<double>* jacMat) override;
 };

 // solve A^T * x = b
--- a/opm/simulators/linalg/bda/opencl/OpenclMatrix.cpp
+++ b/opm/simulators/linalg/bda/opencl/OpenclMatrix.cpp
@ -54,7 +54,8 @@ void OpenclMatrix::upload(cl::CommandQueue *queue, Matrix *matrix) {
    upload(queue, matrix->nnzValues.data(), matrix->colIndices.data(), matrix->rowPointers.data());
 }

-void OpenclMatrix::upload(cl::CommandQueue *queue, BlockedMatrix *matrix) {
+void OpenclMatrix::upload(cl::CommandQueue* queue, BlockedMatrix<double>* matrix)
+{
    if (matrix->block_size != block_size) {
        OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to OpenclMatrix with different block_size");
    }
--- a/opm/simulators/linalg/bda/opencl/OpenclMatrix.hpp
+++ b/opm/simulators/linalg/bda/opencl/OpenclMatrix.hpp
@ -30,7 +30,7 @@ namespace Accelerator
 {

 class Matrix;
-class BlockedMatrix;
+template<class Scalar> class BlockedMatrix;

 /// This struct resembles a csr matrix, only doubles are supported
 /// The matrix data is stored in OpenCL Buffers
@ -50,7 +50,7 @@ public:

    void upload(cl::CommandQueue *queue, double *vals, int *cols, int *rows);
    void upload(cl::CommandQueue *queue, Matrix *matrix);
-    void upload(cl::CommandQueue *queue, BlockedMatrix *matrix);
+    void upload(cl::CommandQueue* queue, BlockedMatrix<double>* matrix);

    cl::Buffer nnzValues;
    cl::Buffer colIndices;
--- a/opm/simulators/linalg/bda/opencl/Preconditioner.cpp
+++ b/opm/simulators/linalg/bda/opencl/Preconditioner.cpp
@ -60,21 +60,26 @@ Preconditioner<block_size>::create(Type type, bool opencl_ilu_parallel, int verb
 }

 template <unsigned int block_size>
-bool Preconditioner<block_size>::analyze_matrix(BlockedMatrix *mat, [[maybe_unused]] BlockedMatrix *jacMat) {
+bool Preconditioner<block_size>::
+analyze_matrix(BlockedMatrix<double>* mat,
+               [[maybe_unused]] BlockedMatrix<double>* jacMat)
+{
    return analyze_matrix(mat);
 }

 template <unsigned int block_size>
-bool Preconditioner<block_size>::create_preconditioner(BlockedMatrix *mat, [[maybe_unused]] BlockedMatrix *jacMat) {
+bool Preconditioner<block_size>::
+create_preconditioner(BlockedMatrix<double>* mat,
+                      [[maybe_unused]] BlockedMatrix<double>* jacMat)
+{
    return create_preconditioner(mat);
 }

 #define INSTANTIATE_BDA_FUNCTIONS(n)  \
 template std::unique_ptr<Preconditioner<n> > Preconditioner<n>::create(Type, bool, int);         \
 template void Preconditioner<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&); \
-template bool Preconditioner<n>::analyze_matrix(BlockedMatrix *, BlockedMatrix *);                             \
-template bool Preconditioner<n>::create_preconditioner(BlockedMatrix *, BlockedMatrix *);
-
+template bool Preconditioner<n>::analyze_matrix(BlockedMatrix<double> *, BlockedMatrix<double> *);                             \
+template bool Preconditioner<n>::create_preconditioner(BlockedMatrix<double> *, BlockedMatrix<double> *);

 INSTANTIATE_BDA_FUNCTIONS(1);
 INSTANTIATE_BDA_FUNCTIONS(2);
--- a/opm/simulators/linalg/bda/opencl/Preconditioner.hpp
+++ b/opm/simulators/linalg/bda/opencl/Preconditioner.hpp
@ -29,7 +29,7 @@ namespace Opm
 namespace Accelerator
 {

-class BlockedMatrix;
+template<class Scalar> class BlockedMatrix;

 template <unsigned int block_size>
 class Preconditioner
@ -73,13 +73,15 @@ public:
    // analyze matrix, e.g. the sparsity pattern
    // probably only called once
    // the version with two params can be overloaded, if not, it will default to using the one param version
-    virtual bool analyze_matrix(BlockedMatrix *mat) = 0;
-    virtual bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat);
+    virtual bool analyze_matrix(BlockedMatrix<double>* mat) = 0;
+    virtual bool analyze_matrix(BlockedMatrix<double>* mat,
+                                BlockedMatrix<double>* jacMat);

    // create/update preconditioner, probably used every linear solve
    // the version with two params can be overloaded, if not, it will default to using the one param version
-    virtual bool create_preconditioner(BlockedMatrix *mat) = 0;
-    virtual bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat);
+    virtual bool create_preconditioner(BlockedMatrix<double>* mat) = 0;
+    virtual bool create_preconditioner(BlockedMatrix<double>* mat,
+                                       BlockedMatrix<double>* jacMat);
 };

 } //namespace Accelerator
--- a/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp
@ -405,9 +405,11 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
    }
 }

-
 template <unsigned int block_size>
-void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix) {
+void openclSolverBackend<block_size>::
+initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
+           std::shared_ptr<BlockedMatrix<double>> jacMatrix)
+{
    this->Nb = matrix->Nb;
    this->N = Nb * block_size;
    this->nnzb = matrix->nnzbs;
@ -637,13 +639,13 @@ void openclSolverBackend<block_size>::get_result(double *x) {
    }
 } // end get_result()

-
 template <unsigned int block_size>
-SolverStatus openclSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
-                                                           double *b,
-                                                           std::shared_ptr<BlockedMatrix> jacMatrix,
-                                                           WellContributions& wellContribs,
-                                                           BdaResult &res)
+SolverStatus openclSolverBackend<block_size>::
+solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
+             double *b,
+             std::shared_ptr<BlockedMatrix<double>> jacMatrix,
+             WellContributions& wellContribs,
+             BdaResult& res)
 {
    if (initialized == false) {
        initialize(matrix, jacMatrix);
--- a/opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp
@ -67,8 +67,8 @@ private:
                                                                  // can perform blocked ILU0 and AMG on pressure component
    bool is_root;                                                 // allow for nested solvers, the root solver is called by BdaBridge
    bool analysis_done = false;
-    std::shared_ptr<BlockedMatrix> mat = nullptr;                 // original matrix
-    std::shared_ptr<BlockedMatrix> jacMat = nullptr;              // matrix for preconditioner
+    std::shared_ptr<BlockedMatrix<double>> mat{};                 // original matrix
+    std::shared_ptr<BlockedMatrix<double>> jacMat{};              // matrix for preconditioner
    bool opencl_ilu_parallel;                                     // parallelize ILU operations (with level_scheduling)
    std::vector<cl::Event> events;
    cl_int err;
@ -81,7 +81,8 @@ private:
    /// Initialize GPU and allocate memory
    /// \param[in] matrix     matrix A
    /// \param[in] jacMatrix  matrix for preconditioner
-    void initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix);
+    void initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
+                    std::shared_ptr<BlockedMatrix<double>> jacMatrix);

    /// Copy linear system to GPU
    void copy_system_to_gpu();
@ -134,8 +135,11 @@ public:
    /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
    /// \param[inout] res         summary of solver result
    /// \return                   status code
-    SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
-        std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
+    SolverStatus solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
+                              double *b,
+                              std::shared_ptr<BlockedMatrix<double>> jacMatrix,
+                              WellContributions& wellContribs,
+                              BdaResult& res) override;

    /// Solve scalar linear system, for example a coarse system of an AMG preconditioner
    /// Data is already on the GPU
--- a/opm/simulators/linalg/bda/rocalutionSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/rocalutionSolverBackend.cpp
@ -76,9 +76,9 @@ rocalutionSolverBackend<block_size>::~rocalutionSolverBackend() {
    rocalution::stop_rocalution();
 }

-
 template <unsigned int block_size>
-void rocalutionSolverBackend<block_size>::initialize(BlockedMatrix *matrix) {
+void rocalutionSolverBackend<block_size>::initialize(BlockedMatrix<double>* matrix)
+{
    this->Nb = matrix->Nb;
    this->N = Nb * block_size;
    this->nnzb = matrix->nnzbs;
@ -94,9 +94,9 @@ void rocalutionSolverBackend<block_size>::initialize(BlockedMatrix *matrix) {
    initialized = true;
 } // end initialize()

-
 template <unsigned int block_size>
-void rocalutionSolverBackend<block_size>::convert_matrix(BlockedMatrix *matrix) {
+void rocalutionSolverBackend<block_size>::convert_matrix(BlockedMatrix<double>* matrix)
+{
    Timer t;

    for(int i = 0; i < Nb+1; ++i){
@ -147,13 +147,13 @@ void rocalutionSolverBackend<block_size>::get_result(double *x) {
    }
 } // end get_result()

-
 template <unsigned int block_size>
-SolverStatus rocalutionSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
-                                                           double *b,
-                                                           [[maybe_unused]] std::shared_ptr<BlockedMatrix> jacMatrix,
-                                                           [[maybe_unused]] WellContributions& wellContribs,
-                                                           BdaResult &res)
+SolverStatus rocalutionSolverBackend<block_size>::
+solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
+             double *b,
+             [[maybe_unused]] std::shared_ptr<BlockedMatrix<double>> jacMatrix,
+             [[maybe_unused]] WellContributions& wellContribs,
+             BdaResult& res)
 {
    if (initialized == false) {
        initialize(matrix.get());
--- a/opm/simulators/linalg/bda/rocalutionSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/rocalutionSolverBackend.hpp
@ -65,12 +65,12 @@ private:

    /// Initialize sizes and allocate memory
    /// \param[in] matrix     matrix A
-    void initialize(BlockedMatrix *matrix);
+    void initialize(BlockedMatrix<double>* matrix);

    /// Convert matrix to rocalution format
    /// copy matrix to raw pointers, which are given to and freed by rocalution
    /// \param[in] matrix     matrix A
-    void convert_matrix(BlockedMatrix *matrix);
+    void convert_matrix(BlockedMatrix<double>* matrix);

 public:

@ -91,8 +91,11 @@ public:
    /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
    /// \param[inout] res         summary of solver result
    /// \return                   status code
-    SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
-        std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
+    SolverStatus solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
+                              double *b,
+                              std::shared_ptr<BlockedMatrix<double>> jacMatrix,
+                              WellContributions& wellContribs,
+                              BdaResult& res) override;

    /// Get result after linear solve, and peform postprocessing if necessary
    /// \param[inout] x          resulting x vector, caller must guarantee that x points to a valid array
--- a/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp
@ -375,9 +375,11 @@ void rocsparseSolverBackend<block_size>::gpu_pbicgstab([[maybe_unused]] WellCont
    }
 }

-
 template <unsigned int block_size>
-void rocsparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix) {
+void rocsparseSolverBackend<block_size>::
+initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
+           std::shared_ptr<BlockedMatrix<double>> jacMatrix)
+{
    this->Nb = matrix->Nb;
    this->N = Nb * block_size;
    this->nnzb = matrix->nnzbs;
@ -634,11 +636,12 @@ void rocsparseSolverBackend<block_size>::get_result(double *x) {


 template <unsigned int block_size>
-SolverStatus rocsparseSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
-                                                              double *b,
-                                                              std::shared_ptr<BlockedMatrix> jacMatrix,
-                                                              WellContributions& wellContribs,
-                                                              BdaResult &res)
+SolverStatus rocsparseSolverBackend<block_size>::
+    solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
+                 double *b,
+                 std::shared_ptr<BlockedMatrix<double>> jacMatrix,
+                 WellContributions& wellContribs,
+                 BdaResult& res)
 {
    if (initialized == false) {
        initialize(matrix, jacMatrix);
--- a/opm/simulators/linalg/bda/rocsparseSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/rocsparseSolverBackend.hpp
@ -54,14 +54,13 @@ class rocsparseSolverBackend : public BdaSolver<block_size>
    using Base::initialized;

 private:
-
    double c_copy = 0.0; // cummulative timer measuring the total time it takes to transfer the data to the GPU

    bool useJacMatrix = false;

    bool analysis_done = false;
-    std::shared_ptr<BlockedMatrix> mat = nullptr;                 // original matrix
-    std::shared_ptr<BlockedMatrix> jacMat = nullptr;              // matrix for preconditioner
+    std::shared_ptr<BlockedMatrix<double>> mat{};                 // original matrix
+    std::shared_ptr<BlockedMatrix<double>> jacMat{};              // matrix for preconditioner
    int nnzbs_prec = 0;    // number of nnz blocks in preconditioner matrix M

    rocsparse_direction dir = rocsparse_direction_row;
@ -93,7 +92,8 @@ private:
    /// Initialize GPU and allocate memory
    /// \param[in] matrix     matrix A
    /// \param[in] jacMatrix  matrix for preconditioner
-    void initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix);
+    void initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
+                    std::shared_ptr<BlockedMatrix<double>> jacMatrix);

    /// Copy linear system to GPU
    /// \param[in] b              input vector, contains N values
@ -138,8 +138,11 @@ public:
    /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
    /// \param[inout] res         summary of solver result
    /// \return                   status code
-    SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
-        std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
+    SolverStatus solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
+                              double* b,
+                              std::shared_ptr<BlockedMatrix<double>> jacMatrix,
+                              WellContributions& wellContribs,
+                              BdaResult& res) override;

    /// Solve scalar linear system, for example a coarse system of an AMG preconditioner
    /// Data is already on the GPU