Changed block_size template to variable for BlockedMatrix

2025-02-25 18:55:30 -06:00 · 2021-11-12 16:11:53 +01:00 · 2021-11-12 16:11:53 +01:00 · 9acffab47e
commit 9acffab47e
parent a6b6a62b27
18 changed files with 73 additions and 87 deletions
--- a/opm/simulators/linalg/bda/BILU0.cpp
+++ b/opm/simulators/linalg/bda/BILU0.cpp
@ -54,7 +54,7 @@ BILU0<block_size>::~BILU0()
 }

    template <unsigned int block_size>
-    bool BILU0<block_size>::init(BlockedMatrix<block_size> *mat)
+    bool BILU0<block_size>::init(BlockedMatrix *mat)
    {
        const unsigned int bs = block_size;

@ -67,14 +67,14 @@ BILU0<block_size>::~BILU0()
        int *CSCColPointers = nullptr;

        if (opencl_ilu_reorder == ILUReorder::NONE) {
-            LUmat = std::make_unique<BlockedMatrix<block_size> >(*mat);
+            LUmat = std::make_unique<BlockedMatrix>(*mat);
        } else {
            toOrder.resize(Nb);
            fromOrder.resize(Nb);
            CSCRowIndices = new int[nnzbs];
            CSCColPointers = new int[Nb + 1];
-            rmat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
-            LUmat = std::make_unique<BlockedMatrix<block_size> >(*rmat);
+            rmat = std::make_shared<BlockedMatrix>(mat->Nb, mat->nnzbs, block_size);
+            LUmat = std::make_unique<BlockedMatrix>(*rmat);

            Timer t_convert;
            csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb);
@ -122,8 +122,8 @@ BILU0<block_size>::~BILU0()
        invDiagVals = new double[mat->Nb * bs * bs];

 #if CHOW_PATEL
-        Lmat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
-        Umat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
+        Lmat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
+        Umat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
 #endif

        LUmat->nnzValues = new double[mat->nnzbs * bs * bs];
@ -166,7 +166,7 @@ BILU0<block_size>::~BILU0()


    template <unsigned int block_size>
-    bool BILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
+    bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
    {
        const unsigned int bs = block_size;
        auto *m = mat;
@ -174,7 +174,7 @@ BILU0<block_size>::~BILU0()
        if (opencl_ilu_reorder != ILUReorder::NONE) {
            m = rmat.get();
            Timer t_reorder;
-            reorderBlockedMatrixByPattern<block_size>(mat, toOrder.data(), fromOrder.data(), rmat.get());
+            reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rmat.get());

            if (verbosity >= 3){
                std::ostringstream out;
@ -308,8 +308,8 @@ void BILU0<block_size>::setOpenCLQueue(cl::CommandQueue *queue_) {
 #define INSTANTIATE_BDA_FUNCTIONS(n)                              \
 template BILU0<n>::BILU0(ILUReorder, int);                        \
 template BILU0<n>::~BILU0();                                      \
-template bool BILU0<n>::init(BlockedMatrix<n>*);                  \
-template bool BILU0<n>::create_preconditioner(BlockedMatrix<n>*); \
+template bool BILU0<n>::init(BlockedMatrix*);                     \
+template bool BILU0<n>::create_preconditioner(BlockedMatrix*);    \
 template void BILU0<n>::apply(const cl::Buffer&, cl::Buffer&);    \
 template void BILU0<n>::setOpenCLContext(cl::Context*);           \
 template void BILU0<n>::setOpenCLQueue(cl::CommandQueue*);
--- a/opm/simulators/linalg/bda/BILU0.hpp
+++ b/opm/simulators/linalg/bda/BILU0.hpp
@ -46,10 +46,10 @@ namespace Accelerator
        int Nb;      // number of blockrows of the matrix
        int nnz;     // number of nonzeroes of the matrix (scalar)
        int nnzbs;   // number of blocks of the matrix
-        std::unique_ptr<BlockedMatrix<block_size> > LUmat = nullptr;
-        std::shared_ptr<BlockedMatrix<block_size> > rmat = nullptr; // only used with PAR_SIM
+        std::unique_ptr<BlockedMatrix> LUmat = nullptr;
+        std::shared_ptr<BlockedMatrix> rmat = nullptr; // only used with PAR_SIM
 #if CHOW_PATEL
-        std::unique_ptr<BlockedMatrix<block_size> > Lmat = nullptr, Umat = nullptr;
+        std::unique_ptr<BlockedMatrix> Lmat = nullptr, Umat = nullptr;
 #endif
        double *invDiagVals = nullptr;
        std::vector<int> diagIndex;
@ -91,10 +91,10 @@ namespace Accelerator
        ~BILU0();

        // analysis
-        bool init(BlockedMatrix<block_size> *mat);
+        bool init(BlockedMatrix *mat);

        // ilu_decomposition
-        bool create_preconditioner(BlockedMatrix<block_size> *mat);
+        bool create_preconditioner(BlockedMatrix *mat);

        // apply preconditioner, x = prec(y)
        void apply(const cl::Buffer& y, cl::Buffer& x);
@ -112,7 +112,7 @@ namespace Accelerator
            return fromOrder.data();
        }

-        BlockedMatrix<block_size>* getRMat()
+        BlockedMatrix* getRMat()
        {
            return rmat.get();
        }
--- a/opm/simulators/linalg/bda/BlockedMatrix.cpp
+++ b/opm/simulators/linalg/bda/BlockedMatrix.cpp
@ -39,8 +39,7 @@ using Opm::OpmLog;

 /*Sort a row of matrix elements from a blocked CSR-format.*/

-template <unsigned int block_size>
-void sortBlockedRow(int *colIndices, double *data, int left, int right) {
+void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned block_size) {
    const unsigned int bs = block_size;
    int l = left;
    int r = right;
@ -65,10 +64,10 @@ void sortBlockedRow(int *colIndices, double *data, int left, int right) {
    } while (l < r);

    if (left < r)
-        sortBlockedRow<bs>(colIndices, data, left, r);
+        sortBlockedRow(colIndices, data, left, r, bs);

    if (right > l)
-        sortBlockedRow<bs>(colIndices, data, l, right);
+        sortBlockedRow(colIndices, data, l, right, bs);
 }


@ -130,8 +129,7 @@ void blockVectMult(double *mat, double *vect, double scale, double *resVect, boo



-template <unsigned int block_size>
-int BlockedMatrix<block_size>::countUnblockedNnzs() {
+int BlockedMatrix::countUnblockedNnzs() {
    int numNnzsOverThreshold = 0;
    int totalNnzs = rowPointers[Nb];
    for (unsigned int idx = 0; idx < totalNnzs * block_size * block_size; idx++) {
@ -146,8 +144,7 @@ int BlockedMatrix<block_size>::countUnblockedNnzs() {
 * Unblock the blocked matrix. Input the blocked matrix and output a CSR matrix without blocks.
 * If unblocking the U matrix, the rows in all blocks need to written to the new matrix in reverse order.
 */
-template <unsigned int block_size>
-void BlockedMatrix<block_size>::unblock(Matrix *mat, bool isUMatrix) {
+void BlockedMatrix::unblock(Matrix *mat, bool isUMatrix) {
    const unsigned int bs = block_size;
    int valIndex = 0, nnzsPerRow;

@ -185,8 +182,7 @@ void BlockedMatrix<block_size>::unblock(Matrix *mat, bool isUMatrix) {

 /*Optimized version*/
 // ub* prefixes indicate unblocked data
-template <unsigned int block_size>
-int BlockedMatrix<block_size>::toRDF(int numColors, int *nodesPerColor, bool isUMatrix,
+int BlockedMatrix::toRDF(int numColors, int *nodesPerColor, bool isUMatrix,
                                     std::vector<std::vector<int> >& colIndicesInColor, int nnzsPerRowLimit, int *nnzValsSizes,
                                     std::vector<std::vector<double> >& ubNnzValues, short int *ubColIndices, unsigned char *NROffsets, int *colorSizes, int *valSize)
 {
@ -225,8 +221,7 @@ int BlockedMatrix<block_size>::toRDF(int numColors, int *nodesPerColor, bool isU
 // PIndicesAddr: contiguously for each color: indices of x in global x vector, unblocked
 //               if color 0 has A unique colAccesses, PIndicesAddr[0 - A] are for color 0
 //               then PIndicesAddr[A - A+B] are for color 1. Directly copied to FPGA
-template <unsigned int block_size>
-int BlockedMatrix<block_size>::findPartitionColumns(int numColors, int *nodesPerColor,
+int BlockedMatrix::findPartitionColumns(int numColors, int *nodesPerColor,
        int rowsPerColorLimit, int columnsPerColorLimit,
        std::vector<std::vector<int> >& colIndicesInColor, int *PIndicesAddr, int *colorSizes,
        std::vector<std::vector<int> >& LColIndicesInColor, int *LPIndicesAddr, int *LColorSizes,
@ -474,7 +469,6 @@ void blockedDiagtoRDF(double *blockedDiagVals, int rowSize, int numColors, std::


 #define INSTANTIATE_BDA_FUNCTIONS(n)                                        \
-template void sortBlockedRow<n>(int *, double *, int, int);                 \
 template void blockMultSub<n>(double *, double *, double *);                \
 template void blockMult<n>(double *, double *, double *);                   \

@ -490,15 +484,7 @@ INSTANTIATE_BDA_FUNCTIONS(6);
 #if HAVE_FPGA
 #define INSTANTIATE_BDA_FPGA_FUNCTIONS(n)                                             \
 template void blockSub<n>(double *, double *, double *);                              \
-template void blockVectMult<n>(double *, double *, double, double *, bool);           \
-template int BlockedMatrix<n>::toRDF(int, int *, bool,                                \
-    std::vector<std::vector<int> >& , int, int *,                                     \
-    std::vector<std::vector<double> >&, short int *, unsigned char *, int *,  int *); \
-template int BlockedMatrix<n>::findPartitionColumns(int, int *,                       \
-        int, int,                                                                     \
-        std::vector<std::vector<int> >& , int *, int *,                               \
-        std::vector<std::vector<int> >& , int *, int *,                               \
-        std::vector<std::vector<int> >& , int *, int *);
+template void blockVectMult<n>(double *, double *, double, double *, bool);

 INSTANTIATE_BDA_FPGA_FUNCTIONS(1);
 INSTANTIATE_BDA_FPGA_FUNCTIONS(2);
--- a/opm/simulators/linalg/bda/BlockedMatrix.hpp
+++ b/opm/simulators/linalg/bda/BlockedMatrix.hpp
@ -39,7 +39,6 @@ namespace Accelerator

 /// This struct resembles a blocked csr matrix, like Dune::BCRSMatrix.
 /// The data is stored in contiguous memory, such that they can be copied to a device in one transfer.
-template<unsigned int block_size>
 class BlockedMatrix
 {

@ -48,12 +47,14 @@ public:
    /// Allocate BlockedMatrix and data arrays with given sizes
    /// \param[in] Nb               number of blockrows
    /// \param[in] nnzbs            number of nonzero blocks
-    BlockedMatrix(int Nb_, int nnzbs_)
-    : nnzValues(new double[nnzbs_*block_size*block_size]),
-      colIndices(new int[nnzbs_*block_size*block_size]),
+    /// \param[in] block_size       the number of rows and columns for each block
+    BlockedMatrix(int Nb_, int nnzbs_, unsigned int block_size_)
+    : nnzValues(new double[nnzbs_*block_size_*block_size_]),
+      colIndices(new int[nnzbs_*block_size_*block_size_]),
      rowPointers(new int[Nb_+1]),
      Nb(Nb_),
      nnzbs(nnzbs_),
+      block_size(block_size_),
      deleteNnzs(true),
      deleteSparsity(true)
    {}
@ -61,11 +62,12 @@ public:
    /// Allocate BlockedMatrix, but copy sparsity pattern instead of allocating new memory
    /// \param[in] M              matrix to be copied
    BlockedMatrix(const BlockedMatrix& M)
-    : nnzValues(new double[M.nnzbs*block_size*block_size]),
+    : nnzValues(new double[M.nnzbs*M.block_size*M.block_size]),
      colIndices(M.colIndices),
      rowPointers(M.rowPointers),
      Nb(M.Nb),
      nnzbs(M.nnzbs),
+      block_size(M.block_size),
      deleteNnzs(true),
      deleteSparsity(false)
    {}
@ -73,15 +75,17 @@ public:
    /// Allocate BlockedMatrix, but let data arrays point to existing arrays
    /// \param[in] Nb             number of blockrows
    /// \param[in] nnzbs          number of nonzero blocks
+    /// \param[in] block_size     the number of rows and columns for each block
    /// \param[in] nnzValues      array of nonzero values, contains nnzb*block_size*block_size scalars
    /// \param[in] colIndices     array of column indices, contains nnzb entries
    /// \param[in] rowPointers    array of row pointers, contains Nb+1 entries
-    BlockedMatrix(int Nb_, int nnzbs_, double *nnzValues_, int *colIndices_, int *rowPointers_)
+    BlockedMatrix(int Nb_, int nnzbs_, unsigned int block_size_, double *nnzValues_, int *colIndices_, int *rowPointers_)
    : nnzValues(nnzValues_),
      colIndices(colIndices_),
      rowPointers(rowPointers_),
      Nb(Nb_),
      nnzbs(nnzbs_),
+      block_size(block_size_),
      deleteNnzs(false),
      deleteSparsity(false)
    {}
@ -123,6 +127,7 @@ public:
    int *rowPointers;
    int Nb;
    int nnzbs;
+    unsigned int block_size;
    bool deleteNnzs;
    bool deleteSparsity;
 };
@ -133,8 +138,8 @@ public:
 /// \param[inout] data           
 /// \param[in] left              lower index of data of row
 /// \param[in] right             upper index of data of row
-template <unsigned int block_size>
-void sortBlockedRow(int *colIndices, double *data, int left, int right);
+/// \param[in] block_size        size of blocks in the row
+void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned block_size);

 /// Multiply and subtract blocks
 /// a = a - (b * c)
--- a/opm/simulators/linalg/bda/CPR.cpp
+++ b/opm/simulators/linalg/bda/CPR.cpp
@ -106,7 +106,7 @@ void solve_transposed_3x3(const double *A, const double *b, double *x) {


 template <unsigned int block_size>
-void CPR<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat_) {
+void CPR<block_size>::create_preconditioner(BlockedMatrix *mat_) {
    this->mat = mat_;

    try{
@ -498,11 +498,8 @@ void CPR<block_size>::apply(const cl::Buffer& y, cl::Buffer& x) {



-#define INSTANTIATE_BDA_FUNCTIONS(n)                                     \
-template CPR<n>::CPR(int, ILUReorder);                                   \
-template void CPR<n>::init(int, int, std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&); \
-template void CPR<n>::apply(const cl::Buffer&, cl::Buffer&);             \
-template void CPR<n>::create_preconditioner(BlockedMatrix<n> *mat);
+#define INSTANTIATE_BDA_FUNCTIONS(n)  \
+template class CPR<n>;

 INSTANTIATE_BDA_FUNCTIONS(1);
 INSTANTIATE_BDA_FUNCTIONS(2);
--- a/opm/simulators/linalg/bda/CPR.hpp
+++ b/opm/simulators/linalg/bda/CPR.hpp
@ -66,7 +66,7 @@ private:
    std::unique_ptr<cl::Buffer> d_coarse_y, d_coarse_x; // stores the scalar vectors
    std::once_flag opencl_buffers_allocated;  // only allocate OpenCL Buffers once

-    BlockedMatrix<block_size> *mat = nullptr;    // input matrix, blocked
+    BlockedMatrix *mat = nullptr;    // input matrix, blocked
    using DuneMat = Dune::BCRSMatrix<Dune::FieldMatrix<double, 1, 1> >;
    using DuneVec = Dune::BlockVector<Dune::FieldVector<double, 1> >;
    using MatrixOperator = Dune::MatrixAdapter<DuneMat, DuneVec, DuneVec>;
@ -106,7 +106,7 @@ public:
    // apply preconditioner, x = prec(y)
    void apply(const cl::Buffer& y, cl::Buffer& x);

-    void create_preconditioner(BlockedMatrix<block_size> *mat);
+    void create_preconditioner(BlockedMatrix *mat);

 };

--- a/opm/simulators/linalg/bda/ChowPatelIlu.cpp
+++ b/opm/simulators/linalg/bda/ChowPatelIlu.cpp
@ -482,7 +482,7 @@ __kernel void chow_patel_ilu_sweep(
 template <unsigned int block_size>
 void ChowPatelIlu<block_size>::decomposition(
    cl::CommandQueue *queue, [[maybe_unused]] cl::Context *context,
-    BlockedMatrix<block_size> *LUmat, BlockedMatrix<block_size> *Lmat, BlockedMatrix<block_size> *Umat,
+    BlockedMatrix *LUmat, BlockedMatrix *Lmat, BlockedMatrix *Umat,
    double *invDiagVals, std::vector<int>& diagIndex,
    cl::Buffer& d_diagIndex, cl::Buffer& d_invDiagVals,
    cl::Buffer& d_Lvals, cl::Buffer& d_Lcols, cl::Buffer& d_Lrows,
--- a/opm/simulators/linalg/bda/ChowPatelIlu.hpp
+++ b/opm/simulators/linalg/bda/ChowPatelIlu.hpp
@ -82,7 +82,7 @@ public:
    /// This function calls gpu_decomposition() if CHOW_PATEL_GPU is set
    void decomposition(
        cl::CommandQueue *queue, cl::Context *context,
-        BlockedMatrix<block_size> *LUmat, BlockedMatrix<block_size> *Lmat, BlockedMatrix<block_size> *Umat,
+        BlockedMatrix *LUmat, BlockedMatrix *Lmat, BlockedMatrix *Umat,
        double *invDiagVals, std::vector<int>& diagIndex,
        cl::Buffer& d_diagIndex, cl::Buffer& d_invDiagVals,
        cl::Buffer& d_Lvals, cl::Buffer& d_Lcols, cl::Buffer& d_Lrows,
--- a/opm/simulators/linalg/bda/FPGABILU0.cpp
+++ b/opm/simulators/linalg/bda/FPGABILU0.cpp
@ -59,7 +59,7 @@ FPGABILU0<block_size>::~FPGABILU0()


 template <unsigned int block_size>
-bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)
+bool FPGABILU0<block_size>::init(BlockedMatrix *mat)
 {
    const unsigned int bs = block_size;

@ -91,8 +91,8 @@ bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)
    }

    Timer t_analysis;
-    rMat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
-    LUMat = std::make_unique<BlockedMatrix<block_size> >(*rMat);
+    rMat = std::make_shared<BlockedMatrix>(mat->Nb, mat->nnzbs, block_size);
+    LUMat = std::make_unique<BlockedMatrix>(*rMat);
    std::ostringstream out;
    if (level_scheduling) {
        out << "FPGABILU0 reordering strategy: " << "level_scheduling\n";
@ -117,7 +117,7 @@ bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)
    int NROffsetSize = 0, LNROffsetSize = 0, UNROffsetSize = 0;
    int blockDiagSize = 0;
    // This reordering is needed here only to te result can be used to calculate worst-case scenario array sizes
-    reorderBlockedMatrixByPattern<bs>(mat, toOrder.data(), fromOrder.data(), rMat.get());
+    reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rMat.get());
    int doneRows = 0;
    for (int c = 0; c < numColors; c++) {
        for (int i = doneRows; i < doneRows + rowsPerColor[c]; i++) {
@ -187,8 +187,8 @@ bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)

    diagIndex.resize(mat->Nb, 0);
    invDiagVals = new double[mat->Nb * bs * bs];
-    LMat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
-    UMat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
+    LMat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
+    UMat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
    resultPointers[0] = (void *) colorSizes.data();
    resultPointers[1] = (void *) PIndicesAddr.data();
    resultPointers[2] = (void *) nnzValues.data();
@ -232,11 +232,11 @@ bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)


 template <unsigned int block_size>
-bool FPGABILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
+bool FPGABILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
 {
    const unsigned int bs = block_size;
    Timer t_reorder;
-    reorderBlockedMatrixByPattern<bs>(mat, toOrder.data(), fromOrder.data(), rMat.get());
+    reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rMat.get());

    if (verbosity >= 3) {
        std::ostringstream out;
@ -402,8 +402,8 @@ bool FPGABILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat
 #define INSTANTIATE_BDA_FUNCTIONS(n)                                    \
 template FPGABILU0<n>::FPGABILU0(ILUReorder, int, int, int, int, int);  \
 template FPGABILU0<n>::~FPGABILU0();                                    \
-template bool FPGABILU0<n>::init(BlockedMatrix<n> *);                   \
-template bool FPGABILU0<n>::create_preconditioner(BlockedMatrix<n> *);  \
+template bool FPGABILU0<n>::init(BlockedMatrix*);                       \
+template bool FPGABILU0<n>::create_preconditioner(BlockedMatrix *);

 INSTANTIATE_BDA_FUNCTIONS(1);
 INSTANTIATE_BDA_FUNCTIONS(2);
--- a/opm/simulators/linalg/bda/FPGABILU0.hpp
+++ b/opm/simulators/linalg/bda/FPGABILU0.hpp
@ -45,8 +45,8 @@ private:
    int Nb;      // number of blockrows of the matrix
    int nnz;     // number of nonzeroes of the matrix (scalar)
    int nnzbs;   // number of blocks of the matrix
-    std::unique_ptr<BlockedMatrix<block_size> > LMat = nullptr, UMat = nullptr, LUMat = nullptr;
-    std::shared_ptr<BlockedMatrix<block_size> > rMat = nullptr; // reordered mat
+    std::unique_ptr<BlockedMatrix> LMat = nullptr, UMat = nullptr, LUMat = nullptr;
+    std::shared_ptr<BlockedMatrix> rMat = nullptr; // reordered mat
    double *invDiagVals = nullptr;
    std::vector<int> diagIndex;
    std::vector<int> toOrder, fromOrder;
@ -82,10 +82,10 @@ public:
    ~FPGABILU0();

    // analysis (optional)
-    bool init(BlockedMatrix<block_size> *mat);
+    bool init(BlockedMatrix *mat);

    // ilu_decomposition
-    bool create_preconditioner(BlockedMatrix<block_size> *mat);
+    bool create_preconditioner(BlockedMatrix *mat);

    int* getToOrder()
    {
@ -97,7 +97,7 @@ public:
        return fromOrder.data();
    }

-    BlockedMatrix<block_size>* getRMat()
+    BlockedMatrix* getRMat()
    {
        return rMat.get();
    }
--- a/opm/simulators/linalg/bda/FPGASolverBackend.cpp
+++ b/opm/simulators/linalg/bda/FPGASolverBackend.cpp
@ -262,7 +262,7 @@ void FpgaSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double

    // allocate host memory for matrices and vectors
    // actual data for mat points to std::vector.data() in ISTLSolverEbos, so no alloc/free here
-    mat.reset(new BlockedMatrix<block_size>(N_ / block_size, nnz_ / block_size / block_size, vals, cols, rows));
+    mat.reset(new BlockedMatrix(N_ / block_size, nnz_ / block_size / block_size, block_size, vals, cols, rows));

    std::ostringstream oss;
    oss << "Initializing FPGA data, matrix size: " << this->N << " blocks, nnz: " << this->nnzb << " blocks, " << \
--- a/opm/simulators/linalg/bda/FPGASolverBackend.hpp
+++ b/opm/simulators/linalg/bda/FPGASolverBackend.hpp
@ -56,8 +56,8 @@ private:
    bool level_scheduling = false;

    // LUMat will shallow copy rowPointers and colIndices of mat/rMat
-    std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr;
-    BlockedMatrix<block_size> *rMat = nullptr;
+    std::unique_ptr<BlockedMatrix> mat = nullptr;
+    BlockedMatrix *rMat = nullptr;
    std::unique_ptr<Preconditioner> prec = nullptr;

    // vectors with data processed by the preconditioner (input to the kernel)
--- a/opm/simulators/linalg/bda/Matrix.cpp
+++ b/opm/simulators/linalg/bda/Matrix.cpp
@ -53,7 +53,7 @@ void OpenclMatrix<block_size>::upload(cl::CommandQueue *queue, Matrix *matrix) {
 }

 template <unsigned int block_size>
-void OpenclMatrix<block_size>::upload(cl::CommandQueue *queue, BlockedMatrix<block_size> *matrix) {
+void OpenclMatrix<block_size>::upload(cl::CommandQueue *queue, BlockedMatrix *matrix) {
    upload(queue, matrix->nnzValues, matrix->colIndices, matrix->rowPointers);
 }

--- a/opm/simulators/linalg/bda/Matrix.hpp
+++ b/opm/simulators/linalg/bda/Matrix.hpp
@ -50,7 +50,7 @@ public:

    void upload(cl::CommandQueue *queue, double *vals, int *cols, int *rows);
    void upload(cl::CommandQueue *queue, Matrix *matrix);
-    void upload(cl::CommandQueue *queue, BlockedMatrix<block_size> *matrix);
+    void upload(cl::CommandQueue *queue, BlockedMatrix *matrix);

    cl::Buffer nnzValues;
    cl::Buffer colIndices;
--- a/opm/simulators/linalg/bda/Reorder.cpp
+++ b/opm/simulators/linalg/bda/Reorder.cpp
@ -177,10 +177,10 @@ int colorBlockedNodes(int rows, const int *CSRRowPointers, const int *CSRColIndi
 /* Reorder a matrix by a specified input order.
 * Both a to order array, which contains for every node from the old matrix where it will move in the new matrix,
 * and the from order, which contains for every node in the new matrix where it came from in the old matrix.*/
+void reorderBlockedMatrixByPattern(BlockedMatrix *mat, int *toOrder, int *fromOrder, BlockedMatrix *rmat) {
+    assert(mat->block_size == rmat->block_size);

-template <unsigned int block_size>
-void reorderBlockedMatrixByPattern(BlockedMatrix<block_size> *mat, int *toOrder, int *fromOrder, BlockedMatrix<block_size> *rmat) {
-    const unsigned int bs = block_size;
+    const unsigned int bs = mat->block_size;
    int rIndex = 0;
    int i, k;
    unsigned int j;
@ -204,7 +204,7 @@ void reorderBlockedMatrixByPattern(BlockedMatrix<block_size> *mat, int *toOrder,
    }
    // re-sort the column indices of every row.
    for (i = 0; i < mat->Nb; i++) {
-        sortBlockedRow<bs>(rmat->colIndices, rmat->nnzValues, rmat->rowPointers[i], rmat->rowPointers[i + 1] - 1);
+        sortBlockedRow(rmat->colIndices, rmat->nnzValues, rmat->rowPointers[i], rmat->rowPointers[i + 1] - 1, bs);
    }
 }

@ -370,7 +370,6 @@ void csrPatternToCsc(int *CSRColIndices, int *CSRRowPointers, int *CSCRowIndices

 #define INSTANTIATE_BDA_FUNCTIONS(n)                                                                                                            \
 template int colorBlockedNodes<n>(int, const int *, const int *, const int *, const int *, std::vector<int>&, int, int);                        \
-template void reorderBlockedMatrixByPattern<n>(BlockedMatrix<n> *, int *, int *, BlockedMatrix<n> *);                                           \
 template void reorderBlockedVectorByPattern<n>(int, double*, int*, double*);                                                                    \
 template void findGraphColoring<n>(const int *, const int *, const int *, const int *, int, int, int, int *, int *, int *, std::vector<int>&);  \

--- a/opm/simulators/linalg/bda/Reorder.hpp
+++ b/opm/simulators/linalg/bda/Reorder.hpp
@ -52,9 +52,8 @@ int colorBlockedNodes(int rows, const int *CSRRowPointers, const int *CSRColIndi
 /// \param[in] mat           matrix to be reordered
 /// \param[in] toOrder       reorder pattern that lists for each index in the original order, to which index in the new order it should be moved
 /// \param[in] fromOrder     reorder pattern that lists for each index in the new order, from which index in the original order it was moved
-/// \param[inout] rMat       reordered Matrix 
-template <unsigned int block_size>
-void reorderBlockedMatrixByPattern(BlockedMatrix<block_size> *mat, int *toOrder, int *fromOrder, BlockedMatrix<block_size> *rmat);
+/// \param[inout] rmat       reordered Matrix
+void reorderBlockedMatrixByPattern(BlockedMatrix *mat, int *toOrder, int *fromOrder, BlockedMatrix *rmat);

 /// Compute reorder mapping from the color that each node has received
 /// The toOrder, fromOrder and iters arrays must be allocated already
--- a/opm/simulators/linalg/bda/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp
@ -407,7 +407,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
 #if COPY_ROW_BY_ROW
        vals_contiguous = new double[N];
 #endif
-        mat.reset(new BlockedMatrix<block_size>(Nb, nnzb, vals, cols, rows));
+        mat.reset(new BlockedMatrix(Nb, nnzb, block_size, vals, cols, rows));

        d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
        d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
--- a/opm/simulators/linalg/bda/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp
@ -75,8 +75,8 @@ private:
    bool use_cpr;                                                 // allow to enable CPR
    int *toOrder = nullptr, *fromOrder = nullptr;                 // BILU0 reorders rows of the matrix via these mappings
    bool analysis_done = false;
-    std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr;    // original matrix 
-    BlockedMatrix<block_size> *rmat = nullptr;                    // reordered matrix (or original if no reordering), used for spmv
+    std::unique_ptr<BlockedMatrix> mat = nullptr;                 // original matrix
+    BlockedMatrix *rmat = nullptr;                                // reordered matrix (or original if no reordering), used for spmv
    ILUReorder opencl_ilu_reorder;                                // reordering strategy
    std::vector<cl::Event> events;
    cl_int err;