Merge pull request #5380 from akva2/linalg_template_scalar

LinAlg classes: template Scalar type
Arne Morten Kvarving 2024-05-31 08:40:11 +02:00 committed by GitHub
commit bcbac79486
57 changed files with 2346 additions and 1863 deletions
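The change throughout this PR follows one pattern: classes that previously hard-coded double gain a Scalar template parameter, and the source files instantiate them explicitly for double (float can be added later). A minimal sketch of that pattern, using a hypothetical class that is not part of the OPM sources:

#include <vector>

template<class Scalar>
class DiagonalScaler
{
public:
    explicit DiagonalScaler(Scalar factor) : factor_(factor) {}

    // Scale every entry of a vector in place.
    void apply(std::vector<Scalar>& x) const
    {
        for (auto& v : x) {
            v *= factor_;
        }
    }

private:
    Scalar factor_;
};

// Explicit instantiation in the .cpp keeps the template definitions out of the
// headers; this commit does the same via INSTANTIATE_TYPE(double) macros.
template class DiagonalScaler<double>;
template class DiagonalScaler<float>;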

View File

@ -31,7 +31,7 @@ endif()
foreach(CL ${CL_LIST})
get_filename_component(FNAME ${CL} NAME_WE)
file(APPEND ${CL_SRC_FILE} "const std::string OpenclKernels::${FNAME}_str = R\"\( \n")
file(APPEND ${CL_SRC_FILE} "template<> const std::string OpenclKernels<double>::${FNAME}_str = R\"\( \n")
file(READ "${CL}" CL_CONTENT)
file(APPEND ${CL_SRC_FILE} "${CL_CONTENT}")
file(APPEND ${CL_SRC_FILE} "\)\"; \n\n")
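The generated source file now defines each kernel string as an explicit specialization of a templated OpenclKernels class. Roughly the shape of the emitted code, with a made-up kernel name and body (the real class declaration lives in an OPM header, not in the generated file):

#include <string>

// Stand-in declaration; in OPM the class is declared elsewhere.
template<class Scalar>
class OpenclKernels
{
public:
    static const std::string example_kernel_str;
};

// What the file(APPEND ...) lines above emit per kernel, specialized for double.
template<> const std::string OpenclKernels<double>::example_kernel_str = R"(
__kernel void example_kernel(__global double* x, const int N)
{
    const int i = get_global_id(0);
    if (i < N) { x[i] *= 2.0; }
}
)";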

View File

@ -50,15 +50,14 @@
std::shared_ptr<std::thread> copyThread;
#endif // HAVE_OPENMP
namespace Opm {
namespace detail {
namespace Opm::detail {
template<class Matrix, class Vector>
BdaSolverInfo<Matrix,Vector>::
BdaSolverInfo(const std::string& accelerator_mode,
const int linear_solver_verbosity,
const int maxit,
const double tolerance,
const Scalar tolerance,
const int platformID,
const int deviceID,
const bool opencl_ilu_parallel,
@ -104,7 +103,7 @@ apply(Vector& rhs,
{
bool use_gpu = bridge_->getUseGpu();
if (use_gpu) {
auto wellContribs = WellContributions::create(accelerator_mode_, useWellConn);
auto wellContribs = WellContributions<Scalar>::create(accelerator_mode_, useWellConn);
bridge_->initWellContributions(*wellContribs, x.N() * x[0].N());
// the WellContributions can only be applied separately with CUDA, OpenCL or rocsparse, not with amgcl or rocalution
@ -179,8 +178,9 @@ blockJacobiAdjacency(const Grid& grid,
const auto& gridView = grid.leafGridView();
auto elemIt = gridView.template begin<0>(); // should never overrun, since blockJacobiForGPUILU0_ is initialized with numCells rows
//Loop over cells
for (Iter row = blockJacobiForGPUILU0_->createbegin(); row != blockJacobiForGPUILU0_->createend(); ++elemIt, ++row)
// Loop over cells
for (Iter row = blockJacobiForGPUILU0_->createbegin();
row != blockJacobiForGPUILU0_->createend(); ++elemIt, ++row)
{
const auto& elem = *elemIt;
size_type idx = lid.id(elem);
@ -221,25 +221,26 @@ copyMatToBlockJac(const Matrix& mat, Matrix& blockJac)
auto outerCol = (*outerRow).begin();
for (auto col = (*row).begin(); col != (*row).end(); ++col) {
// outerRow is guaranteed to have all column entries that row has!
while(outerCol.index() < col.index()) ++outerCol;
while (outerCol.index() < col.index()) {
++outerCol;
}
assert(outerCol.index() == col.index());
*col = *outerCol; // copy nonzero block
}
}
}
template<int Dim>
using BM = Dune::BCRSMatrix<MatrixBlock<double,Dim,Dim>>;
template<int Dim>
using BV = Dune::BlockVector<Dune::FieldVector<double,Dim>>;
template<class Scalar, int Dim>
using BM = Dune::BCRSMatrix<MatrixBlock<Scalar,Dim,Dim>>;
template<class Scalar, int Dim>
using BV = Dune::BlockVector<Dune::FieldVector<Scalar,Dim>>;
#define INSTANCE_GRID(Dim, Grid) \
template void BdaSolverInfo<BM<Dim>,BV<Dim>>:: \
prepare(const Grid&, \
const Dune::CartesianIndexMapper<Grid>&, \
const std::vector<Well>&, \
const std::vector<int>&, \
#define INSTANTIATE_GRID(T, Dim, Grid) \
template void BdaSolverInfo<BM<T,Dim>,BV<T,Dim>>:: \
prepare(const Grid&, \
const Dune::CartesianIndexMapper<Grid>&, \
const std::vector<Well>&, \
const std::vector<int>&, \
const std::size_t, const bool);
using PolyHedralGrid3D = Dune::PolyhedralGrid<3, 3>;
#if HAVE_DUNE_ALUGRID
@ -248,23 +249,26 @@ using PolyHedralGrid3D = Dune::PolyhedralGrid<3, 3>;
#else
using ALUGrid3CN = Dune::ALUGrid<3, 3, Dune::cube, Dune::nonconforming, Dune::ALUGridNoComm>;
#endif //HAVE_MPI
#define INSTANCE(Dim) \
template struct BdaSolverInfo<BM<Dim>,BV<Dim>>; \
INSTANCE_GRID(Dim,Dune::CpGrid) \
INSTANCE_GRID(Dim,ALUGrid3CN) \
INSTANCE_GRID(Dim,PolyHedralGrid3D)
#define INSTANTIATE(T,Dim) \
template struct BdaSolverInfo<BM<T,Dim>,BV<T,Dim>>; \
INSTANTIATE_GRID(T,Dim,Dune::CpGrid) \
INSTANTIATE_GRID(T,Dim,ALUGrid3CN) \
INSTANTIATE_GRID(T,Dim,PolyHedralGrid3D)
#else
#define INSTANCE(Dim) \
template struct BdaSolverInfo<BM<Dim>,BV<Dim>>; \
INSTANCE_GRID(Dim,Dune::CpGrid) \
INSTANCE_GRID(Dim,PolyHedralGrid3D)
#define INSTANTIATE(T,Dim) \
template struct BdaSolverInfo<BM<T,Dim>,BV<T,Dim>>; \
INSTANTIATE_GRID(T,Dim,Dune::CpGrid) \
INSTANTIATE_GRID(T,Dim,PolyHedralGrid3D)
#endif
INSTANCE(1)
INSTANCE(2)
INSTANCE(3)
INSTANCE(4)
INSTANCE(5)
INSTANCE(6)
} // namespace detail
} // namespace Opm
#define INSTANTIATE_TYPE(T) \
INSTANTIATE(T,1) \
INSTANTIATE(T,2) \
INSTANTIATE(T,3) \
INSTANTIATE(T,4) \
INSTANTIATE(T,5) \
INSTANTIATE(T,6)
INSTANTIATE_TYPE(double)
} // namespace Opm::detail
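INSTANTIATE_GRID above explicitly instantiates the member function template prepare<Grid>() of the class template BdaSolverInfo, once per (type, dimension, grid) combination. A stripped-down sketch of that C++ mechanism with illustrative names:

template<class Matrix>
struct Info
{
    template<class Grid>
    void prepare(const Grid& grid);
};

template<class Matrix>
template<class Grid>
void Info<Matrix>::prepare(const Grid&)
{
    // real code would build per-grid data here
}

struct GridA {};

// Explicit instantiation of the member template for one (Matrix, Grid) pair,
// which is what each INSTANTIATE_GRID(T, Dim, Grid) line expands to.
template void Info<int>::prepare(const GridA&);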

View File

@ -35,60 +35,61 @@ namespace Opm {
class Well;
template<class Matrix, class Vector, int block_size> class BdaBridge;
class WellContributions;
template<class Scalar> class WellContributions;
namespace detail {
template<class Matrix, class Vector>
struct BdaSolverInfo
{
using WellContribFunc = std::function<void(WellContributions&)>;
using Bridge = BdaBridge<Matrix,Vector,Matrix::block_type::rows>;
using Scalar = typename Vector::field_type;
using WellContribFunc = std::function<void(WellContributions<Scalar>&)>;
using Bridge = BdaBridge<Matrix,Vector,Matrix::block_type::rows>;
BdaSolverInfo(const std::string& accelerator_mode,
const int linear_solver_verbosity,
const int maxit,
const double tolerance,
const int platformID,
const int deviceID,
const bool opencl_ilu_parallel,
const std::string& linsolver);
BdaSolverInfo(const std::string& accelerator_mode,
const int linear_solver_verbosity,
const int maxit,
const Scalar tolerance,
const int platformID,
const int deviceID,
const bool opencl_ilu_parallel,
const std::string& linsolver);
~BdaSolverInfo();
~BdaSolverInfo();
template<class Grid>
void prepare(const Grid& grid,
const Dune::CartesianIndexMapper<Grid>& cartMapper,
const std::vector<Well>& wellsForConn,
const std::vector<int>& cellPartition,
const std::size_t nonzeroes,
const bool useWellConn);
template<class Grid>
void prepare(const Grid& grid,
const Dune::CartesianIndexMapper<Grid>& cartMapper,
const std::vector<Well>& wellsForConn,
const std::vector<int>& cellPartition,
const std::size_t nonzeroes,
const bool useWellConn);
bool apply(Vector& rhs,
const bool useWellConn,
WellContribFunc getContribs,
const int rank,
Matrix& matrix,
Vector& x,
Dune::InverseOperatorResult& result);
bool apply(Vector& rhs,
const bool useWellConn,
WellContribFunc getContribs,
const int rank,
Matrix& matrix,
Vector& x,
Dune::InverseOperatorResult& result);
bool gpuActive();
bool gpuActive();
int numJacobiBlocks_ = 0;
int numJacobiBlocks_ = 0;
private:
/// Create sparsity pattern for block-Jacobi matrix based on partitioning of grid.
/// Do not initialize the values, that is done in copyMatToBlockJac()
template<class Grid>
void blockJacobiAdjacency(const Grid& grid,
const std::vector<int>& cell_part,
std::size_t nonzeroes);
/// Create sparsity pattern for block-Jacobi matrix based on partitioning of grid.
/// Do not initialize the values, that is done in copyMatToBlockJac()
template<class Grid>
void blockJacobiAdjacency(const Grid& grid,
const std::vector<int>& cell_part,
std::size_t nonzeroes);
void copyMatToBlockJac(const Matrix& mat, Matrix& blockJac);
void copyMatToBlockJac(const Matrix& mat, Matrix& blockJac);
std::unique_ptr<Bridge> bridge_;
std::string accelerator_mode_;
std::unique_ptr<Matrix> blockJacobiForGPUILU0_;
std::vector<std::set<int>> wellConnectionsGraph_;
std::unique_ptr<Bridge> bridge_;
std::string accelerator_mode_;
std::unique_ptr<Matrix> blockJacobiForGPUILU0_;
std::vector<std::set<int>> wellConnectionsGraph_;
};
}
@ -249,8 +250,8 @@ public:
// Solve system.
Dune::InverseOperatorResult result;
std::function<void(WellContributions&)> getContribs =
[this](WellContributions& w)
std::function<void(WellContributions<Scalar>&)> getContribs =
[this](WellContributions<Scalar>& w)
{
this->simulator_.problem().wellModel().getWellContributions(w);
};
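The header now derives Scalar from the vector type (using Scalar = typename Vector::field_type) and threads it through the well-contributions callback. A self-contained sketch of that wiring; the Vec and WellContribs types below are simplified stand-ins, not the OPM classes:

#include <functional>

struct Vec { using field_type = double; };  // stand-in for Dune::BlockVector<...>

template<class Scalar>
struct WellContribs { Scalar sum = 0; };

template<class Vector>
void solveWithWells(const std::function<void(WellContribs<typename Vector::field_type>&)>& getContribs)
{
    WellContribs<typename Vector::field_type> w;
    getContribs(w);  // caller adds its well terms before the accelerated solve
}

int main()
{
    solveWithWells<Vec>([](WellContribs<double>& w) { w.sum += 1.0; });
}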

View File

@ -50,11 +50,11 @@
#include <opm/simulators/linalg/PreconditionerFactoryGPUIncludeWrapper.hpp>
namespace Opm
{
namespace Opm {
template <class Smoother>
struct AMGSmootherArgsHelper {
struct AMGSmootherArgsHelper
{
static auto args(const PropertyTree& prm)
{
using SmootherArgs = typename Dune::Amg::SmootherTraits<Smoother>::Arguments;
@ -69,10 +69,11 @@ struct AMGSmootherArgsHelper {
};
template <class M, class V, class C>
struct AMGSmootherArgsHelper<Opm::ParallelOverlappingILU0<M, V, V, C>> {
struct AMGSmootherArgsHelper<ParallelOverlappingILU0<M, V, V, C>>
{
static auto args(const PropertyTree& prm)
{
using Smoother = Opm::ParallelOverlappingILU0<M, V, V, C>;
using Smoother = ParallelOverlappingILU0<M, V, V, C>;
using SmootherArgs = typename Dune::Amg::SmootherTraits<Smoother>::Arguments;
SmootherArgs smootherArgs;
smootherArgs.iterations = prm.get<int>("iterations", 1);
@ -88,7 +89,6 @@ struct AMGSmootherArgsHelper<Opm::ParallelOverlappingILU0<M, V, V, C>> {
}
};
// trailing return type with decltype is used to detect the existence of the setUseFixedOrder member function via overload resolution
template <typename C>
auto setUseFixedOrder(C criterion, bool booleanValue) -> decltype(criterion.setUseFixedOrder(booleanValue))
@ -209,7 +209,7 @@ struct StandardPreconditioners {
const std::string smoother = prm.get<std::string>("smoother", "ParOverILU0");
// TODO: merge this with ILUn, and possibly simplify the factory to only work with ILU?
if (smoother == "ILU0" || smoother == "ParOverILU0") {
using Smoother = Opm::ParallelOverlappingILU0<M, V, V, C>;
using Smoother = ParallelOverlappingILU0<M, V, V, C>;
auto crit = AMGHelper<O, C, M, V>::criterion(prm);
auto sargs = AMGSmootherArgsHelper<Smoother>::args(prm);
PrecPtr prec = std::make_shared<Dune::Amg::AMGCPR<O, V, Smoother, C>>(op, crit, sargs, comm);
@ -279,7 +279,8 @@ struct StandardPreconditioners {
OPM_THROW(std::logic_error,
"Pressure index out of bounds. It needs to specified for CPR");
}
using LevelTransferPolicy = Opm::PressureTransferPolicy<O, Comm, false>;
using Scalar = typename V::field_type;
using LevelTransferPolicy = PressureTransferPolicy<O, Comm, Scalar, false>;
return std::make_shared<OwningTwoLevelPreconditioner<O, V, LevelTransferPolicy, Comm>>(
op, prm, weightsCalculator, pressureIndex, comm);
});
@ -294,7 +295,8 @@ struct StandardPreconditioners {
OPM_THROW(std::logic_error,
"Pressure index out of bounds. It needs to specified for CPR");
}
using LevelTransferPolicy = Opm::PressureTransferPolicy<O, Comm, true>;
using Scalar = typename V::field_type;
using LevelTransferPolicy = PressureTransferPolicy<O, Comm, Scalar, true>;
return std::make_shared<OwningTwoLevelPreconditioner<O, V, LevelTransferPolicy, Comm>>(
op, prm, weightsCalculator, pressureIndex, comm);
});
@ -311,7 +313,8 @@ struct StandardPreconditioners {
OPM_THROW(std::logic_error,
"Pressure index out of bounds. It needs to specified for CPR");
}
using LevelTransferPolicy = Opm::PressureBhpTransferPolicy<O, Comm, false>;
using Scalar = typename V::field_type;
using LevelTransferPolicy = PressureBhpTransferPolicy<O, Comm, Scalar, false>;
return std::make_shared<OwningTwoLevelPreconditioner<O, V, LevelTransferPolicy, Comm>>(
op, prm, weightsCalculator, pressureIndex, comm);
});
@ -321,12 +324,12 @@ struct StandardPreconditioners {
F::addCreator("CUILU0", [](const O& op, const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
const double w = prm.get<double>("relaxation", 1.0);
using field_type = typename V::field_type;
using CuILU0 = typename Opm::cuistl::
CuSeqILU0<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
using CuILU0 = typename cuistl::
CuSeqILU0<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
auto cuILU0 = std::make_shared<CuILU0>(op.getmat(), w);
auto adapted = std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CuILU0>>(cuILU0);
auto wrapped = std::make_shared<Opm::cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
auto adapted = std::make_shared<cuistl::PreconditionerAdapter<V, V, CuILU0>>(cuILU0);
auto wrapped = std::make_shared<cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
return wrapped;
});
@ -334,21 +337,21 @@ struct StandardPreconditioners {
const double w = prm.get<double>("relaxation", 1.0);
using field_type = typename V::field_type;
using CuJac =
typename Opm::cuistl::CuJac<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
typename cuistl::CuJac<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
auto cuJac = std::make_shared<CuJac>(op.getmat(), w);
auto adapted = std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CuJac>>(cuJac);
auto wrapped = std::make_shared<Opm::cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
auto adapted = std::make_shared<cuistl::PreconditionerAdapter<V, V, CuJac>>(cuJac);
auto wrapped = std::make_shared<cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
return wrapped;
});
F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
using field_type = typename V::field_type;
using CuDILU = typename Opm::cuistl::CuDILU<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
using CuDILU = typename cuistl::CuDILU<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
auto cuDILU = std::make_shared<CuDILU>(op.getmat());
auto adapted = std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CuDILU>>(cuDILU);
auto wrapped = std::make_shared<Opm::cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
auto adapted = std::make_shared<cuistl::PreconditionerAdapter<V, V, CuDILU>>(cuDILU);
auto wrapped = std::make_shared<cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
return wrapped;
});
#endif
@ -368,11 +371,11 @@ struct StandardPreconditioners {
// Already a parallel preconditioner. Need to pass comm, but no need to wrap it in a BlockPreconditioner.
if (ilulevel == 0) {
const std::size_t num_interior = interiorIfGhostLast(comm);
return std::make_shared<Opm::ParallelOverlappingILU0<M, V, V, Comm>>(
op.getmat(), comm, w, Opm::MILU_VARIANT::ILU, num_interior, redblack, reorder_spheres);
return std::make_shared<ParallelOverlappingILU0<M, V, V, Comm>>(
op.getmat(), comm, w, MILU_VARIANT::ILU, num_interior, redblack, reorder_spheres);
} else {
return std::make_shared<Opm::ParallelOverlappingILU0<M, V, V, Comm>>(
op.getmat(), comm, ilulevel, w, Opm::MILU_VARIANT::ILU, redblack, reorder_spheres);
return std::make_shared<ParallelOverlappingILU0<M, V, V, Comm>>(
op.getmat(), comm, ilulevel, w, MILU_VARIANT::ILU, redblack, reorder_spheres);
}
}
@ -412,8 +415,8 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
using P = PropertyTree;
F::addCreator("ILU0", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
const double w = prm.get<double>("relaxation", 1.0);
return std::make_shared<Opm::ParallelOverlappingILU0<M, V, V, C>>(
op.getmat(), 0, w, Opm::MILU_VARIANT::ILU);
return std::make_shared<ParallelOverlappingILU0<M, V, V, C>>(
op.getmat(), 0, w, MILU_VARIANT::ILU);
});
F::addCreator("DuneILU", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
const double w = prm.get<double>("relaxation", 1.0);
@ -424,14 +427,14 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
F::addCreator("ParOverILU0", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
const double w = prm.get<double>("relaxation", 1.0);
const int n = prm.get<int>("ilulevel", 0);
return std::make_shared<Opm::ParallelOverlappingILU0<M, V, V, C>>(
op.getmat(), n, w, Opm::MILU_VARIANT::ILU);
return std::make_shared<ParallelOverlappingILU0<M, V, V, C>>(
op.getmat(), n, w, MILU_VARIANT::ILU);
});
F::addCreator("ILUn", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
const int n = prm.get<int>("ilulevel", 0);
const double w = prm.get<double>("relaxation", 1.0);
return std::make_shared<Opm::ParallelOverlappingILU0<M, V, V, C>>(
op.getmat(), n, w, Opm::MILU_VARIANT::ILU);
return std::make_shared<ParallelOverlappingILU0<M, V, V, C>>(
op.getmat(), n, w, MILU_VARIANT::ILU);
});
F::addCreator("DILU", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
DUNE_UNUSED_PARAMETER(prm);
@ -513,11 +516,16 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
}
});
F::addCreator("famg", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
auto crit = AMGHelper<O, C, M, V>::criterion(prm);
Dune::Amg::Parameters parms;
parms.setNoPreSmoothSteps(1);
parms.setNoPostSmoothSteps(1);
return getRebuildOnUpdateWrapper<Dune::Amg::FastAMG<O, V>>(op, crit, parms);
if constexpr (std::is_same_v<typename V::field_type, float>) {
OPM_THROW(std::logic_error, "famg requires UMFPack which is not available for floats");
return nullptr;
} else {
auto crit = AMGHelper<O, C, M, V>::criterion(prm);
Dune::Amg::Parameters parms;
parms.setNoPreSmoothSteps(1);
parms.setNoPostSmoothSteps(1);
return getRebuildOnUpdateWrapper<Dune::Amg::FastAMG<O, V>>(op, crit, parms);
}
});
}
if constexpr (std::is_same_v<O, WellModelMatrixAdapter<M, V, V, false>>) {
@ -527,8 +535,9 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
if (pressureIndex == std::numeric_limits<std::size_t>::max()) {
OPM_THROW(std::logic_error, "Pressure index out of bounds. It needs to specified for CPR");
}
using Scalar = typename V::field_type;
using LevelTransferPolicy
= Opm::PressureBhpTransferPolicy<O, Dune::Amg::SequentialInformation, false>;
= PressureBhpTransferPolicy<O, Dune::Amg::SequentialInformation, Scalar, false>;
return std::make_shared<OwningTwoLevelPreconditioner<O, V, LevelTransferPolicy>>(
op, prm, weightsCalculator, pressureIndex);
});
@ -540,7 +549,8 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
if (pressureIndex == std::numeric_limits<std::size_t>::max()) {
OPM_THROW(std::logic_error, "Pressure index out of bounds. It needs to specified for CPR");
}
using LevelTransferPolicy = Opm::PressureTransferPolicy<O, Dune::Amg::SequentialInformation, false>;
using Scalar = typename V::field_type;
using LevelTransferPolicy = PressureTransferPolicy<O, Dune::Amg::SequentialInformation, Scalar, false>;
return std::make_shared<OwningTwoLevelPreconditioner<O, V, LevelTransferPolicy>>(
op, prm, weightsCalculator, pressureIndex);
});
@ -550,7 +560,8 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
if (pressureIndex == std::numeric_limits<std::size_t>::max()) {
OPM_THROW(std::logic_error, "Pressure index out of bounds. It needs to specified for CPR");
}
using LevelTransferPolicy = Opm::PressureTransferPolicy<O, Dune::Amg::SequentialInformation, true>;
using Scalar = typename V::field_type;
using LevelTransferPolicy = PressureTransferPolicy<O, Dune::Amg::SequentialInformation, Scalar, true>;
return std::make_shared<OwningTwoLevelPreconditioner<O, V, LevelTransferPolicy>>(
op, prm, weightsCalculator, pressureIndex);
});
@ -559,9 +570,9 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
F::addCreator("CUILU0", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
const double w = prm.get<double>("relaxation", 1.0);
using field_type = typename V::field_type;
using CuILU0 = typename Opm::cuistl::
CuSeqILU0<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
return std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CuILU0>>(
using CuILU0 = typename cuistl::
CuSeqILU0<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
return std::make_shared<cuistl::PreconditionerAdapter<V, V, CuILU0>>(
std::make_shared<CuILU0>(op.getmat(), w));
});
@ -571,10 +582,10 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
using VTo = Dune::BlockVector<Dune::FieldVector<float, block_type::dimension>>;
using matrix_type_to =
typename Dune::BCRSMatrix<Dune::FieldMatrix<float, block_type::dimension, block_type::dimension>>;
using CuILU0 = typename Opm::cuistl::
CuSeqILU0<matrix_type_to, Opm::cuistl::CuVector<float>, Opm::cuistl::CuVector<float>>;
using Adapter = typename Opm::cuistl::PreconditionerAdapter<VTo, VTo, CuILU0>;
using Converter = typename Opm::cuistl::PreconditionerConvertFieldTypeAdapter<Adapter, M, V, V>;
using CuILU0 = typename cuistl::
CuSeqILU0<matrix_type_to, cuistl::CuVector<float>, cuistl::CuVector<float>>;
using Adapter = typename cuistl::PreconditionerAdapter<VTo, VTo, CuILU0>;
using Converter = typename cuistl::PreconditionerConvertFieldTypeAdapter<Adapter, M, V, V>;
auto converted = std::make_shared<Converter>(op.getmat());
auto adapted = std::make_shared<Adapter>(std::make_shared<CuILU0>(converted->getConvertedMatrix(), w));
converted->setUnderlyingPreconditioner(adapted);
@ -585,24 +596,24 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
const double w = prm.get<double>("relaxation", 1.0);
using field_type = typename V::field_type;
using CUJac =
typename Opm::cuistl::CuJac<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
return std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CUJac>>(
typename cuistl::CuJac<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
return std::make_shared<cuistl::PreconditionerAdapter<V, V, CUJac>>(
std::make_shared<CUJac>(op.getmat(), w));
});
F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
using field_type = typename V::field_type;
using CUDILU = typename Opm::cuistl::CuDILU<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
return std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CUDILU>>(std::make_shared<CUDILU>(op.getmat()));
using CUDILU = typename cuistl::CuDILU<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
return std::make_shared<cuistl::PreconditionerAdapter<V, V, CUDILU>>(std::make_shared<CUDILU>(op.getmat()));
});
F::addCreator("CUDILUFloat", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
using block_type = typename V::block_type;
using VTo = Dune::BlockVector<Dune::FieldVector<float, block_type::dimension>>;
using matrix_type_to = typename Dune::BCRSMatrix<Dune::FieldMatrix<float, block_type::dimension, block_type::dimension>>;
using CuDILU = typename Opm::cuistl::CuDILU<matrix_type_to, Opm::cuistl::CuVector<float>, Opm::cuistl::CuVector<float>>;
using Adapter = typename Opm::cuistl::PreconditionerAdapter<VTo, VTo, CuDILU>;
using Converter = typename Opm::cuistl::PreconditionerConvertFieldTypeAdapter<Adapter, M, V, V>;
using CuDILU = typename cuistl::CuDILU<matrix_type_to, cuistl::CuVector<float>, cuistl::CuVector<float>>;
using Adapter = typename cuistl::PreconditionerAdapter<VTo, VTo, CuDILU>;
using Converter = typename cuistl::PreconditionerConvertFieldTypeAdapter<Adapter, M, V, V>;
auto converted = std::make_shared<Converter>(op.getmat());
auto adapted = std::make_shared<Adapter>(std::make_shared<CuDILU>(converted->getConvertedMatrix()));
converted->setUnderlyingPreconditioner(adapted);
@ -744,7 +755,7 @@ using OpFSeq = Dune::MatrixAdapter<Dune::BCRSMatrix<Dune::FieldMatrix<double, Di
Dune::BlockVector<Dune::FieldVector<double, Dim>>,
Dune::BlockVector<Dune::FieldVector<double, Dim>>>;
template <int Dim>
using OpBSeq = Dune::MatrixAdapter<Dune::BCRSMatrix<Opm::MatrixBlock<double, Dim, Dim>>,
using OpBSeq = Dune::MatrixAdapter<Dune::BCRSMatrix<MatrixBlock<double, Dim, Dim>>,
Dune::BlockVector<Dune::FieldVector<double, Dim>>,
Dune::BlockVector<Dune::FieldVector<double, Dim>>>;
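The famg creator above guards against float builds with if constexpr, since that solver path relies on UMFPack. A minimal sketch of the compile-time guard, using stand-in types rather than the factory API:

#include <memory>
#include <stdexcept>
#include <type_traits>

struct DoubleOnlyPrec {};  // stand-in for a preconditioner that only supports double

template<class Vector>
std::shared_ptr<DoubleOnlyPrec> makeFamg()
{
    if constexpr (std::is_same_v<typename Vector::field_type, float>) {
        // The float branch never instantiates the unsupported solver.
        throw std::logic_error("famg requires UMFPack which is not available for floats");
        return nullptr;  // unreachable, keeps every branch returning a value
    } else {
        return std::make_shared<DoubleOnlyPrec>();
    }
}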

View File

@ -76,31 +76,36 @@ namespace Opm
namespace Details
{
using PressureMatrixType = Dune::BCRSMatrix<Opm::MatrixBlock<double, 1, 1>>;
using PressureVectorType = Dune::BlockVector<Dune::FieldVector<double, 1>>;
using SeqCoarseOperatorType = Dune::MatrixAdapter<PressureMatrixType, PressureVectorType, PressureVectorType>;
template <class Comm>
template<class Scalar> using PressureMatrixType = Dune::BCRSMatrix<MatrixBlock<Scalar, 1, 1>>;
template<class Scalar> using PressureVectorType = Dune::BlockVector<Dune::FieldVector<Scalar, 1>>;
template<class Scalar> using SeqCoarseOperatorType = Dune::MatrixAdapter<PressureMatrixType<Scalar>,
PressureVectorType<Scalar>,
PressureVectorType<Scalar>>;
template<class Scalar, class Comm>
using ParCoarseOperatorType
= Dune::OverlappingSchwarzOperator<PressureMatrixType, PressureVectorType, PressureVectorType, Comm>;
template <class Comm>
= Dune::OverlappingSchwarzOperator<PressureMatrixType<Scalar>,
PressureVectorType<Scalar>,
PressureVectorType<Scalar>,
Comm>;
template<class Scalar, class Comm>
using CoarseOperatorType = std::conditional_t<std::is_same<Comm, Dune::Amg::SequentialInformation>::value,
SeqCoarseOperatorType,
ParCoarseOperatorType<Comm>>;
SeqCoarseOperatorType<Scalar>,
ParCoarseOperatorType<Scalar,Comm>>;
} // namespace Details
template <class FineOperator, class Communication, bool transpose = false>
class PressureBhpTransferPolicy : public Dune::Amg::LevelTransferPolicyCpr<FineOperator, Details::CoarseOperatorType<Communication>>
template<class FineOperator, class Communication, class Scalar, bool transpose = false>
class PressureBhpTransferPolicy : public Dune::Amg::LevelTransferPolicyCpr<FineOperator, Details::CoarseOperatorType<Scalar,Communication>>
{
public:
typedef typename Details::CoarseOperatorType<Communication> CoarseOperator;
typedef Dune::Amg::LevelTransferPolicyCpr<FineOperator, CoarseOperator> ParentType;
typedef Communication ParallelInformation;
typedef typename FineOperator::domain_type FineVectorType;
using CoarseOperator = typename Details::CoarseOperatorType<Scalar,Communication>;
using ParentType = Dune::Amg::LevelTransferPolicyCpr<FineOperator, CoarseOperator>;
using ParallelInformation = Communication;
using FineVectorType = typename FineOperator::domain_type;
public:
PressureBhpTransferPolicy(const Communication& comm,
const FineVectorType& weights,
const Opm::PropertyTree& prm,
const PropertyTree& prm,
const std::size_t pressureIndex)
: communication_(&const_cast<Communication&>(comm))
, weights_(weights)
@ -109,7 +114,7 @@ namespace Opm
{
}
virtual void createCoarseLevelSystem(const FineOperator& fineOperator) override
void createCoarseLevelSystem(const FineOperator& fineOperator) override
{
OPM_TIMEBLOCK(createCoarseLevelSystem);
using CoarseMatrix = typename CoarseOperator::matrix_type;
@ -164,7 +169,7 @@ namespace Opm
this->operator_ = Dune::Amg::ConstructionTraits<CoarseOperator>::construct(oargs);
}
virtual void calculateCoarseEntries(const FineOperator& fineOperator) override
void calculateCoarseEntries(const FineOperator& fineOperator) override
{
OPM_TIMEBLOCK(calculateCoarseEntries);
const auto& fineMatrix = fineOperator.getmat();
@ -175,7 +180,7 @@ namespace Opm
auto entryCoarse = rowCoarse->begin();
for (auto entry = row->begin(), entryEnd = row->end(); entry != entryEnd; ++entry, ++entryCoarse) {
assert(entry.index() == entryCoarse.index());
double matrix_el = 0;
Scalar matrix_el = 0;
if (transpose) {
const auto& bw = weights_[entry.index()];
for (std::size_t i = 0; i < bw.size(); ++i) {
@ -203,7 +208,7 @@ namespace Opm
}
}
virtual void moveToCoarseLevel(const typename ParentType::FineRangeType& fine) override
void moveToCoarseLevel(const typename ParentType::FineRangeType& fine) override
{
OPM_TIMEBLOCK(moveToCoarseLevel);
// NB: we iterate over fine assuming the well dofs are at the end
@ -214,7 +219,7 @@ namespace Opm
for (auto block = begin; block != end; ++block) {
const auto& bw = weights_[block.index()];
double rhs_el = 0.0;
Scalar rhs_el = 0.0;
if (transpose) {
rhs_el = (*block)[pressure_var_index_];
} else {
@ -228,7 +233,7 @@ namespace Opm
this->lhs_ = 0;
}
virtual void moveToFineLevel(typename ParentType::FineDomainType& fine) override
void moveToFineLevel(typename ParentType::FineDomainType& fine) override
{
OPM_TIMEBLOCK(moveToFineLevel);
// NB: we iterate over fine assuming the well dofs are at the end
@ -246,7 +251,7 @@ namespace Opm
}
}
virtual PressureBhpTransferPolicy* clone() const override
PressureBhpTransferPolicy* clone() const override
{
return new PressureBhpTransferPolicy(*this);
}
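The Details namespace above picks the coarse operator type at compile time from the communication type. The same selection in isolation, with stand-ins for the Dune operator types:

#include <type_traits>

struct SequentialInformation {};                           // stand-in for Dune::Amg::SequentialInformation
template<class Scalar> struct SeqCoarseOp {};              // stand-in for Dune::MatrixAdapter<...>
template<class Scalar, class Comm> struct ParCoarseOp {};  // stand-in for Dune::OverlappingSchwarzOperator<...>

template<class Scalar, class Comm>
using CoarseOperatorType = std::conditional_t<std::is_same_v<Comm, SequentialInformation>,
                                              SeqCoarseOp<Scalar>,
                                              ParCoarseOp<Scalar, Comm>>;

static_assert(std::is_same_v<CoarseOperatorType<double, SequentialInformation>,
                             SeqCoarseOp<double>>, "sequential comm picks the sequential operator");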

View File

@ -28,39 +28,40 @@
#include <cstddef>
namespace Opm
{
namespace Details
{
using PressureMatrixType = Dune::BCRSMatrix<Opm::MatrixBlock<double, 1, 1>>;
using PressureVectorType = Dune::BlockVector<Dune::FieldVector<double, 1>>;
using SeqCoarseOperatorType = Dune::MatrixAdapter<PressureMatrixType, PressureVectorType, PressureVectorType>;
template <class Comm>
namespace Opm { namespace Details {
template<class Scalar> using PressureMatrixType = Dune::BCRSMatrix<MatrixBlock<Scalar, 1, 1>>;
template<class Scalar> using PressureVectorType = Dune::BlockVector<Dune::FieldVector<Scalar, 1>>;
template<class Scalar> using SeqCoarseOperatorType = Dune::MatrixAdapter<PressureMatrixType<Scalar>,
PressureVectorType<Scalar>,
PressureVectorType<Scalar>>;
template<class Scalar, class Comm>
using ParCoarseOperatorType
= Dune::OverlappingSchwarzOperator<PressureMatrixType, PressureVectorType, PressureVectorType, Comm>;
template <class Comm>
= Dune::OverlappingSchwarzOperator<PressureMatrixType<Scalar>,
PressureVectorType<Scalar>,
PressureVectorType<Scalar>,
Comm>;
template<class Scalar, class Comm>
using CoarseOperatorType = std::conditional_t<std::is_same<Comm, Dune::Amg::SequentialInformation>::value,
SeqCoarseOperatorType,
ParCoarseOperatorType<Comm>>;
SeqCoarseOperatorType<Scalar>,
ParCoarseOperatorType<Scalar,Comm>>;
} // namespace Details
template <class FineOperator, class Communication, bool transpose = false>
template <class FineOperator, class Communication, class Scalar, bool transpose = false>
class PressureTransferPolicy
: public Dune::Amg::LevelTransferPolicyCpr<FineOperator, Details::CoarseOperatorType<Communication>>
: public Dune::Amg::LevelTransferPolicyCpr<FineOperator, Details::CoarseOperatorType<Scalar,Communication>>
{
public:
typedef typename Details::CoarseOperatorType<Communication> CoarseOperator;
typedef Dune::Amg::LevelTransferPolicyCpr<FineOperator, CoarseOperator> ParentType;
typedef Communication ParallelInformation;
typedef typename FineOperator::domain_type FineVectorType;
using CoarseOperator = typename Details::CoarseOperatorType<Scalar,Communication>;
using ParentType = Dune::Amg::LevelTransferPolicyCpr<FineOperator, CoarseOperator>;
using ParallelInformation = Communication;
using FineVectorType = typename FineOperator::domain_type;
public:
PressureTransferPolicy(const Communication& comm,
const FineVectorType& weights,
const Opm::PropertyTree& /*prm*/,
const PropertyTree& /*prm*/,
int pressure_var_index)
: communication_(&const_cast<Communication&>(comm))
, weights_(weights)
@ -68,7 +69,7 @@ public:
{
}
virtual void createCoarseLevelSystem(const FineOperator& fineOperator) override
void createCoarseLevelSystem(const FineOperator& fineOperator) override
{
using CoarseMatrix = typename CoarseOperator::matrix_type;
const auto& fineLevelMatrix = fineOperator.getmat();
@ -92,7 +93,7 @@ public:
this->operator_ = Dune::Amg::ConstructionTraits<CoarseOperator>::construct(oargs);
}
virtual void calculateCoarseEntries(const FineOperator& fineOperator) override
void calculateCoarseEntries(const FineOperator& fineOperator) override
{
const auto& fineMatrix = fineOperator.getmat();
*coarseLevelMatrix_ = 0;
@ -102,7 +103,7 @@ public:
auto entryCoarse = rowCoarse->begin();
for (auto entry = row->begin(), entryEnd = row->end(); entry != entryEnd; ++entry, ++entryCoarse) {
assert(entry.index() == entryCoarse.index());
double matrix_el = 0;
Scalar matrix_el = 0;
if (transpose) {
const auto& bw = weights_[entry.index()];
for (std::size_t i = 0; i < bw.size(); ++i) {
@ -120,7 +121,7 @@ public:
assert(rowCoarse == coarseLevelMatrix_->end());
}
virtual void moveToCoarseLevel(const typename ParentType::FineRangeType& fine) override
void moveToCoarseLevel(const typename ParentType::FineRangeType& fine) override
{
// Set coarse vector to zero
this->rhs_ = 0;
@ -129,7 +130,7 @@ public:
for (auto block = begin; block != end; ++block) {
const auto& bw = weights_[block.index()];
double rhs_el = 0.0;
Scalar rhs_el = 0.0;
if (transpose) {
rhs_el = (*block)[pressure_var_index_];
} else {
@ -143,7 +144,7 @@ public:
this->lhs_ = 0;
}
virtual void moveToFineLevel(typename ParentType::FineDomainType& fine) override
void moveToFineLevel(typename ParentType::FineDomainType& fine) override
{
auto end = fine.end(), begin = fine.begin();
@ -159,7 +160,7 @@ public:
}
}
virtual PressureTransferPolicy* clone() const override
PressureTransferPolicy* clone() const override
{
return new PressureTransferPolicy(*this);
}
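moveToCoarseLevel() above collapses every fine block to a single coarse (pressure) value, either by picking the pressure component directly (transpose) or by a weighted sum over the block. A free-standing sketch of that reduction on flat arrays of blocks, with made-up function and parameter names:

#include <array>
#include <cstddef>
#include <vector>

template<class Scalar, int BlockSize>
std::vector<Scalar> restrictToPressure(const std::vector<std::array<Scalar, BlockSize>>& fine,
                                       const std::vector<std::array<Scalar, BlockSize>>& weights,
                                       int pressureIndex,
                                       bool transpose)
{
    std::vector<Scalar> coarse(fine.size(), Scalar{0});
    for (std::size_t row = 0; row < fine.size(); ++row) {
        if (transpose) {
            coarse[row] = fine[row][pressureIndex];             // restriction by P^T picks the pressure entry
        } else {
            for (int i = 0; i < BlockSize; ++i) {
                coarse[row] += fine[row][i] * weights[row][i];  // weighted sum over the block
            }
        }
    }
    return coarse;
}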

View File

@ -52,56 +52,70 @@
typedef Dune::InverseOperatorResult InverseOperatorResult;
namespace Opm
{
namespace Opm {
using Opm::Accelerator::BdaResult;
using Opm::Accelerator::BdaSolver;
using Opm::Accelerator::SolverStatus;
using Accelerator::BdaResult;
using Accelerator::BdaSolver;
using Accelerator::SolverStatus;
template <class BridgeMatrix, class BridgeVector, int block_size>
BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string accelerator_mode_,
int linear_solver_verbosity,
[[maybe_unused]] int maxit,
[[maybe_unused]] double tolerance,
[[maybe_unused]] unsigned int platformID,
[[maybe_unused]] unsigned int deviceID,
[[maybe_unused]] bool opencl_ilu_parallel,
[[maybe_unused]] std::string linsolver)
: verbosity(linear_solver_verbosity), accelerator_mode(accelerator_mode_)
template<class BridgeMatrix, class BridgeVector, int block_size>
BdaBridge<BridgeMatrix, BridgeVector, block_size>::
BdaBridge(std::string accelerator_mode_,
int linear_solver_verbosity,
[[maybe_unused]] int maxit,
[[maybe_unused]] Scalar tolerance,
[[maybe_unused]] unsigned int platformID,
[[maybe_unused]] unsigned int deviceID,
[[maybe_unused]] bool opencl_ilu_parallel,
[[maybe_unused]] std::string linsolver)
: verbosity(linear_solver_verbosity)
, accelerator_mode(accelerator_mode_)
{
if (accelerator_mode.compare("cusparse") == 0) {
#if HAVE_CUDA
use_gpu = true;
backend.reset(new Opm::Accelerator::cusparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, deviceID));
using CU = Accelerator::cusparseSolverBackend<Scalar,block_size>;
backend = std::make_unique<CU>(linear_solver_verbosity, maxit, tolerance, deviceID);
#else
OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake");
#endif
} else if (accelerator_mode.compare("opencl") == 0) {
#if HAVE_OPENCL
use_gpu = true;
backend.reset(new Opm::Accelerator::openclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_parallel, linsolver));
using OCL = Accelerator::openclSolverBackend<Scalar,block_size>;
backend = std::make_unique<OCL>(linear_solver_verbosity,
maxit,
tolerance,
platformID,
deviceID,
opencl_ilu_parallel,
linsolver);
#else
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
#endif
} else if (accelerator_mode.compare("amgcl") == 0) {
#if HAVE_AMGCL
use_gpu = true; // should be replaced by a 'use_bridge' boolean
backend.reset(new Opm::Accelerator::amgclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID));
using AMGCL = Accelerator::amgclSolverBackend<Scalar,block_size>;
backend = std::make_unique<AMGCL>(linear_solver_verbosity, maxit,
tolerance, platformID, deviceID);
#else
OPM_THROW(std::logic_error, "Error amgclSolver was chosen, but amgcl was not found by CMake");
#endif
} else if (accelerator_mode.compare("rocalution") == 0) {
#if HAVE_ROCALUTION
use_gpu = true; // should be replaced by a 'use_bridge' boolean
backend.reset(new Opm::Accelerator::rocalutionSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance));
using ROCA = Accelerator::rocalutionSolverBackend<Scalar,block_size>;
backend = std::make_unique<ROCA>(linear_solver_verbosity, maxit, tolerance);
#else
OPM_THROW(std::logic_error, "Error rocalutionSolver was chosen, but rocalution was not found by CMake");
#endif
} else if (accelerator_mode.compare("rocsparse") == 0) {
#if HAVE_ROCSPARSE
use_gpu = true; // should be replaced by a 'use_bridge' boolean
backend.reset(new Opm::Accelerator::rocsparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID));
using ROCS = Accelerator::rocsparseSolverBackend<Scalar,block_size>;
backend = std::make_unique<ROCS>(linear_solver_verbosity, maxit,
tolerance, platformID, deviceID);
#else
OPM_THROW(std::logic_error, "Error rocsparseSolver was chosen, but rocsparse/rocblas was not found by CMake");
#endif
@ -112,13 +126,14 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
}
}
template <class BridgeMatrix>
int replaceZeroDiagonal(BridgeMatrix& mat, std::vector<typename BridgeMatrix::size_type>& diag_indices) {
int replaceZeroDiagonal(BridgeMatrix& mat,
std::vector<typename BridgeMatrix::size_type>& diag_indices)
{
using Scalar = typename BridgeMatrix::field_type;
int numZeros = 0;
const int dim = mat[0][0].N(); // might be replaced with BridgeMatrix::block_type::size()
const double zero_replace = 1e-15;
const Scalar zero_replace = 1e-15;
if (diag_indices.empty()) {
int Nb = mat.N();
diag_indices.reserve(Nb);
@ -134,7 +149,7 @@ int replaceZeroDiagonal(BridgeMatrix& mat, std::vector<typename BridgeMatrix::si
}
diag_indices.emplace_back(diag.offset());
}
}else{
} else {
for (typename BridgeMatrix::iterator r = mat.begin(); r != mat.end(); ++r) {
typename BridgeMatrix::size_type offset = diag_indices[r.index()];
auto& diag_block = r->getptr()[offset]; // diag_block is a reference to MatrixBlock, located on column r of row r
@ -151,13 +166,15 @@ int replaceZeroDiagonal(BridgeMatrix& mat, std::vector<typename BridgeMatrix::si
return numZeros;
}
// iterate sparsity pattern from Matrix and put colIndices and rowPointers in arrays
// sparsity pattern should stay the same
// this could be removed if Dune::BCRSMatrix features an API call that returns colIndices and rowPointers
template <class BridgeMatrix, class BridgeVector, int block_size>
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::copySparsityPatternFromISTL(const BridgeMatrix& mat, std::vector<int> &h_rows, std::vector<int> &h_cols) {
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::
copySparsityPatternFromISTL(const BridgeMatrix& mat,
std::vector<int>& h_rows,
std::vector<int>& h_cols)
{
h_rows.clear();
h_cols.clear();
@ -172,17 +189,19 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::copySparsityPatternFromI
// h_rows and h_cols could be changed to 'unsigned int', but cusparse expects 'int'
if (static_cast<unsigned int>(h_rows[mat.N()]) != mat.nonzeroes()) {
OPM_THROW(std::logic_error, "Error size of rows do not sum to number of nonzeroes in BdaBridge::copySparsityPatternFromISTL()");
OPM_THROW(std::logic_error,
"Error size of rows do not sum to number of nonzeroes "
"in BdaBridge::copySparsityPatternFromISTL()");
}
}
// check if the nnz values of the matrix are in contiguous memory
// this is done by checking if the distance between the last value of the last block of row 0 and
// the first value of the first block of row 1 is equal to 1
// if the matrix only has 1 row, it is always contiguous
template <class BridgeMatrix>
void checkMemoryContiguous(const BridgeMatrix& mat) {
void checkMemoryContiguous(const BridgeMatrix& mat)
{
auto block_size = mat[0][0].N();
auto row = mat.begin();
auto last_of_row0 = row->begin();
@ -199,14 +218,14 @@ void checkMemoryContiguous(const BridgeMatrix& mat) {
}
}
template <class BridgeMatrix, class BridgeVector, int block_size>
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatrix* bridgeMat,
BridgeMatrix* jacMat,
int numJacobiBlocks,
BridgeVector& b,
WellContributions& wellContribs,
InverseOperatorResult& res)
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::
solve_system(BridgeMatrix* bridgeMat,
BridgeMatrix* jacMat,
int numJacobiBlocks,
BridgeVector& b,
WellContributions<Scalar>& wellContribs,
InverseOperatorResult& res)
{
if (use_gpu) {
BdaResult result;
@ -221,38 +240,48 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
return;
}
using Mat = Accelerator::BlockedMatrix<Scalar>;
if (!matrix) {
h_rows.reserve(Nb+1);
h_cols.reserve(nnzb);
copySparsityPatternFromISTL(*bridgeMat, h_rows, h_cols);
checkMemoryContiguous(*bridgeMat);
matrix = std::make_unique<Opm::Accelerator::BlockedMatrix>(Nb, nnzb, block_size, static_cast<double*>(&(((*bridgeMat)[0][0][0][0]))), h_cols.data(), h_rows.data());
matrix = std::make_unique<Mat>(Nb, nnzb, block_size,
static_cast<Scalar*>(&(((*bridgeMat)[0][0][0][0]))),
h_cols.data(),
h_rows.data());
}
Dune::Timer t_zeros;
int numZeros = replaceZeroDiagonal(*bridgeMat, diagIndices);
if (verbosity >= 2) {
std::ostringstream out;
out << "Checking zeros took: " << t_zeros.stop() << " s, found " << numZeros << " zeros";
out << "Checking zeros took: " << t_zeros.stop() << " s, found "
<< numZeros << " zeros";
OpmLog::info(out.str());
}
if (numJacobiBlocks >= 2) {
const int jacNnzb = (h_jacRows.empty()) ? jacMat->nonzeroes() : h_jacRows.back();
const int jacNnzb = (h_jacRows.empty()) ? jacMat->nonzeroes()
: h_jacRows.back();
if (!jacMatrix) {
h_jacRows.reserve(Nb+1);
h_jacCols.reserve(jacNnzb);
copySparsityPatternFromISTL(*jacMat, h_jacRows, h_jacCols);
checkMemoryContiguous(*jacMat);
jacMatrix = std::make_unique<Opm::Accelerator::BlockedMatrix>(Nb, jacNnzb, block_size, static_cast<double*>(&(((*jacMat)[0][0][0][0]))), h_jacCols.data(), h_jacRows.data());
jacMatrix = std::make_unique<Mat>(Nb, jacNnzb, block_size,
static_cast<Scalar*>(&(((*jacMat)[0][0][0][0]))),
h_jacCols.data(),
h_jacRows.data());
}
Dune::Timer t_zeros2;
int jacNumZeros = replaceZeroDiagonal(*jacMat, jacDiagIndices);
if (verbosity >= 2) {
std::ostringstream out;
out << "Checking zeros for jacMat took: " << t_zeros2.stop() << " s, found " << jacNumZeros << " zeros";
out << "Checking zeros for jacMat took: " << t_zeros2.stop()
<< " s, found " << jacNumZeros << " zeros";
OpmLog::info(out.str());
}
}
@ -260,17 +289,23 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
/////////////////////////
// actually solve
// assume that the underlying data (nonzeroes) of b (Dune::BlockVector) are contiguous; if not, the chosen BdaSolver will exhibit undefined behaviour
SolverStatus status = backend->solve_system(matrix, static_cast<double*>(&(b[0][0])), jacMatrix, wellContribs, result);
SolverStatus status = backend->solve_system(matrix,
static_cast<Scalar*>(&(b[0][0])),
jacMatrix, wellContribs, result);
switch(status) {
switch (status) {
case SolverStatus::BDA_SOLVER_SUCCESS:
//OpmLog::info("BdaSolver converged");
break;
case SolverStatus::BDA_SOLVER_ANALYSIS_FAILED:
OpmLog::warning("BdaSolver could not analyse level information of matrix, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
OpmLog::warning("BdaSolver could not analyse level information of matrix, "
"perhaps there is still a 0.0 on the diagonal of a "
"block on the diagonal");
break;
case SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED:
OpmLog::warning("BdaSolver could not create preconditioner, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
OpmLog::warning("BdaSolver could not create preconditioner, "
"perhaps there is still a 0.0 on the diagonal "
"of a block on the diagonal");
break;
default:
OpmLog::warning("BdaSolver returned unknown status code");
@ -286,21 +321,27 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
}
}
template <class BridgeMatrix, class BridgeVector, int block_size>
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::get_result([[maybe_unused]] BridgeVector& x) {
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::
get_result([[maybe_unused]] BridgeVector& x)
{
if (use_gpu) {
backend->get_result(static_cast<double*>(&(x[0][0])));
backend->get_result(static_cast<Scalar*>(&(x[0][0])));
}
}
template <class BridgeMatrix, class BridgeVector, int block_size>
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions([[maybe_unused]] WellContributions& wellContribs,
[[maybe_unused]] unsigned N) {
if(accelerator_mode.compare("opencl") == 0){
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::
initWellContributions([[maybe_unused]] WellContributions<Scalar>& wellContribs,
[[maybe_unused]] unsigned N)
{
if (accelerator_mode.compare("opencl") == 0) {
#if HAVE_OPENCL
const auto openclBackend = static_cast<const Opm::Accelerator::openclSolverBackend<block_size>*>(backend.get());
static_cast<WellContributionsOCL&>(wellContribs).setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get());
using OCL = Accelerator::openclSolverBackend<Scalar,block_size>;
const auto openclBackend = static_cast<const OCL*>(backend.get());
using WCOCL = WellContributionsOCL<Scalar>;
static_cast<WCOCL&>(wellContribs).setOpenCLEnv(openclBackend->context.get(),
openclBackend->queue.get());
#else
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
#endif
@ -309,23 +350,20 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions([[
}
// the tests use Dune::FieldMatrix, Flow uses Opm::MatrixBlock
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template class BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >, \
Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >, \
n>; \
\
template class BdaBridge<Dune::BCRSMatrix<Dune::FieldMatrix<double, n, n>, std::allocator<Dune::FieldMatrix<double, n, n> > >, \
Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >, \
n>;
#define INSTANTIATE_BDA_FUNCTIONS(T,n) \
template class BdaBridge<Dune::BCRSMatrix<MatrixBlock<T,n,n>>, \
Dune::BlockVector<Dune::FieldVector<T,n>>,n>; \
template class BdaBridge<Dune::BCRSMatrix<Dune::FieldMatrix<T,n,n>>, \
Dune::BlockVector<Dune::FieldVector<T,n>>,n>;
#define INSTANTIATE_TYPE(T) \
INSTANTIATE_BDA_FUNCTIONS(T,1) \
INSTANTIATE_BDA_FUNCTIONS(T,2) \
INSTANTIATE_BDA_FUNCTIONS(T,3) \
INSTANTIATE_BDA_FUNCTIONS(T,4) \
INSTANTIATE_BDA_FUNCTIONS(T,5) \
INSTANTIATE_BDA_FUNCTIONS(T,6)
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
INSTANTIATE_TYPE(double)
} // namespace Opm
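replaceZeroDiagonal() above swaps exact zeros on each diagonal block's own diagonal for a tiny value (1e-15) so the GPU ILU setup does not divide by zero. A simplified sketch of the same idea on a flat array of diagonal blocks, not the OPM function itself:

#include <vector>

template<class Scalar>
int replaceZeroDiagonal(std::vector<Scalar>& diagBlocks, int blockSize)
{
    const Scalar zeroReplace = 1e-15;
    int numZeros = 0;
    const int nBlocks = static_cast<int>(diagBlocks.size()) / (blockSize * blockSize);
    for (int b = 0; b < nBlocks; ++b) {
        for (int i = 0; i < blockSize; ++i) {
            // element (i,i) of block b, stored row-major
            Scalar& d = diagBlocks[b * blockSize * blockSize + i * blockSize + i];
            if (d == 0.0) {
                d = zeroReplace;
                ++numZeros;
            }
        }
    }
    return numZeros;
}

template int replaceZeroDiagonal(std::vector<double>&, int);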

View File

@ -27,7 +27,7 @@
namespace Opm
{
class WellContributions;
template<class Scalar> class WellContributions;
typedef Dune::InverseOperatorResult InverseOperatorResult;
@ -36,12 +36,13 @@ template <class BridgeMatrix, class BridgeVector, int block_size>
class BdaBridge
{
private:
using Scalar = typename BridgeVector::field_type;
int verbosity = 0;
bool use_gpu = false;
std::string accelerator_mode;
std::unique_ptr<Opm::Accelerator::BdaSolver<block_size> > backend;
std::shared_ptr<Opm::Accelerator::BlockedMatrix> matrix; // 'stores' matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
std::shared_ptr<Opm::Accelerator::BlockedMatrix> jacMatrix; // 'stores' preconditioner matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
std::unique_ptr<Accelerator::BdaSolver<Scalar,block_size>> backend;
std::shared_ptr<Accelerator::BlockedMatrix<Scalar>> matrix; // 'stores' matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
std::shared_ptr<Accelerator::BlockedMatrix<Scalar>> jacMatrix; // 'stores' preconditioner matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
std::vector<int> h_rows, h_cols; // store the sparsity pattern of the matrix
std::vector<int> h_jacRows, h_jacCols; // store the sparsity pattern of the jacMatrix
std::vector<typename BridgeMatrix::size_type> diagIndices; // contains offsets of the diagonal blocks wrt start of the row, used for replaceZeroDiagonal()
@ -57,8 +58,14 @@ public:
/// \param[in] deviceID the device ID to be used by the cusparse- and openclSolvers, too high values could cause runtime errors
/// \param[in] opencl_ilu_parallel whether to parallelize the ILU decomposition and application in OpenCL with level_scheduling
/// \param[in] linsolver indicating the preconditioner, equal to the --linear-solver cmdline argument
BdaBridge(std::string accelerator_mode, int linear_solver_verbosity, int maxit, double tolerance,
unsigned int platformID, unsigned int deviceID, bool opencl_ilu_parallel, std::string linsolver);
BdaBridge(std::string accelerator_mode,
int linear_solver_verbosity,
int maxit,
Scalar tolerance,
unsigned int platformID,
unsigned int deviceID,
bool opencl_ilu_parallel,
std::string linsolver);
/// Solve linear system, A*x = b
@ -69,7 +76,12 @@ public:
/// \param[in] b vector b, should be of type Dune::BlockVector
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] result summary of solver result
void solve_system(BridgeMatrix *bridgeMat, BridgeMatrix *jacMat, int numJacobiBlocks, BridgeVector &b, WellContributions& wellContribs, InverseOperatorResult &result);
void solve_system(BridgeMatrix* bridgeMat,
BridgeMatrix* jacMat,
int numJacobiBlocks,
BridgeVector& b,
WellContributions<Scalar>& wellContribs,
InverseOperatorResult &result);
/// Get the resulting x vector
/// \param[inout] x vector x, should be of type Dune::BlockVector
@ -77,7 +89,8 @@ public:
/// Return whether the BdaBridge will use the GPU or not
bool getUseGpu(){
bool getUseGpu()
{
return use_gpu;
}
@ -85,19 +98,21 @@ public:
/// \param[in] mat input matrix, probably BCRSMatrix
/// \param[out] h_rows rowpointers
/// \param[out] h_cols columnindices
static void copySparsityPatternFromISTL(const BridgeMatrix& mat, std::vector<int>& h_rows, std::vector<int>& h_cols);
static void copySparsityPatternFromISTL(const BridgeMatrix& mat,
std::vector<int>& h_rows,
std::vector<int>& h_cols);
/// Initialize the WellContributions object with opencl context and queue
/// those must be set before calling BlackOilWellModel::getWellContributions() in ISTL
/// \param[in] wellContribs container to hold all WellContributions
/// \param[in] N number of rows in scalar vector that wellContribs will be applied on
void initWellContributions(WellContributions& wellContribs, unsigned N);
void initWellContributions(WellContributions<Scalar>& wellContribs, unsigned N);
/// Return the selected accelerator mode, this is input via the command-line
std::string getAccleratorName(){
std::string getAccleratorName()
{
return accelerator_mode;
}
}; // end class BdaBridge
}
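copySparsityPatternFromISTL() fills h_rows and h_cols with a block-CSR pattern: Nb+1 row pointers and one column index per nonzero block. A toy example of what those arrays look like for a 2x2 block pattern:

#include <cassert>
#include <vector>

int main()
{
    // Block pattern: row 0 has blocks in columns {0, 1}, row 1 only in column {1}.
    std::vector<int> h_rows = {0, 2, 3};  // Nb + 1 entries; h_rows[i+1] - h_rows[i] = blocks in row i
    std::vector<int> h_cols = {0, 1, 1};  // one entry per nonzero block

    assert(static_cast<int>(h_cols.size()) == h_rows.back());  // nnzb equals the last row pointer
}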

View File

@ -25,70 +25,86 @@
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
#include <memory>
#include <string>
namespace Opm {
class WellContributions;
template<class Scalar> class WellContributions;
namespace Accelerator {
enum class SolverStatus {
BDA_SOLVER_SUCCESS,
BDA_SOLVER_ANALYSIS_FAILED,
BDA_SOLVER_CREATE_PRECONDITIONER_FAILED,
BDA_SOLVER_UNKNOWN_ERROR
};
/// This class serves to simplify choosing between different backend solvers, such as cusparseSolver and openclSolver
/// This class is abstract, no instantiations of it can be made, only of its children
template <unsigned int block_size>
class BdaSolver
{
enum class SolverStatus {
BDA_SOLVER_SUCCESS,
BDA_SOLVER_ANALYSIS_FAILED,
BDA_SOLVER_CREATE_PRECONDITIONER_FAILED,
BDA_SOLVER_UNKNOWN_ERROR
};
protected:
/// This class serves to simplify choosing between different backend solvers, such as cusparseSolver and openclSolver
/// This class is abstract, no instantiations of it can be made, only of its children
template<class Scalar, unsigned int block_size>
class BdaSolver
{
protected:
// verbosity
// 0: print nothing during solves, only when initializing
// 1: print number of iterations and final norm
// 2: also print norm each iteration
// 3: also print timings of different backend functions
int verbosity = 0;
// verbosity
// 0: print nothing during solves, only when initializing
// 1: print number of iterations and final norm
// 2: also print norm each iteration
// 3: also print timings of different backend functions
int maxit = 200;
Scalar tolerance = 1e-2;
int verbosity = 0;
int N; // number of rows
int Nb; // number of blocked rows (Nb*block_size == N)
int nnz; // number of nonzeroes (scalars)
int nnzb; // number of nonzero blocks (nnzb*block_size*block_size == nnz)
int maxit = 200;
double tolerance = 1e-2;
unsigned int platformID = 0; // ID of OpenCL platform to be used, only used by openclSolver now
unsigned int deviceID = 0; // ID of the device to be used
int N; // number of rows
int Nb; // number of blocked rows (Nb*block_size == N)
int nnz; // number of nonzeroes (scalars)
int nnzb; // number of nonzero blocks (nnzb*block_size*block_size == nnz)
bool initialized = false;
unsigned int platformID = 0; // ID of OpenCL platform to be used, only used by openclSolver now
unsigned int deviceID = 0; // ID of the device to be used
public:
/// Construct a BdaSolver
/// \param[in] linear_solver_verbosity verbosity of solver
/// \param[in] maxit maximum number of iterations for solver
/// \param[in] tolerance required relative tolerance for solver
/// \param[in] platformID the OpenCL platform to be used, only used in openclSolver
/// \param[in] deviceID the device to be used
BdaSolver(int linear_solver_verbosity, int max_it, Scalar tolerance_)
: verbosity(linear_solver_verbosity)
, maxit(max_it)
, tolerance(tolerance_)
{}
BdaSolver(int linear_solver_verbosity, int max_it,
Scalar tolerance_, unsigned int deviceID_)
: verbosity(linear_solver_verbosity)
, maxit(max_it)
, tolerance(tolerance_)
, deviceID(deviceID_) {};
BdaSolver(int linear_solver_verbosity, int max_it,
double tolerance_, unsigned int platformID_,
unsigned int deviceID_)
: verbosity(linear_solver_verbosity)
, maxit(max_it)
, tolerance(tolerance_)
, platformID(platformID_)
, deviceID(deviceID_)
{}
bool initialized = false;
/// Define a virtual destructor, so that the derived-class destructor will be called
virtual ~BdaSolver() = default;
public:
/// Construct a BdaSolver
/// \param[in] linear_solver_verbosity verbosity of solver
/// \param[in] maxit maximum number of iterations for solver
/// \param[in] tolerance required relative tolerance for solver
/// \param[in] platformID the OpenCL platform to be used, only used in openclSolver
/// \param[in] deviceID the device to be used
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_) {};
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), deviceID(deviceID_) {};
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), platformID(platformID_), deviceID(deviceID_) {};
/// Defined as pure virtual functions, so derived classes must implement them
virtual SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res) = 0;
/// Define a virtual destructor, so that the derived-class destructor will be called
virtual ~BdaSolver() {};
/// Defined as pure virtual functions, so derived classes must implement them
virtual SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) = 0;
virtual void get_result(double *x) = 0;
}; // end class BdaSolver
virtual void get_result(Scalar* x) = 0;
}; // end class BdaSolver
} // namespace Accelerator
} // namespace Opm
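BdaSolver above is the templated abstract base the bridge holds; each accelerator backend overrides the pure virtuals. A reduced sketch of that pattern with stand-in names, not the OPM interfaces:

#include <memory>

template<class Scalar>
class SolverBase
{
public:
    virtual ~SolverBase() = default;
    virtual void solve(const Scalar* b) = 0;
    virtual void getResult(Scalar* x) const = 0;
};

template<class Scalar>
class CpuBackend : public SolverBase<Scalar>
{
public:
    void solve(const Scalar* b) override { last_ = b[0]; }     // placeholder "solve"
    void getResult(Scalar* x) const override { x[0] = last_; }
private:
    Scalar last_ = 0;
};

int main()
{
    std::unique_ptr<SolverBase<double>> backend = std::make_unique<CpuBackend<double>>();
    const double b[1] = {1.0};
    double x[1] = {0.0};
    backend->solve(b);
    backend->getResult(x);
}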

View File

@ -17,9 +17,6 @@
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <cstring>
#include <cmath>
#include <config.h>
#include <opm/common/OpmLog/OpmLog.hpp>
@ -29,16 +26,10 @@
#include <opm/simulators/linalg/bda/Matrix.hpp>
namespace Opm
namespace Opm::Accelerator {
void sortRow(int *colIndices, int *data, int left, int right)
{
namespace Accelerator
{
using Opm::OpmLog;
void sortRow(int *colIndices, int *data, int left, int right) {
int l = left;
int r = right;
int middle = colIndices[(l + r) >> 1];
@ -67,14 +58,14 @@ void sortRow(int *colIndices, int *data, int left, int right) {
sortRow(colIndices, data, l, right);
}
// LUMat->nnzValues[ik] = LUMat->nnzValues[ik] - (pivot * LUMat->nnzValues[jk]) in ilu decomposition
// a = a - (b * c)
void blockMultSub(double *a, double *b, double *c, unsigned int block_size)
template<class Scalar>
void blockMultSub(Scalar* a, Scalar* b, Scalar* c, unsigned int block_size)
{
for (unsigned int row = 0; row < block_size; row++) {
for (unsigned int col = 0; col < block_size; col++) {
double temp = 0.0;
Scalar temp = 0.0;
for (unsigned int k = 0; k < block_size; k++) {
temp += b[block_size * row + k] * c[block_size * k + col];
}
@ -84,11 +75,12 @@ void blockMultSub(double *a, double *b, double *c, unsigned int block_size)
}
/*Perform a 3x3 matrix-matrix multiplication on two blocks*/
void blockMult(double *mat1, double *mat2, double *resMat, unsigned int block_size) {
template<class Scalar>
void blockMult(Scalar* mat1, Scalar* mat2, Scalar* resMat, unsigned int block_size)
{
for (unsigned int row = 0; row < block_size; row++) {
for (unsigned int col = 0; col < block_size; col++) {
double temp = 0;
Scalar temp = 0;
for (unsigned int k = 0; k < block_size; k++) {
temp += mat1[block_size * row + k] * mat2[block_size * k + col];
}
@ -97,5 +89,10 @@ void blockMult(double *mat1, double *mat2, double *resMat, unsigned int block_si
}
}
} // namespace Accelerator
} // namespace Opm
#define INSTANCE_TYPE(T) \
template void blockMultSub(T*, T*, T*, unsigned int); \
template void blockMult(T*, T*, T*, unsigned int);
INSTANCE_TYPE(double)
} // namespace Opm::Accelerator
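Annotation, a small self-contained illustration that is not from the commit, of the row-major dense block layout these helpers operate on; it assumes the declarations in BlockedMatrix.hpp shown further below.
#include <array>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>   // assumed header location

void blockHelpersDemo()
{
    constexpr unsigned int bs = 3;
    // each block stores its entries row-wise: element (r,c) sits at index r*bs + c
    std::array<double, bs*bs> a{}, b{}, c{}, r{};
    b.fill(1.0);
    c.fill(2.0);

    Opm::Accelerator::blockMult(b.data(), c.data(), r.data(), bs);    // r = b * c
    Opm::Accelerator::blockMultSub(a.data(), b.data(), c.data(), bs); // a = a - b * c
}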

View File

@ -20,44 +20,40 @@
#ifndef OPM_BLOCKED_MATRIX_HPP
#define OPM_BLOCKED_MATRIX_HPP
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
/// This struct resembles a blocked csr matrix, like Dune::BCRSMatrix.
/// The data is stored in contiguous memory, such that it can be copied to a device in one transfer.
template<class Scalar>
class BlockedMatrix
{
public:
/// Allocate BlockedMatrix and data arrays with given sizes
/// \param[in] Nb number of blockrows
/// \param[in] nnzbs number of nonzero blocks
/// \param[in] block_size the number of rows and columns for each block
BlockedMatrix(int Nb_, int nnzbs_, unsigned int block_size_)
: nnzValues(new double[nnzbs_*block_size_*block_size_]),
colIndices(new int[nnzbs_*block_size_*block_size_]),
rowPointers(new int[Nb_+1]),
Nb(Nb_),
nnzbs(nnzbs_),
block_size(block_size_),
deleteNnzs(true),
deleteSparsity(true)
: nnzValues(new Scalar[nnzbs_*block_size_*block_size_])
, colIndices(new int[nnzbs_*block_size_*block_size_])
, rowPointers(new int[Nb_+1])
, Nb(Nb_)
, nnzbs(nnzbs_)
, block_size(block_size_)
, deleteNnzs(true)
, deleteSparsity(true)
{}
/// Allocate BlockedMatrix, but copy sparsity pattern instead of allocating new memory
/// \param[in] M matrix to be copied
BlockedMatrix(const BlockedMatrix& M)
: nnzValues(new double[M.nnzbs*M.block_size*M.block_size]),
colIndices(M.colIndices),
rowPointers(M.rowPointers),
Nb(M.Nb),
nnzbs(M.nnzbs),
block_size(M.block_size),
deleteNnzs(true),
deleteSparsity(false)
: nnzValues(new Scalar[M.nnzbs*M.block_size*M.block_size])
, colIndices(M.colIndices)
, rowPointers(M.rowPointers)
, Nb(M.Nb)
, nnzbs(M.nnzbs)
, block_size(M.block_size)
, deleteNnzs(true)
, deleteSparsity(false)
{}
/// Allocate BlockedMatrix, but let data arrays point to existing arrays
@ -67,18 +63,20 @@ public:
/// \param[in] nnzValues array of nonzero values, contains nnzb*block_size*block_size scalars
/// \param[in] colIndices array of column indices, contains nnzb entries
/// \param[in] rowPointers array of row pointers, contains Nb+1 entries
BlockedMatrix(int Nb_, int nnzbs_, unsigned int block_size_, double *nnzValues_, int *colIndices_, int *rowPointers_)
: nnzValues(nnzValues_),
colIndices(colIndices_),
rowPointers(rowPointers_),
Nb(Nb_),
nnzbs(nnzbs_),
block_size(block_size_),
deleteNnzs(false),
deleteSparsity(false)
BlockedMatrix(int Nb_, int nnzbs_, unsigned int block_size_,
Scalar* nnzValues_, int *colIndices_, int *rowPointers_)
: nnzValues(nnzValues_)
, colIndices(colIndices_)
, rowPointers(rowPointers_)
, Nb(Nb_)
, nnzbs(nnzbs_)
, block_size(block_size_)
, deleteNnzs(false)
, deleteSparsity(false)
{}
~BlockedMatrix(){
~BlockedMatrix()
{
if (deleteNnzs) {
delete[] nnzValues;
}
@ -88,8 +86,7 @@ public:
}
}
double *nnzValues;
Scalar* nnzValues;
int *colIndices;
int *rowPointers;
int Nb;
@ -99,14 +96,13 @@ public:
bool deleteSparsity;
};
/// Sort a row of matrix elements in CSR format, where the nonzero entries are ints
/// These ints are not actual nonzero values, but represent a mapping used later
/// \param[inout] colIndices represent keys in sorting
/// \param[inout] data sorted according to the colIndices
/// \param[in] left lower index of data of row
/// \param[in] right upper index of data of row
void sortRow(int *colIndices, int *data, int left, int right);
void sortRow(int* colIndices, int* data, int left, int right);
/// Multiply and subtract blocks
/// a = a - (b * c)
@ -114,7 +110,8 @@ void sortRow(int *colIndices, int *data, int left, int right);
/// \param[in] b input block
/// \param[in] c input block
/// \param[in] block_size size of block
void blockMultSub(double *a, double *b, double *c, unsigned int block_size);
template<class Scalar>
void blockMultSub(Scalar* a, Scalar* b, Scalar* c, unsigned int block_size);
/// Perform a matrix-matrix multiplication on two blocks
/// resMat = mat1 * mat2
@ -122,9 +119,9 @@ void blockMultSub(double *a, double *b, double *c, unsigned int block_size);
/// \param[in] mat2 input block 2
/// \param[out] resMat output block
/// \param[in] block_size size of block
void blockMult(double *mat1, double *mat2, double *resMat, unsigned int block_size);
template<class Scalar>
void blockMult(Scalar* mat1, Scalar* mat2, Scalar* resMat, unsigned int block_size);
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
#endif
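Annotation, a usage sketch that is not from the commit: wrapping existing BCSR arrays in a BlockedMatrix<double> without transferring ownership, via the third constructor above. The tiny 2x2-blocked matrix is made up.
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>   // assumed header location

void wrapExistingBcsr()
{
    constexpr int Nb = 2;            // block rows
    constexpr int nnzbs = 2;         // nonzero blocks
    constexpr unsigned int bs = 2;   // block size
    double nnzValues[nnzbs*bs*bs] = {1, 0, 0, 1,   1, 0, 0, 1};
    int colIndices[nnzbs] = {0, 1};
    int rowPointers[Nb+1] = {0, 1, 2};

    // deleteNnzs and deleteSparsity stay false, so the caller keeps ownership
    Opm::Accelerator::BlockedMatrix<double> A(Nb, nnzbs, bs,
                                              nnzValues, colIndices, rowPointers);
    (void)A;
}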

View File

@ -29,17 +29,17 @@ namespace Accelerator
/// This struct resembles a csr matrix, only doubles are supported
/// The data is stored in contiguous memory, such that it can be copied to a device in one transfer.
class Matrix {
template<class Scalar>
class Matrix
{
public:
/// Allocate square Matrix and data arrays with given sizes
/// \param[in] N number of rows
/// \param[in] nnzs number of nonzeros
Matrix(int N_, int nnzs_)
: N(N_),
M(N_),
nnzs(nnzs_)
: N(N_)
, M(N_)
, nnzs(nnzs_)
{
nnzValues.resize(nnzs);
colIndices.resize(nnzs);
@ -51,12 +51,12 @@ public:
/// \param[in] M number of columns
/// \param[in] nnzs number of nonzeros
Matrix(int N_, int M_, int nnzs_)
: Matrix(N_, nnzs_)
: Matrix(N_, nnzs_)
{
M = M_;
}
std::vector<double> nnzValues;
std::vector<Scalar> nnzValues;
std::vector<int> colIndices;
std::vector<int> rowPointers;
int N, M;
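Annotation, a usage sketch that is not from the commit (the hunk above is truncated): building a small unblocked CSR matrix with the templated Matrix class; the values are made up and the members are assumed public as shown.
#include <opm/simulators/linalg/bda/Matrix.hpp>   // assumed header location

void buildSmallCsr()
{
    const int N = 3;     // rows (square-matrix constructor)
    const int nnzs = 5;  // nonzeros
    Opm::Accelerator::Matrix<double> A(N, nnzs);

    A.rowPointers = {0, 2, 3, 5};
    A.colIndices  = {0, 2, 1, 0, 2};
    A.nnzValues   = {4.0, -1.0, 3.0, -1.0, 2.0};
}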

View File

@ -29,21 +29,27 @@
namespace Opm
{
MultisegmentWellContribution::MultisegmentWellContribution(unsigned int dim_, unsigned int dim_wells_,
unsigned int Mb_,
std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
unsigned int DnumBlocks_, double *Dvalues, UMFPackIndex *DcolPointers, UMFPackIndex *DrowIndices,
std::vector<double> &Cvalues)
:
dim(dim_), // size of blockvectors in vectors x and y, equal to MultisegmentWell::numEq
dim_wells(dim_wells_), // size of blocks in C, B and D, equal to MultisegmentWell::numWellEq
M(Mb_ * dim_wells), // number of rows, M == dim_wells*Mb
Mb(Mb_), // number of blockrows in C, D and B
DnumBlocks(DnumBlocks_), // number of blocks in D
template<class Scalar>
MultisegmentWellContribution<Scalar>::
MultisegmentWellContribution(unsigned int dim_, unsigned int dim_wells_,
unsigned int Mb_,
std::vector<Scalar>& Bvalues,
std::vector<unsigned int>& BcolIndices,
std::vector<unsigned int>& BrowPointers,
unsigned int DnumBlocks_,
Scalar* Dvalues,
UMFPackIndex* DcolPointers,
UMFPackIndex* DrowIndices,
std::vector<Scalar>& Cvalues)
: dim(dim_) // size of blockvectors in vectors x and y, equal to MultisegmentWell::numEq
, dim_wells(dim_wells_) // size of blocks in C, B and D, equal to MultisegmentWell::numWellEq
, M(Mb_ * dim_wells) // number of rows, M == dim_wells*Mb
, Mb(Mb_) // number of blockrows in C, D and B
, DnumBlocks(DnumBlocks_) // number of blocks in D
// copy data for matrix D into vectors to prevent it going out of scope
Dvals(Dvalues, Dvalues + DnumBlocks * dim_wells * dim_wells),
Dcols(DcolPointers, DcolPointers + M + 1),
Drows(DrowIndices, DrowIndices + DnumBlocks * dim_wells * dim_wells)
, Dvals(Dvalues, Dvalues + DnumBlocks * dim_wells * dim_wells)
, Dcols(DcolPointers, DcolPointers + M + 1)
, Drows(DrowIndices, DrowIndices + DnumBlocks * dim_wells * dim_wells)
{
Cvals = std::move(Cvalues);
Bvals = std::move(Bvalues);
@ -57,17 +63,18 @@ MultisegmentWellContribution::MultisegmentWellContribution(unsigned int dim_, un
umfpack_di_numeric(Dcols.data(), Drows.data(), Dvals.data(), UMFPACK_Symbolic, &UMFPACK_Numeric, nullptr, nullptr);
}
MultisegmentWellContribution::~MultisegmentWellContribution()
template<class Scalar>
MultisegmentWellContribution<Scalar>::~MultisegmentWellContribution()
{
umfpack_di_free_symbolic(&UMFPACK_Symbolic);
umfpack_di_free_numeric(&UMFPACK_Numeric);
}
// Apply the MultisegmentWellContribution, similar to MultisegmentWell::apply()
// h_x and h_y reside on host
// y -= (C^T * (D^-1 * (B * x)))
void MultisegmentWellContribution::apply(double *h_x, double *h_y)
template<class Scalar>
void MultisegmentWellContribution<Scalar>::apply(Scalar* h_x, Scalar* h_y)
{
OPM_TIMEBLOCK(apply);
// reset z1 and z2
@ -80,7 +87,7 @@ void MultisegmentWellContribution::apply(double *h_x, double *h_y)
for (unsigned int blockID = Brows[row]; blockID < Brows[row + 1]; ++blockID) {
unsigned int colIdx = Bcols[blockID];
for (unsigned int j = 0; j < dim_wells; ++j) {
double temp = 0.0;
Scalar temp = 0.0;
for (unsigned int k = 0; k < dim; ++k) {
temp += Bvals[blockID * dim * dim_wells + j * dim + k] * h_x[colIdx * dim + k];
}
@ -100,7 +107,7 @@ void MultisegmentWellContribution::apply(double *h_x, double *h_y)
for (unsigned int blockID = Brows[row]; blockID < Brows[row + 1]; ++blockID) {
unsigned int colIdx = Bcols[blockID];
for (unsigned int j = 0; j < dim; ++j) {
double temp = 0.0;
Scalar temp = 0.0;
for (unsigned int k = 0; k < dim_wells; ++k) {
temp += Cvals[blockID * dim * dim_wells + j + k * dim] * z2[row * dim_wells + k];
}
@ -111,11 +118,14 @@ void MultisegmentWellContribution::apply(double *h_x, double *h_y)
}
#if HAVE_CUDA
void MultisegmentWellContribution::setCudaStream(cudaStream_t stream_)
template<class Scalar>
void MultisegmentWellContribution<Scalar>::setCudaStream(cudaStream_t stream_)
{
stream = stream_;
}
#endif
template class MultisegmentWellContribution<double>;
} //namespace Opm
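Annotation, a size-bookkeeping sketch that is not from the commit, using the meanings documented in the constructor above (dim = numEq per cell, dim_wells = numWellEq, Mb = block rows in B, C and D); the numbers are made up.
#include <cstddef>

void msWellSizes()
{
    const unsigned int dim = 3;        // block-vector size in x and y
    const unsigned int dim_wells = 4;  // block size in B, C and D
    const unsigned int Mb = 2;         // block rows in B, C and D
    const unsigned int M = Mb * dim_wells;        // rows of D, here 8

    // apply() computes y -= C^T * (D^-1 * (B * x)):
    // z1 = B*x and z2 = D^-1*z1 each hold M scalars,
    // every block of B and C stores dim_wells*dim scalars,
    // D stores DnumBlocks*dim_wells*dim_wells scalars.
    const std::size_t z_len   = M;                // 8 scalars
    const std::size_t bcBlock = dim_wells * dim;  // 12 scalars per B/C block
    (void)z_len; (void)bcBlock;
}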

View File

@ -41,6 +41,7 @@ namespace Opm
/// B*x and D*B*x are vectors with M*numWellEq doubles
/// C*D*B*x is a vector with N*numEq doubles.
template<class Scalar>
class MultisegmentWellContribution
{
@ -57,15 +58,15 @@ private:
// C and B are stored in BCRS format, D is stored in CSC format (Dune::UMFPack)
// Sparsity pattern for C is not stored, since it is the same as B
unsigned int DnumBlocks; // number of blocks in D
std::vector<double> Cvals;
std::vector<double> Dvals;
std::vector<double> Bvals;
std::vector<Scalar> Cvals;
std::vector<Scalar> Dvals;
std::vector<Scalar> Bvals;
std::vector<int> Dcols; // Column pointers, contains M+1 entries
std::vector<unsigned int> Bcols;
std::vector<int> Drows; // Row indices, contains DnumBlocks*dim*dim_wells entries
std::vector<unsigned int> Brows;
std::vector<double> z1; // z1 = B * x
std::vector<double> z2; // z2 = D^-1 * B * x
std::vector<Scalar> z1; // z1 = B * x
std::vector<Scalar> z2; // z2 = D^-1 * B * x
void *UMFPACK_Symbolic, *UMFPACK_Numeric;
/// Translate the columnIndex if needed
@ -97,9 +98,14 @@ public:
/// \param[in] Cvalues nonzero values of matrix C
MultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
unsigned int Mb,
std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
unsigned int DnumBlocks, double *Dvalues, UMFPackIndex *DcolPointers,
UMFPackIndex *DrowIndices, std::vector<double> &Cvalues);
std::vector<Scalar>& Bvalues,
std::vector<unsigned int>& BcolIndices,
std::vector<unsigned int>& BrowPointers,
unsigned int DnumBlocks,
Scalar* Dvalues,
UMFPackIndex* DcolPointers,
UMFPackIndex* DrowIndices,
std::vector<Scalar>& Cvalues);
/// Destroy a MultisegmentWellContribution, and free memory
~MultisegmentWellContribution();
@ -108,7 +114,7 @@ public:
/// performs y -= (C^T * (D^-1 * (B*x))) for MultisegmentWell
/// \param[in] h_x vector x, must be on CPU
/// \param[inout] h_y vector y, must be on CPU
void apply(double *h_x, double *h_y);
void apply(Scalar* h_x, Scalar* h_y);
};
} //namespace Opm

View File

@ -39,35 +39,36 @@
namespace Opm {
WellContributions::~WellContributions() = default;
template<class Scalar>
WellContributions<Scalar>::~WellContributions() = default;
std::unique_ptr<WellContributions>
WellContributions::create(const std::string& accelerator_mode, bool useWellConn)
template<class Scalar>
std::unique_ptr<WellContributions<Scalar>>
WellContributions<Scalar>::create(const std::string& accelerator_mode, bool useWellConn)
{
if(accelerator_mode.compare("cusparse") == 0){
if (accelerator_mode.compare("cusparse") == 0) {
#if HAVE_CUDA
return std::make_unique<WellContributionsCuda>();
return std::make_unique<WellContributionsCuda<Scalar>>();
#else
OPM_THROW(std::runtime_error, "Cannot initialize well contributions: CUDA is not enabled");
OPM_THROW(std::runtime_error, "Cannot initialize well contributions: CUDA is not enabled");
#endif
}
else if(accelerator_mode.compare("opencl") == 0){
else if (accelerator_mode.compare("opencl") == 0) {
#if HAVE_OPENCL
return std::make_unique<WellContributionsOCL>();
return std::make_unique<WellContributionsOCL<Scalar>>();
#else
OPM_THROW(std::runtime_error, "Cannot initialize well contributions: OpenCL is not enabled");
#endif
}
else if(accelerator_mode.compare("rocsparse") == 0){
else if (accelerator_mode.compare("rocsparse") == 0) {
if (!useWellConn) {
#if HAVE_ROCSPARSE
return std::make_unique<WellContributionsRocsparse>();
return std::make_unique<WellContributionsRocsparse<Scalar>>();
#else
OPM_THROW(std::runtime_error, "Cannot initialize well contributions: rocsparse is not enabled");
#endif
}
return std::make_unique<WellContributions>();
}
else if(accelerator_mode.compare("amgcl") == 0){
if (!useWellConn) {
@ -86,10 +87,12 @@ WellContributions::create(const std::string& accelerator_mode, bool useWellConn)
}
}
void WellContributions::addMatrix([[maybe_unused]] MatrixType type,
[[maybe_unused]] int* colIndices,
[[maybe_unused]] double* values,
[[maybe_unused]] unsigned int val_size)
template<class Scalar>
void WellContributions<Scalar>::
addMatrix([[maybe_unused]] MatrixType type,
[[maybe_unused]] int* colIndices,
[[maybe_unused]] Scalar* values,
[[maybe_unused]] unsigned int val_size)
{
#if !HAVE_CUDA && !HAVE_OPENCL
OPM_THROW(std::logic_error, "Error cannot add StandardWell matrix on GPU because neither CUDA nor OpenCL were found by cmake");
@ -107,7 +110,8 @@ void WellContributions::addMatrix([[maybe_unused]] MatrixType type,
}
}
void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
template<class Scalar>
void WellContributions<Scalar>::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
{
dim = dim_;
dim_wells = dim_wells_;
@ -121,11 +125,14 @@ void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
}
}
void WellContributions::setVectorSize(unsigned N_) {
template<class Scalar>
void WellContributions<Scalar>::setVectorSize(unsigned N_)
{
N = N_;
}
void WellContributions::addNumBlocks(unsigned int numBlocks)
template<class Scalar>
void WellContributions<Scalar>::addNumBlocks(unsigned int numBlocks)
{
if (allocated) {
OPM_THROW(std::logic_error, "Error cannot add more sizes after allocated in WellContributions");
@ -134,7 +141,8 @@ void WellContributions::addNumBlocks(unsigned int numBlocks)
num_std_wells++;
}
void WellContributions::alloc()
template<class Scalar>
void WellContributions<Scalar>::alloc()
{
if (num_std_wells > 0) {
val_pointers.resize(num_std_wells+1);
@ -144,31 +152,36 @@ void WellContributions::alloc()
}
}
void WellContributions::addMultisegmentWellContribution(unsigned int dim_,
unsigned int dim_wells_,
unsigned int Mb,
std::vector<double>& Bvalues,
std::vector<unsigned int>& BcolIndices,
std::vector<unsigned int>& BrowPointers,
unsigned int DnumBlocks,
double* Dvalues,
UMFPackIndex* DcolPointers,
UMFPackIndex* DrowIndices,
std::vector<double>& Cvalues)
template<class Scalar>
void WellContributions<Scalar>::
addMultisegmentWellContribution(unsigned int dim_,
unsigned int dim_wells_,
unsigned int Mb,
std::vector<Scalar>& Bvalues,
std::vector<unsigned int>& BcolIndices,
std::vector<unsigned int>& BrowPointers,
unsigned int DnumBlocks,
Scalar* Dvalues,
UMFPackIndex* DcolPointers,
UMFPackIndex* DrowIndices,
std::vector<Scalar>& Cvalues)
{
assert(dim==dim_);
multisegments.push_back(std::make_unique<MultisegmentWellContribution>(dim_,
dim_wells_,
Mb,
Bvalues,
BcolIndices,
BrowPointers,
DnumBlocks,
Dvalues,
DcolPointers,
DrowIndices,
Cvalues));
using MSW = MultisegmentWellContribution<Scalar>;
multisegments.push_back(std::make_unique<MSW>(dim_,
dim_wells_,
Mb,
Bvalues,
BcolIndices,
BrowPointers,
DnumBlocks,
Dvalues,
DcolPointers,
DrowIndices,
Cvalues));
++num_ms_wells;
}
template class WellContributions<double>;
} //namespace Opm
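Annotation, a factory-usage sketch that is not from the commit: selecting a GPU well-contributions implementation at runtime. The accelerator_mode strings are the ones handled by create() above; error handling is omitted.
#include <memory>
#include <string>
#include <opm/simulators/linalg/bda/WellContributions.hpp>   // assumed header location

std::unique_ptr<Opm::WellContributions<double>>
makeWellContribs(const std::string& accelerator_mode, bool useWellConn)
{
    // e.g. WellContributionsCuda<double> for "cusparse", WellContributionsOCL<double>
    // for "opencl", or a plain (empty) WellContributions<double> when the well
    // contributions are already added to the matrix
    return Opm::WellContributions<double>::create(accelerator_mode, useWellConn);
}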

View File

@ -30,7 +30,7 @@
namespace Opm {
class MultisegmentWellContribution;
template<class Scalar> class MultisegmentWellContribution;
/// This class serves to eliminate the need to include the WellContributions into the matrix (with --matrix-add-well-contributions=true) for the cusparseSolver or openclSolver.
/// If the --matrix-add-well-contributions command-line parameter is true, this class should still be used, but it will be empty.
@ -48,6 +48,7 @@ class MultisegmentWellContribution;
/// - get total size of all wellcontributions that must be stored here
/// - allocate memory
/// - copy data of wellcontributions
template<class Scalar>
class WellContributions
{
public:
@ -74,7 +75,7 @@ protected:
unsigned int num_std_wells_so_far = 0; // keep track of where next data is written
std::vector<unsigned int> val_pointers; // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
std::vector<std::unique_ptr<MultisegmentWellContribution>> multisegments;
std::vector<std::unique_ptr<MultisegmentWellContribution<Scalar>>> multisegments;
public:
unsigned int getNumWells(){
@ -105,7 +106,7 @@ public:
/// \param[in] colIndices columnindices of blocks in C or B, ignored for D
/// \param[in] values array of nonzeroes
/// \param[in] val_size number of blocks in C or B, ignored for D
void addMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size);
void addMatrix(MatrixType type, int* colIndices, Scalar* values, unsigned int val_size);
/// Add a MultisegmentWellContribution, actually creates an object on heap that is destroyed in the destructor
/// Matrices C and B are passed in Blocked CSR, matrix D in CSC
@ -120,19 +121,25 @@ public:
/// \param[in] DcolPointers columnpointers of matrix D
/// \param[in] DrowIndices rowindices of matrix D
/// \param[in] Cvalues nonzero values of matrix C
void addMultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
void addMultisegmentWellContribution(unsigned int dim,
unsigned int dim_wells,
unsigned int Mb,
std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
unsigned int DnumBlocks, double *Dvalues,
UMFPackIndex *DcolPointers, UMFPackIndex *DrowIndices,
std::vector<double> &Cvalues);
std::vector<Scalar>& Bvalues,
std::vector<unsigned int>& BcolIndices,
std::vector<unsigned int>& BrowPointers,
unsigned int DnumBlocks,
Scalar* Dvalues,
UMFPackIndex* DcolPointers,
UMFPackIndex* DrowIndices,
std::vector<Scalar>& Cvalues);
protected:
//! \brief API specific allocation.
virtual void APIalloc() {}
/// Api specific upload of matrix.
virtual void APIaddMatrix(MatrixType, int*, double*, unsigned int) {}
virtual void APIaddMatrix(MatrixType, int*, Scalar*, unsigned int) {}
};
} //namespace Opm
#endif
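Annotation, a fill-order sketch that is not from the commit, following the sequence described in the class comment (sizes first, then allocation, then data). The well sizes are made up, one possible per-well order (C, D, B) is assumed, and error handling is omitted.
#include <opm/simulators/linalg/bda/WellContributions.hpp>   // assumed header location

void fillStandardWell(Opm::WellContributions<double>& wc)
{
    using MT = Opm::WellContributions<double>::MatrixType;

    wc.setBlockSize(/*dim=*/3, /*dim_wells=*/4);
    wc.setVectorSize(/*N=*/3 * 100);   // unblocked length of x and y
    wc.addNumBlocks(2);                // one StandardWell with 2 blocks in B and C
    wc.alloc();

    int cols[2] = {10, 42};
    double Cvals[2*3*4] = {};          // dim*dim_wells scalars per block (values elided)
    double Dvals[4*4]   = {};          // dim_wells*dim_wells scalars per well
    double Bvals[2*3*4] = {};
    wc.addMatrix(MT::C, cols, Cvals, 2);
    wc.addMatrix(MT::D, cols, Dvals, 2);   // colIndices and val_size are ignored for D
    wc.addMatrix(MT::B, cols, Bvals, 2);   // B last: finalizes val_pointers for this well
}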

View File

@ -46,36 +46,35 @@
#include <tuple>
#include <vector>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
using Opm::OpmLog;
using Dune::Timer;
template <unsigned int block_size>
amgclSolverBackend<block_size>::amgclSolverBackend(const int verbosity_,
const int maxit_,
const double tolerance_,
const unsigned int platformID_,
const unsigned int deviceID_)
: BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_)
template<class Scalar, unsigned int block_size>
amgclSolverBackend<Scalar,block_size>::
amgclSolverBackend(const int verbosity_,
const int maxit_,
const Scalar tolerance_,
const unsigned int platformID_,
const unsigned int deviceID_)
: Base(verbosity_, maxit_, tolerance_, platformID_, deviceID_)
{}
template <unsigned int block_size>
amgclSolverBackend<block_size>::~amgclSolverBackend() {}
template<class Scalar, unsigned int block_size>
amgclSolverBackend<Scalar,block_size>::~amgclSolverBackend()
{}
template <unsigned int block_size>
void amgclSolverBackend<block_size>::initialize(int Nb_, int nnzbs) {
template<class Scalar, unsigned int block_size>
void amgclSolverBackend<Scalar,block_size>::initialize(int Nb_, int nnzbs)
{
this->Nb = Nb_;
this->N = Nb * block_size;
this->nnzb = nnzbs;
this->nnz = nnzbs * block_size * block_size;
std::ostringstream out;
out << "Initializing amgclSolverBackend, matrix size: " << Nb << " blockrows, nnzb: " << nnzb << " blocks\n";
out << "Initializing amgclSolverBackend, matrix size: " << Nb
<< " blockrows, nnzb: " << nnzb << " blocks\n";
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
out << "DeviceID: " << deviceID << "\n";
OpmLog::info(out.str());
@ -118,7 +117,8 @@ void amgclSolverBackend<block_size>::initialize(int Nb_, int nnzbs) {
prm.put("solver.maxiter", t3);
bool t4 = prm.get("solver.verbose", verbosity >= 2);
prm.put("solver.verbose", t4);
out << "Using parameters from " << filename << " (with default values for omitted parameters):\n";
out << "Using parameters from " << filename
<< " (with default values for omitted parameters):\n";
} else { // otherwise use default parameters, same as Dune
prm.put("backend_type", "cpu"); // put it in the tree so it gets printed
prm.put("precond.class", "relaxation");
@ -142,7 +142,8 @@ void amgclSolverBackend<block_size>::initialize(int Nb_, int nnzbs) {
} else if (backend_type_string == "vexcl") {
backend_type = Amgcl_backend_type::vexcl;
} else {
OPM_THROW(std::logic_error, "Error unknown value for amgcl parameter 'backend_type', use [cpu|cuda|vexcl]");
OPM_THROW(std::logic_error,
"Error unknown value for amgcl parameter 'backend_type', use [cpu|cuda|vexcl]");
}
if (backend_type == Amgcl_backend_type::cuda) {
@ -160,9 +161,10 @@ void amgclSolverBackend<block_size>::initialize(int Nb_, int nnzbs) {
initialized = true;
} // end initialize()
template <unsigned int block_size>
void amgclSolverBackend<block_size>::convert_sparsity_pattern(int *rows, int *cols) {
template<class Scalar, unsigned int block_size>
void amgclSolverBackend<Scalar,block_size>::
convert_sparsity_pattern(int* rows, int* cols)
{
Timer t;
const unsigned int bs = block_size;
int idx = 0; // indicates the unblocked write index
@ -189,9 +191,10 @@ void amgclSolverBackend<block_size>::convert_sparsity_pattern(int *rows, int *co
}
} // end convert_sparsity_pattern()
template <unsigned int block_size>
void amgclSolverBackend<block_size>::convert_data(double *vals, int *rows) {
template<class Scalar, unsigned int block_size>
void amgclSolverBackend<Scalar,block_size>::
convert_data(Scalar* vals, int* rows)
{
Timer t;
const unsigned int bs = block_size;
int idx = 0; // indicates the unblocked write index
@ -217,7 +220,9 @@ void amgclSolverBackend<block_size>::convert_data(double *vals, int *rows) {
} // end convert_data()
#if HAVE_VEXCL
void initialize_vexcl(std::vector<cl::CommandQueue>& ctx, unsigned int platformID, unsigned int deviceID) {
void initialize_vexcl(std::vector<cl::CommandQueue>& ctx,
unsigned int platformID, unsigned int deviceID)
{
std::vector<cl::Platform> platforms;
std::vector<cl::Device> devices;
cl::Platform::get(&platforms);
@ -245,19 +250,20 @@ void initialize_vexcl(std::vector<cl::CommandQueue>& ctx, unsigned int platformI
OpmLog::info(out.str());
}
template <typename vexcl_matrix_type, typename vexcl_vector_type, unsigned int block_size, typename AIJInfo>
void solve_vexcl(
const AIJInfo& A,
const boost::property_tree::ptree prm,
const std::vector<cl::CommandQueue>& ctx,
double *b,
std::vector<double>& x,
const int N,
int& iters,
double& error)
template <typename vexcl_matrix_type, typename vexcl_vector_type,
unsigned int block_size, typename Scalar, typename AIJInfo>
void solve_vexcl(const AIJInfo& A,
const boost::property_tree::ptree prm,
const std::vector<cl::CommandQueue>& ctx,
Scalar* b,
std::vector<Scalar>& x,
const int N,
int& iters,
Scalar& error)
{
typedef amgcl::backend::vexcl<vexcl_matrix_type> Backend;
typedef amgcl::make_solver<amgcl::runtime::preconditioner<Backend>, amgcl::runtime::solver::wrapper<Backend> > Solver;
using Backend = amgcl::backend::vexcl<vexcl_matrix_type>;
using Solver = amgcl::make_solver<amgcl::runtime::preconditioner<Backend>,
amgcl::runtime::solver::wrapper<Backend>>;
typename Solver::backend_params bprm;
bprm.q = ctx; // set vexcl context
@ -275,8 +281,10 @@ void solve_vexcl(
}
#endif
template <unsigned int block_size>
void amgclSolverBackend<block_size>::solve_system(double *b, BdaResult &res) {
template<class Scalar, unsigned int block_size>
void amgclSolverBackend<Scalar,block_size>::
solve_system(Scalar* b, BdaResult& res)
{
Timer t;
try {
@ -306,7 +314,7 @@ void amgclSolverBackend<block_size>::solve_system(double *b, BdaResult &res) {
// reset x vector
std::fill(x.begin(), x.end(), 0.0);
std::vector<double> b_(b, b + N);
std::vector<Scalar> b_(b, b + N);
// create numa vectors
typename CPU_Backend::params bprm;
@ -349,10 +357,11 @@ void amgclSolverBackend<block_size>::solve_system(double *b, BdaResult &res) {
if constexpr(block_size == 1){
auto A = std::tie(N, A_rows, A_cols, A_vals);
solve_vexcl<double, double, block_size>(A, prm, ctx, b, x, N, iters, error);
solve_vexcl<Scalar, Scalar, block_size>(A, prm, ctx, b, x, N, iters, error);
} else {
// allow vexcl to use blocked matrices
vex::scoped_program_header h1(ctx, amgcl::backend::vexcl_static_matrix_declaration<double, block_size>());
vex::scoped_program_header h1(ctx,
amgcl::backend::vexcl_static_matrix_declaration<Scalar, block_size>());
auto Atmp = std::tie(N, A_rows, A_cols, A_vals);
auto A = amgcl::adapter::block_matrix<dmat_type>(Atmp);
@ -375,8 +384,8 @@ void amgclSolverBackend<block_size>::solve_system(double *b, BdaResult &res) {
if (verbosity >= 1) {
std::ostringstream out;
out << "=== converged: " << res.converged << ", time: " << res.elapsed << \
", time per iteration: " << res.elapsed / iters << ", iterations: " << iters;
out << "=== converged: " << res.converged << ", time: " << res.elapsed
<< ", time per iteration: " << res.elapsed / iters << ", iterations: " << iters;
OpmLog::info(out.str());
}
if (verbosity >= 3) {
@ -384,14 +393,13 @@ void amgclSolverBackend<block_size>::solve_system(double *b, BdaResult &res) {
out << "amgclSolverBackend::solve_system(): " << time_elapsed << " s";
OpmLog::info(out.str());
}
} // end solve_system()
// copy result to host memory
// caller must be sure that x is a valid array
template <unsigned int block_size>
void amgclSolverBackend<block_size>::get_result(double *x_) {
template<class Scalar, unsigned int block_size>
void amgclSolverBackend<Scalar,block_size>::get_result(Scalar* x_)
{
Timer t;
std::copy(x.begin(), x.end(), x_);
@ -403,13 +411,13 @@ void amgclSolverBackend<block_size>::get_result(double *x_) {
}
} // end get_result()
template <unsigned int block_size>
SolverStatus amgclSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
double *b,
[[maybe_unused]] std::shared_ptr<BlockedMatrix> jacMatrix,
[[maybe_unused]] WellContributions& wellContribs,
BdaResult &res)
template<class Scalar, unsigned int block_size>
SolverStatus amgclSolverBackend<Scalar,block_size>::
solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
[[maybe_unused]] std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
[[maybe_unused]] WellContributions<Scalar>& wellContribs,
BdaResult& res)
{
if (initialized == false) {
initialize(matrix->Nb, matrix->nnzbs);
@ -420,15 +428,14 @@ SolverStatus amgclSolverBackend<block_size>::solve_system(std::shared_ptr<Blocke
return SolverStatus::BDA_SOLVER_SUCCESS;
}
#define INSTANTIATE_TYPE(T) \
template class amgclSolverBackend<1>; \
template class amgclSolverBackend<2>; \
template class amgclSolverBackend<3>; \
template class amgclSolverBackend<4>; \
template class amgclSolverBackend<5>; \
template class amgclSolverBackend<6>;
#define INSTANTIATE_TYPE(T) \
template class amgclSolverBackend<T,1>; \
template class amgclSolverBackend<T,2>; \
template class amgclSolverBackend<T,3>; \
template class amgclSolverBackend<T,4>; \
template class amgclSolverBackend<T,5>; \
template class amgclSolverBackend<T,6>;
INSTANTIATE_TYPE(double)
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
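Annotation, a self-contained index-mapping sketch that is not from the commit, showing how a blocked (BCSR) sparsity pattern expands to the unblocked CSR pattern that amgcl consumes; the real convert_sparsity_pattern() may differ in details.
#include <vector>

void bcsrToCsrPattern(const std::vector<int>& brows,  // Nb+1 block row pointers
                      const std::vector<int>& bcols,  // block column indices
                      unsigned int bs,                // block size
                      std::vector<unsigned>& rows,    // CSR row pointers (out)
                      std::vector<unsigned>& cols)    // CSR column indices (out)
{
    const int Nb = static_cast<int>(brows.size()) - 1;
    rows.assign(1, 0u);
    cols.clear();
    for (int ib = 0; ib < Nb; ++ib) {                    // block row
        // each of the bs scalar rows in this block row sees the same column set
        for (unsigned int r = 0; r < bs; ++r) {
            for (int b = brows[ib]; b < brows[ib+1]; ++b) {
                for (unsigned int c = 0; c < bs; ++c) {  // scalar column within the block
                    cols.push_back(bcols[b] * bs + c);
                }
            }
            rows.push_back(static_cast<unsigned int>(cols.size()));
        }
    }
}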

View File

@ -41,17 +41,14 @@
#include <type_traits>
#include <vector>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
/// This class does not implement a solver, but converts the BCSR format to normal CSR and uses amgcl for solving
/// Note amgcl also implements blocked solvers, but looks like it needs unblocked input data
template <unsigned int block_size>
class amgclSolverBackend : public BdaSolver<block_size>
template<class Scalar, unsigned int block_size>
class amgclSolverBackend : public BdaSolver<Scalar,block_size>
{
typedef BdaSolver<block_size> Base;
using Base = BdaSolver<Scalar,block_size>;
using Base::N;
using Base::Nb;
@ -64,17 +61,16 @@ class amgclSolverBackend : public BdaSolver<block_size>
using Base::tolerance;
using Base::initialized;
using dmat_type = amgcl::static_matrix<double, block_size, block_size>; // matrix value type in double precision
using dvec_type = amgcl::static_matrix<double, block_size, 1>; // the corresponding vector value type
using dmat_type = amgcl::static_matrix<Scalar, block_size, block_size>; // matrix value type, one block of Scalars
using dvec_type = amgcl::static_matrix<Scalar, block_size, 1>; // the corresponding vector value type
using CPU_Backend = std::conditional_t<block_size == 1,
amgcl::backend::builtin<double>,
amgcl::backend::builtin<Scalar>,
amgcl::backend::builtin<dmat_type>>;
using CPU_Solver = amgcl::make_solver<amgcl::runtime::preconditioner<CPU_Backend>,
amgcl::runtime::solver::wrapper<CPU_Backend>>;
private:
// amgcl can use different backends, this lets the user choose
enum Amgcl_backend_type {
cpu,
@ -84,18 +80,18 @@ private:
// store matrix in CSR format
std::vector<unsigned> A_rows, A_cols;
std::vector<double> A_vals, rhs;
std::vector<double> x;
std::vector<Scalar> A_vals, rhs;
std::vector<Scalar> x;
std::once_flag print_info;
Amgcl_backend_type backend_type = cpu;
boost::property_tree::ptree prm; // amgcl parameters
int iters = 0;
double error = 0.0;
Scalar error = 0.0;
#if HAVE_CUDA
std::once_flag cuda_initialize;
void solve_cuda(double *b);
void solve_cuda(Scalar* b);
#endif
#if HAVE_VEXCL
@ -114,21 +110,23 @@ private:
/// Convert the BCSR nonzero data to a CSR format
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
void convert_data(double *vals, int *rows);
void convert_data(Scalar* vals, int* rows);
/// Solve linear system
/// \param[in] b pointer to b vector
/// \param[inout] res summary of solver result
void solve_system(double *b, BdaResult &res);
void solve_system(Scalar* b, BdaResult& res);
public:
/// Construct a openclSolver
/// \param[in] linear_solver_verbosity verbosity of openclSolver
/// \param[in] maxit maximum number of iterations for openclSolver
/// \param[in] tolerance required relative tolerance for openclSolver
/// Construct an amgcl solver
/// \param[in] linear_solver_verbosity verbosity of amgclSolver
/// \param[in] maxit maximum number of iterations for amgclSolver
/// \param[in] tolerance required relative tolerance for amgclSolver
/// \param[in] platformID the OpenCL platform to be used
/// \param[in] deviceID the device to be used
amgclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID);
amgclSolverBackend(int linear_solver_verbosity, int maxit,
Scalar tolerance, unsigned int platformID,
unsigned int deviceID);
/// Destroy an amgclSolver, and free memory
~amgclSolverBackend();
@ -140,18 +138,18 @@ public:
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
/// \return status code
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res) override;
/// Get result after linear solve, and perform postprocessing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void get_result(double *x) override;
void get_result(Scalar* x) override;
}; // end class amgclSolverBackend
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
#endif
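Annotation, a construction sketch that is not from the commit: instantiating the amgcl backend for double precision with 3x3 blocks; the parameter values are made up.
#include <opm/simulators/linalg/bda/amgclSolverBackend.hpp>   // assumed header location

void makeAmgclBackend()
{
    Opm::Accelerator::amgclSolverBackend<double, 3>
        solver(/*linear_solver_verbosity=*/1, /*maxit=*/200, /*tolerance=*/1e-2,
               /*platformID=*/0, /*deviceID=*/0);
    (void)solver;
}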

View File

@ -28,18 +28,14 @@
/// This file is only compiled when both amgcl and CUDA are found by CMake
namespace Opm
namespace Opm::Accelerator {
template<class Scalar, unsigned int block_size>
void amgclSolverBackend<Scalar,block_size>::solve_cuda(Scalar* b)
{
namespace Accelerator
{
using Opm::OpmLog;
template <unsigned int block_size>
void amgclSolverBackend<block_size>::solve_cuda(double *b) {
typedef amgcl::backend::cuda<double> CUDA_Backend;
typedef amgcl::make_solver<amgcl::runtime::preconditioner<CUDA_Backend>, amgcl::runtime::solver::wrapper<CUDA_Backend> > CUDA_Solver;
using CUDA_Backend = amgcl::backend::cuda<Scalar>;
using CUDA_Solver = amgcl::make_solver<amgcl::runtime::preconditioner<CUDA_Backend>,
amgcl::runtime::solver::wrapper<CUDA_Backend>>;
static typename CUDA_Backend::params CUDA_bprm; // amgcl backend parameters, only used for cusparseHandle
@ -67,8 +63,8 @@ void amgclSolverBackend<block_size>::solve_cuda(double *b) {
OpmLog::info(out.str());
});
thrust::device_vector<double> B(b, b + N);
thrust::device_vector<double> X(N, 0.0);
thrust::device_vector<Scalar> B(b, b + N);
thrust::device_vector<Scalar> X(N, 0.0);
// actually solve
std::tie(iters, error) = solve(B, X);
@ -76,19 +72,15 @@ void amgclSolverBackend<block_size>::solve_cuda(double *b) {
thrust::copy(X.begin(), X.end(), x.begin());
}
#define INSTANTIATE_TYPE(T) \
template void amgclSolverBackend<T,1>::solve_cuda(T*); \
template void amgclSolverBackend<T,2>::solve_cuda(T*); \
template void amgclSolverBackend<T,3>::solve_cuda(T*); \
template void amgclSolverBackend<T,4>::solve_cuda(T*); \
template void amgclSolverBackend<T,5>::solve_cuda(T*); \
template void amgclSolverBackend<T,6>::solve_cuda(T*);
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template void amgclSolverBackend<n>::solve_cuda(double*); \
INSTANTIATE_TYPE(double)
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator

View File

@ -33,18 +33,17 @@ namespace Opm
{
// apply WellContributions using y -= C^T * (D^-1 * (B * x))
__global__ void apply_well_contributions(
const double * __restrict__ Cnnzs,
const double * __restrict__ Dnnzs,
const double * __restrict__ Bnnzs,
const int * __restrict__ Ccols,
const int * __restrict__ Bcols,
const double * __restrict__ x,
double * __restrict__ y,
const int dim,
const int dim_wells,
const unsigned int * __restrict__ val_pointers
)
template<class Scalar>
__global__ void apply_well_contributions(const Scalar* __restrict__ Cnnzs,
const Scalar* __restrict__ Dnnzs,
const Scalar* __restrict__ Bnnzs,
const int* __restrict__ Ccols,
const int* __restrict__ Bcols,
const Scalar* __restrict__ x,
Scalar* __restrict__ y,
const int dim,
const int dim_wells,
const unsigned int * __restrict__ val_pointers)
{
const int idx_b = blockIdx.x;
const int idx_t = threadIdx.x;
@ -57,9 +56,9 @@ __global__ void apply_well_contributions(
const int c = lane % dim; // col in block
const int r = (lane / dim) % dim_wells; // row in block
extern __shared__ double smem[];
double * __restrict__ z1 = smem;
double * __restrict__ z2 = z1 + dim_wells;
extern __shared__ unsigned char smem[];
Scalar* __restrict__ z1 = reinterpret_cast<Scalar*>(smem);
Scalar* __restrict__ z2 = z1 + dim_wells;
if (idx_t < dim_wells) {
z1[idx_t] = 0.0;
@ -70,7 +69,7 @@ __global__ void apply_well_contributions(
// z1 = B * x
if (idx_t < num_active_threads) {
// multiply all blocks with x
double temp = 0.0;
Scalar temp = 0.0;
int b = idx_t / vals_per_block + val_pointers[idx_b]; // block id, val_size indicates number of blocks
while (b < val_size + val_pointers[idx_b]) {
int colIdx = Bcols[b];
@ -106,7 +105,7 @@ __global__ void apply_well_contributions(
// z2 = D^-1 * B * x = D^-1 * z1
if (idx_t < dim_wells) {
double temp = 0.0;
Scalar temp = 0.0;
for (int c = 0; c < dim_wells; ++c) {
temp += Dnnzs[idx_b * dim_wells * dim_wells + idx_t * dim_wells + c] * z1[c];
}
@ -118,7 +117,7 @@ __global__ void apply_well_contributions(
// y -= C^T * D^-1 * B * x
// use dim * val_size threads, each block is assigned 'dim' threads
if (idx_t < dim * val_size) {
double temp = 0.0;
Scalar temp = 0.0;
int b = idx_t / dim + val_pointers[idx_b];
int cc = idx_t % dim;
int colIdx = Ccols[b];
@ -127,13 +126,13 @@ __global__ void apply_well_contributions(
}
y[colIdx * dim + cc] -= temp;
}
}
WellContributionsCuda::~WellContributionsCuda()
template<class Scalar>
WellContributionsCuda<Scalar>::~WellContributionsCuda()
{
// delete data for StandardWell
if (num_std_wells > 0) {
if (this->num_std_wells > 0) {
cudaFree(d_Cnnzs);
cudaFree(d_Dnnzs);
cudaFree(d_Bnnzs);
@ -142,80 +141,108 @@ WellContributionsCuda::~WellContributionsCuda()
cudaFree(d_val_pointers);
}
if (num_ms_wells > 0 && h_x) {
if (this->num_ms_wells > 0 && h_x) {
cudaFreeHost(h_x);
cudaFreeHost(h_y);
h_x = h_y = nullptr; // Mark as free for constructor
}
}
void WellContributionsCuda::APIalloc()
template<class Scalar>
void WellContributionsCuda<Scalar>::APIalloc()
{
cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells);
cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
cudaMalloc((void**)&d_val_pointers, sizeof(unsigned int) * (num_std_wells + 1));
cudaMalloc((void**)&d_Cnnzs,
sizeof(Scalar) * this->num_blocks * this->dim * this->dim_wells);
cudaMalloc((void**)&d_Dnnzs,
sizeof(Scalar) * this->num_std_wells * this->dim_wells * this->dim_wells);
cudaMalloc((void**)&d_Bnnzs,
sizeof(Scalar) * this->num_blocks * this->dim * this->dim_wells);
cudaMalloc((void**)&d_Ccols, sizeof(int) * this->num_blocks);
cudaMalloc((void**)&d_Bcols, sizeof(int) * this->num_blocks);
cudaMalloc((void**)&this->d_val_pointers, sizeof(unsigned int) * (this->num_std_wells + 1));
cudaCheckLastError("apply_gpu malloc failed");
}
// Apply the WellContributions, similar to StandardWell::apply()
// y -= (C^T *(D^-1*( B*x)))
void WellContributionsCuda::apply(double *d_x, double *d_y)
template<class Scalar>
void WellContributionsCuda<Scalar>::apply(Scalar* d_x, Scalar* d_y)
{
// apply MultisegmentWells
// make sure the stream is empty if timing measurements are done
cudaStreamSynchronize(stream);
if (num_ms_wells > 0) {
if (this->num_ms_wells > 0) {
// allocate pinned memory on host if not yet done
if (h_x == nullptr) {
cudaMallocHost(&h_x, sizeof(double) * N);
cudaMallocHost(&h_y, sizeof(double) * N);
cudaMallocHost(&h_x, sizeof(Scalar) * this->N);
cudaMallocHost(&h_y, sizeof(Scalar) * this->N);
}
// copy vectors x and y from GPU to CPU
cudaMemcpyAsync(h_x, d_x, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
cudaMemcpyAsync(h_y, d_y, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
cudaMemcpyAsync(h_x, d_x, sizeof(Scalar) * this->N,
cudaMemcpyDeviceToHost, stream);
cudaMemcpyAsync(h_y, d_y, sizeof(Scalar) * this->N,
cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
// actually apply MultisegmentWells
for (auto& well : multisegments) {
for (auto& well : this->multisegments) {
well->apply(h_x, h_y);
}
// copy vector y from CPU to GPU
cudaMemcpyAsync(d_y, h_y, sizeof(double) * N, cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_y, h_y, sizeof(Scalar) * this->N,
cudaMemcpyHostToDevice, stream);
cudaStreamSynchronize(stream);
}
// apply StandardWells
if (num_std_wells > 0) {
int smem_size = 2 * sizeof(double) * dim_wells;
apply_well_contributions <<< num_std_wells, 32, smem_size, stream>>>(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_x, d_y, dim, dim_wells, d_val_pointers);
if (this->num_std_wells > 0) {
int smem_size = 2 * sizeof(Scalar) * this->dim_wells;
apply_well_contributions <<< this->num_std_wells, 32, smem_size, stream>>>(d_Cnnzs,
d_Dnnzs,
d_Bnnzs,
d_Ccols,
d_Bcols,
d_x,
d_y,
this->dim,
this->dim_wells,
this->d_val_pointers);
}
}
void WellContributionsCuda::APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size)
template<class Scalar>
void WellContributionsCuda<Scalar>::APIaddMatrix(MatrixType type, int* colIndices,
Scalar* values, unsigned int val_size)
{
switch (type) {
case MatrixType::C:
cudaMemcpy(d_Cnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
cudaMemcpy(d_Ccols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_Cnnzs + this->num_blocks_so_far * this->dim * this->dim_wells,
values, sizeof(Scalar) * val_size * this->dim * this->dim_wells,
cudaMemcpyHostToDevice);
cudaMemcpy(d_Ccols + this->num_blocks_so_far, colIndices,
sizeof(int) * val_size, cudaMemcpyHostToDevice);
break;
case MatrixType::D:
cudaMemcpy(d_Dnnzs + num_std_wells_so_far * dim_wells * dim_wells, values, sizeof(double) * dim_wells * dim_wells, cudaMemcpyHostToDevice);
cudaMemcpy(d_Dnnzs + this->num_std_wells_so_far * this->dim_wells * this->dim_wells,
values, sizeof(Scalar) * this->dim_wells * this->dim_wells,
cudaMemcpyHostToDevice);
break;
case MatrixType::B:
cudaMemcpy(d_Bnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
cudaMemcpy(d_Bcols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
val_pointers[num_std_wells_so_far] = num_blocks_so_far;
if (num_std_wells_so_far == num_std_wells - 1) {
val_pointers[num_std_wells] = num_blocks;
cudaMemcpy(d_val_pointers, val_pointers.data(), sizeof(unsigned int) * (num_std_wells + 1), cudaMemcpyHostToDevice);
cudaMemcpy(d_Bnnzs + this->num_blocks_so_far * this->dim * this->dim_wells,
values, sizeof(Scalar) * val_size * this->dim * this->dim_wells,
cudaMemcpyHostToDevice);
cudaMemcpy(d_Bcols + this->num_blocks_so_far, colIndices,
sizeof(int) * val_size, cudaMemcpyHostToDevice);
this->val_pointers[this->num_std_wells_so_far] = this->num_blocks_so_far;
if (this->num_std_wells_so_far == this->num_std_wells - 1) {
this->val_pointers[this->num_std_wells] = this->num_blocks;
cudaMemcpy(d_val_pointers, this->val_pointers.data(),
sizeof(unsigned int) * (this->num_std_wells + 1),
cudaMemcpyHostToDevice);
}
break;
default:
@ -224,13 +251,16 @@ void WellContributionsCuda::APIaddMatrix(MatrixType type, int *colIndices, doubl
cudaCheckLastError("WellContributions::addMatrix() failed");
}
void WellContributionsCuda::setCudaStream(cudaStream_t stream_)
template<class Scalar>
void WellContributionsCuda<Scalar>::setCudaStream(cudaStream_t stream_)
{
this->stream = stream_;
for (auto& well : multisegments) {
for (auto& well : this->multisegments) {
well->setCudaStream(stream_);
}
}
template class WellContributionsCuda<double>;
} //namespace Opm
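Annotation, a minimal standalone sketch that is not from the commit, of the dynamic shared-memory pattern adopted in apply_well_contributions above: a templated kernel cannot declare an extern __shared__ array of Scalar for several Scalar types at once, so the buffer is declared as raw bytes and reinterpreted.
#include <cuda_runtime.h>

template<class Scalar>
__global__ void scaleWithSharedBuffer(Scalar* data, int n, Scalar factor)
{
    extern __shared__ unsigned char smem[];           // raw dynamic shared memory
    Scalar* buf = reinterpret_cast<Scalar*>(smem);    // typed view, one Scalar per thread

    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        buf[threadIdx.x] = data[i] * factor;
        data[i] = buf[threadIdx.x];
    }
}

// launch: the shared-memory byte count is passed explicitly, as smem_size is above, e.g.
// scaleWithSharedBuffer<<<blocks, threads, threads * sizeof(double), stream>>>(d, n, 2.0);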

View File

@ -25,10 +25,10 @@
#include <cuda_runtime.h>
namespace Opm
{
namespace Opm {
class WellContributionsCuda : public WellContributions
template<class Scalar>
class WellContributionsCuda : public WellContributions<Scalar>
{
public:
~WellContributionsCuda() override;
@ -41,33 +41,35 @@ public:
/// performs y -= (C^T * (D^-1 * (B*x))) for all Wells
/// \param[in] d_x vector x, must be on GPU
/// \param[inout] d_y vector y, must be on GPU
void apply(double *d_x, double *d_y);
void apply(Scalar* d_x, Scalar* d_y);
protected:
/// Allocate memory for the StandardWells
void APIalloc() override;
using MatrixType = typename WellContributions<Scalar>::MatrixType;
/// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
/// \param[in] type indicate if C, D or B is sent
/// \param[in] colIndices columnindices of blocks in C or B, ignored for D
/// \param[in] values array of nonzeroes
/// \param[in] val_size number of blocks in C or B, ignored for D
void APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size) override;
void APIaddMatrix(MatrixType type, int* colIndices,
Scalar* values, unsigned int val_size) override;
cudaStream_t stream;
// data for StandardWells, could remain nullptrs if not used
double *d_Cnnzs = nullptr;
double *d_Dnnzs = nullptr;
double *d_Bnnzs = nullptr;
int *d_Ccols = nullptr;
int *d_Bcols = nullptr;
double *d_z1 = nullptr;
double *d_z2 = nullptr;
Scalar* d_Cnnzs = nullptr;
Scalar* d_Dnnzs = nullptr;
Scalar* d_Bnnzs = nullptr;
int* d_Ccols = nullptr;
int* d_Bcols = nullptr;
Scalar* d_z1 = nullptr;
Scalar* d_z2 = nullptr;
unsigned int *d_val_pointers = nullptr;
double* h_x = nullptr;
double* h_y = nullptr;
Scalar* h_x = nullptr;
Scalar* h_y = nullptr;
};
} //namespace Opm

View File

@ -44,22 +44,20 @@
extern std::shared_ptr<std::thread> copyThread;
#endif // HAVE_OPENMP
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
using Opm::OpmLog;
using Dune::Timer;
const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
const cusparseOperation_t operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;
template <unsigned int block_size>
cusparseSolverBackend<block_size>::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int deviceID_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, deviceID_) {
template<class Scalar, unsigned int block_size>
cusparseSolverBackend<Scalar, block_size>::
cusparseSolverBackend(int verbosity_, int maxit_,
Scalar tolerance_, unsigned int deviceID_)
: Base(verbosity_, maxit_, tolerance_, deviceID_)
{
// initialize CUDA device, stream and libraries
cudaSetDevice(deviceID);
cudaCheckLastError("Could not get device");
@ -67,7 +65,8 @@ cusparseSolverBackend<block_size>::cusparseSolverBackend(int verbosity_, int max
cudaGetDeviceProperties(&props, deviceID);
cudaCheckLastError("Could not get device properties");
std::ostringstream out;
out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor;
out << "Name GPU: " << props.name << ", Compute Capability: "
<< props.major << "." << props.minor;
OpmLog::info(out.str());
cudaStreamCreate(&stream);
@ -84,26 +83,29 @@ cusparseSolverBackend<block_size>::cusparseSolverBackend(int verbosity_, int max
cudaCheckLastError("Could not set stream to cusparse");
}
template <unsigned int block_size>
cusparseSolverBackend<block_size>::~cusparseSolverBackend() {
template<class Scalar, unsigned int block_size>
cusparseSolverBackend<Scalar,block_size>::~cusparseSolverBackend()
{
finalize();
}
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::
gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res)
{
Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
int n = N;
double rho = 1.0, rhop;
double alpha, nalpha, beta;
double omega, nomega, tmp1, tmp2;
double norm, norm_0;
double zero = 0.0;
double one = 1.0;
double mone = -1.0;
Scalar rho = 1.0, rhop;
Scalar alpha, nalpha, beta;
Scalar omega, nomega, tmp1, tmp2;
Scalar norm, norm_0;
Scalar zero = 0.0;
Scalar one = 1.0;
Scalar mone = -1.0;
float it;
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsCuda&>(wellContribs).setCudaStream(stream);
static_cast<WellContributionsCuda<Scalar>&>(wellContribs).setCudaStream(stream);
}
cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
@ -147,7 +149,7 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
// apply wellContributions
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsCuda&>(wellContribs).apply(d_pw, d_v);
static_cast<WellContributionsCuda<Scalar>&>(wellContribs).apply(d_pw, d_v);
}
cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
@ -178,7 +180,7 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
// apply wellContributions
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsCuda&>(wellContribs).apply(d_s, d_t);
static_cast<WellContributionsCuda<Scalar>&>(wellContribs).apply(d_s, d_t);
}
cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
@ -190,7 +192,6 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
if (norm < tolerance * norm_0) {
break;
}
@ -210,15 +211,18 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
if (verbosity > 0) {
std::ostringstream out;
out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
", time per iteration: " << res.elapsed / it << ", iterations: " << it;
out << "=== converged: " << res.converged << ", conv_rate: "
<< res.conv_rate << ", time: " << res.elapsed
<< ", time per iteration: " << res.elapsed / it << ", iterations: " << it;
OpmLog::info(out.str());
}
}
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix) {
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::
initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
{
this->Nb = matrix->Nb;
this->N = Nb * block_size;
this->nnzb = matrix->nnzbs;
@ -232,46 +236,49 @@ void cusparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix
}
std::ostringstream out;
out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnz: " << nnzb << " blocks\n";
out << "Initializing GPU, matrix size: " << Nb
<< " blockrows, nnz: " << nnzb << " blocks\n";
if (useJacMatrix) {
out << "Blocks in ILU matrix: " << nnzbs_prec << "\n";
}
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
out << "Maxit: " << maxit << std::scientific
<< ", tolerance: " << tolerance << "\n";
OpmLog::info(out.str());
cudaMalloc((void**)&d_x, sizeof(double) * N);
cudaMalloc((void**)&d_b, sizeof(double) * N);
cudaMalloc((void**)&d_r, sizeof(double) * N);
cudaMalloc((void**)&d_rw, sizeof(double) * N);
cudaMalloc((void**)&d_p, sizeof(double) * N);
cudaMalloc((void**)&d_pw, sizeof(double) * N);
cudaMalloc((void**)&d_s, sizeof(double) * N);
cudaMalloc((void**)&d_t, sizeof(double) * N);
cudaMalloc((void**)&d_v, sizeof(double) * N);
cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
cudaMalloc((void**)&d_x, sizeof(Scalar) * N);
cudaMalloc((void**)&d_b, sizeof(Scalar) * N);
cudaMalloc((void**)&d_r, sizeof(Scalar) * N);
cudaMalloc((void**)&d_rw, sizeof(Scalar) * N);
cudaMalloc((void**)&d_p, sizeof(Scalar) * N);
cudaMalloc((void**)&d_pw, sizeof(Scalar) * N);
cudaMalloc((void**)&d_s, sizeof(Scalar) * N);
cudaMalloc((void**)&d_t, sizeof(Scalar) * N);
cudaMalloc((void**)&d_v, sizeof(Scalar) * N);
cudaMalloc((void**)&d_bVals, sizeof(Scalar) * nnz);
cudaMalloc((void**)&d_bCols, sizeof(int) * nnzb);
cudaMalloc((void**)&d_bRows, sizeof(int) * (Nb + 1));
if (useJacMatrix) {
cudaMalloc((void**)&d_mVals, sizeof(double) * nnzbs_prec * block_size * block_size);
cudaMalloc((void**)&d_mVals, sizeof(Scalar) * nnzbs_prec * block_size * block_size);
cudaMalloc((void**)&d_mCols, sizeof(int) * nnzbs_prec);
cudaMalloc((void**)&d_mRows, sizeof(int) * (Nb + 1));
} else {
cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
cudaMalloc((void**)&d_mVals, sizeof(Scalar) * nnz);
d_mCols = d_bCols;
d_mRows = d_bRows;
}
cudaCheckLastError("Could not allocate enough memory on GPU");
#if COPY_ROW_BY_ROW
cudaMallocHost((void**)&vals_contiguous, sizeof(double) * nnz);
cudaMallocHost((void**)&vals_contiguous, sizeof(Scalar) * nnz);
cudaCheckLastError("Could not allocate pinned memory");
#endif
initialized = true;
} // end initialize()
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::finalize() {
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::finalize()
{
if (initialized) {
cudaFree(d_x);
cudaFree(d_b);
@ -307,40 +314,54 @@ void cusparseSolverBackend<block_size>::finalize() {
}
} // end finalize()
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::copy_system_to_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix) {
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::
copy_system_to_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
{
Timer t;
cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int),
cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int),
cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_b, b, N * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, N * sizeof(Scalar), stream);
#if COPY_ROW_BY_ROW
int sum = 0;
for (int i = 0; i < Nb; ++i) {
int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
memcpy(vals_contiguous + sum, matrix->nnzValues + sum,
size_row * sizeof(Scalar) * block_size * block_size);
sum += size_row * block_size * block_size;
}
cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bVals, vals_contiguous,
nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
#else
cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bVals, matrix->nnzValues,
nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
if (useJacMatrix) {
#if HAVE_OPENMP
if(omp_get_max_threads() > 1)
copyThread->join();
#endif
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
nnzbs_prec * block_size * block_size * sizeof(Scalar),
cudaMemcpyHostToDevice, stream);
} else {
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
cudaMemcpyAsync(d_mVals, d_bVals,
nnz * sizeof(Scalar),
cudaMemcpyDeviceToDevice, stream);
}
#endif
if (useJacMatrix) {
cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int),
cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int),
cudaMemcpyHostToDevice, stream);
}
if (verbosity >= 3) {
@ -353,33 +374,43 @@ void cusparseSolverBackend<block_size>::copy_system_to_gpu(std::shared_ptr<Block
}
} // end copy_system_to_gpu()
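// Sketch (illustrative, not part of this patch): the COPY_ROW_BY_ROW path above stages the
// block values in page-locked ("pinned") host memory before calling cudaMemcpyAsync, because
// an async copy from ordinary pageable memory silently degrades to a synchronous transfer.
// A minimal, self-contained restatement of that idea; names and sizes are placeholders only.
#include <cuda_runtime.h>
#include <cstring>

void stage_and_upload(const double* src, std::size_t n, cudaStream_t stream, double* d_dst)
{
    double* pinned = nullptr;
    cudaMallocHost(reinterpret_cast<void**>(&pinned), n * sizeof(double)); // page-locked staging buffer
    std::memcpy(pinned, src, n * sizeof(double));                          // pack on the host
    cudaMemcpyAsync(d_dst, pinned, n * sizeof(double),
                    cudaMemcpyHostToDevice, stream);                       // true asynchronous H2D copy
    cudaStreamSynchronize(stream);                                         // wait before reusing 'pinned'
    cudaFreeHost(pinned);
}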
// don't copy rowpointers and colindices, they stay the same
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::update_system_on_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix) {
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::
update_system_on_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
{
Timer t;
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
cudaMemcpyAsync(d_b, b, N * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, sizeof(Scalar) * N, stream);
#if COPY_ROW_BY_ROW
int sum = 0;
for (int i = 0; i < Nb; ++i) {
int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
memcpy(vals_contiguous + sum, matrix->nnzValues + sum,
size_row * sizeof(Scalar) * block_size * block_size);
sum += size_row * block_size * block_size;
}
cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bVals, vals_contiguous,
nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
#else
cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bVals, matrix->nnzValues,
nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
if (useJacMatrix) {
#if HAVE_OPENMP
if(omp_get_max_threads() > 1)
copyThread->join();
if (omp_get_max_threads() > 1) {
copyThread->join();
}
#endif
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
nnzbs_prec * block_size * block_size * sizeof(Scalar),
cudaMemcpyHostToDevice, stream);
} else {
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(Scalar),
cudaMemcpyDeviceToDevice, stream);
}
#endif
@ -394,10 +425,9 @@ void cusparseSolverBackend<block_size>::update_system_on_gpu(std::shared_ptr<Blo
}
} // end update_system_on_gpu()
template <unsigned int block_size>
bool cusparseSolverBackend<block_size>::analyse_matrix() {
template<class Scalar, unsigned int block_size>
bool cusparseSolverBackend<Scalar,block_size>::analyse_matrix()
{
int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
Timer t;
@ -472,8 +502,9 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
return true;
} // end analyse_matrix()
template <unsigned int block_size>
bool cusparseSolverBackend<block_size>::create_preconditioner() {
template<class Scalar, unsigned int block_size>
bool cusparseSolverBackend<Scalar,block_size>::create_preconditioner()
{
Timer t;
cusparseDbsrilu02(cusparseHandle, order, \
@ -497,23 +528,24 @@ bool cusparseSolverBackend<block_size>::create_preconditioner() {
return true;
} // end create_preconditioner()
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::solve_system(WellContributions& wellContribs, BdaResult &res) {
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::
solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res)
{
// actually solve
gpu_pbicgstab(wellContribs, res);
cudaStreamSynchronize(stream);
cudaCheckLastError("Something went wrong during the GPU solve");
} // end solve_system()
// copy result to host memory
// caller must be sure that x is a valid array
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::get_result(double *x) {
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::get_result(Scalar* x)
{
Timer t;
cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream);
cudaMemcpyAsync(x, d_x, N * sizeof(Scalar), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
if (verbosity > 2) {
@ -523,14 +555,13 @@ void cusparseSolverBackend<block_size>::get_result(double *x) {
}
} // end get_result()
template <unsigned int block_size>
SolverStatus cusparseSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
double *b,
std::shared_ptr<BlockedMatrix> jacMatrix,
WellContributions& wellContribs,
BdaResult &res)
template<class Scalar, unsigned int block_size>
SolverStatus cusparseSolverBackend<Scalar,block_size>::
solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res)
{
if (initialized == false) {
initialize(matrix, jacMatrix);
@ -551,18 +582,14 @@ SolverStatus cusparseSolverBackend<block_size>::solve_system(std::shared_ptr<Blo
return SolverStatus::BDA_SOLVER_SUCCESS;
}
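// Sketch (illustrative, not part of this patch): rough call order of the backend's public
// interface as declared in the header below; matrix, b, jacMatrix, wellContribs and x are
// assumed to exist already and namespace qualifiers are abbreviated. The first call to
// solve_system() triggers initialize() + copy_system_to_gpu(); later calls only refresh the
// values already resident on the GPU via update_system_on_gpu().
//
//   cusparseSolverBackend<double, 3> backend(/*verbosity=*/1, /*maxit=*/200,
//                                            /*tolerance=*/1e-2, /*deviceID=*/0);
//   BdaResult res;
//   auto status = backend.solve_system(matrix, b, jacMatrix, wellContribs, res);
//   if (status == SolverStatus::BDA_SOLVER_SUCCESS) {
//       backend.get_result(x);   // copies d_x back into the caller's array x
//   }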
#define INSTANTIATE_TYPE(T) \
template class cusparseSolverBackend<T,1>; \
template class cusparseSolverBackend<T,2>; \
template class cusparseSolverBackend<T,3>; \
template class cusparseSolverBackend<T,4>; \
template class cusparseSolverBackend<T,5>; \
template class cusparseSolverBackend<T,6>;
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template cusparseSolverBackend<n>::cusparseSolverBackend(int, int, double, unsigned int); \
INSTANTIATE_TYPE(double)
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator

View File

@ -28,16 +28,13 @@
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
#include <opm/simulators/linalg/bda/WellContributions.hpp>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
/// This class implements a cusparse-based ilu0-bicgstab solver on GPU
template <unsigned int block_size>
class cusparseSolverBackend : public BdaSolver<block_size> {
typedef BdaSolver<block_size> Base;
template<class Scalar, unsigned int block_size>
class cusparseSolverBackend : public BdaSolver<Scalar,block_size>
{
using Base = BdaSolver<Scalar,block_size>;
using Base::N;
using Base::Nb;
@ -50,7 +47,6 @@ class cusparseSolverBackend : public BdaSolver<block_size> {
using Base::initialized;
private:
cublasHandle_t cublasHandle;
cusparseHandle_t cusparseHandle;
cudaStream_t stream;
@ -58,13 +54,13 @@ private:
bsrilu02Info_t info_M;
bsrsv2Info_t info_L, info_U;
// b: bsr matrix, m: preconditioner
double *d_bVals, *d_mVals;
Scalar *d_bVals, *d_mVals;
int *d_bCols, *d_mCols;
int *d_bRows, *d_mRows;
double *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
double *d_pw, *d_s, *d_t, *d_v;
Scalar *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
Scalar *d_pw, *d_s, *d_t, *d_v;
void *d_buffer;
double *vals_contiguous; // only used if COPY_ROW_BY_ROW is true in cusparseSolverBackend.cpp
Scalar *vals_contiguous; // only used if COPY_ROW_BY_ROW is true in cusparseSolverBackend.cpp
bool analysis_done = false;
@ -77,12 +73,13 @@ private:
/// Solve linear system using ilu0-bicgstab
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
void gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res);
/// Initialize GPU and allocate memory
/// \param[in] matrix matrix for spmv
/// \param[in] jacMatrix matrix for preconditioner
void initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix);
void initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);
/// Clean memory
void finalize();
@ -92,14 +89,18 @@ private:
/// \param[in] matrix matrix for spmv
/// \param[in] b input vector, contains N values
/// \param[in] jacMatrix matrix for preconditioner
void copy_system_to_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix);
void copy_system_to_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);
/// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
/// also copy matrix for preconditioner if needed
/// \param[in] matrix matrix for spmv
/// \param[in] b input vector, contains N values
/// \param[in] jacMatrix matrix for preconditioner
void update_system_on_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix);
void update_system_on_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);
/// Analyse sparsity pattern to extract parallelism
/// \return true iff analysis was successful
@ -112,17 +113,16 @@ private:
/// Solve linear system
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
void solve_system(WellContributions& wellContribs, BdaResult &res);
void solve_system(WellContributions<Scalar>& wellContribs, BdaResult &res);
public:
/// Construct a cusparseSolver
/// \param[in] linear_solver_verbosity verbosity of cusparseSolver
/// \param[in] maxit maximum number of iterations for cusparseSolver
/// \param[in] tolerance required relative tolerance for cusparseSolver
/// \param[in] deviceID the device to be used
cusparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int deviceID);
cusparseSolverBackend(int linear_solver_verbosity, int maxit,
Scalar tolerance, unsigned int deviceID);
/// Destroy a cusparseSolver, and free memory
~cusparseSolverBackend();
@ -134,17 +134,19 @@ public:
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
/// \return status code
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res) override;
/// Get resulting vector x after linear solve, also includes post processing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void get_result(double *x) override;
void get_result(Scalar* x) override;
}; // end class cusparseSolverBackend
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
#endif

View File

@ -31,33 +31,29 @@
#include <sstream>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
using Opm::OpmLog;
using Dune::Timer;
template <unsigned int block_size>
BILU0<block_size>::BILU0(bool opencl_ilu_parallel_, int verbosity_) :
Preconditioner<block_size>(verbosity_), opencl_ilu_parallel(opencl_ilu_parallel_)
template<class Scalar, unsigned int block_size>
BILU0<Scalar,block_size>::BILU0(bool opencl_ilu_parallel_, int verbosity_)
: Base(verbosity_)
, opencl_ilu_parallel(opencl_ilu_parallel_)
{
#if CHOW_PATEL
chowPatelIlu.setVerbosity(verbosity);
#endif
}
template <unsigned int block_size>
bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
template<class Scalar, unsigned int block_size>
bool BILU0<Scalar,block_size>::analyze_matrix(BlockedMatrix<Scalar>* mat)
{
return analyze_matrix(mat, nullptr);
}
template <unsigned int block_size>
bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat)
template<class Scalar, unsigned int block_size>
bool BILU0<Scalar,block_size>::
analyze_matrix(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
{
const unsigned int bs = block_size;
@ -77,30 +73,33 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
CSCRowIndices.resize(matToDecompose->nnzbs);
CSCColPointers.resize(Nb + 1);
LUmat = std::make_unique<BlockedMatrix>(*matToDecompose);
LUmat = std::make_unique<BlockedMatrix<Scalar>>(*matToDecompose);
Timer t_convert;
csrPatternToCsc(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb);
csrPatternToCsc(matToDecompose->colIndices, matToDecompose->rowPointers,
CSCRowIndices.data(), CSCColPointers.data(), Nb);
if(verbosity >= 3){
std::ostringstream out;
out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
OpmLog::info(out.str());
}
} else {
LUmat = std::make_unique<BlockedMatrix>(*matToDecompose);
LUmat = std::make_unique<BlockedMatrix<Scalar>>(*matToDecompose);
}
Timer t_analysis;
std::ostringstream out;
if (opencl_ilu_parallel) {
out << "opencl_ilu_parallel: true (level_scheduling)\n";
findLevelScheduling(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
findLevelScheduling(matToDecompose->colIndices, matToDecompose->rowPointers,
CSCRowIndices.data(), CSCColPointers.data(), Nb,
&numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
} else {
out << "opencl_ilu_parallel: false\n";
// numColors = 1;
// rowsPerColor.emplace_back(Nb);
numColors = Nb;
for(int i = 0; i < Nb; ++i){
for (int i = 0; i < Nb; ++i) {
rowsPerColor.emplace_back(1);
}
}
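// Sketch (illustrative, not part of this patch, and not the library's findLevelScheduling):
// what level scheduling computes under the usual definition. A row can be processed once all
// rows it depends on (its strictly lower-triangular neighbours) are done, so
// level[i] = 1 + max(level[j]) over the nonzeros j < i of row i; rows sharing a level form
// one "color" and can be solved in parallel.
#include <algorithm>
#include <vector>

std::vector<int> levelsFromCsrPattern(const std::vector<int>& rowPointers,
                                      const std::vector<int>& colIndices)
{
    const int n = static_cast<int>(rowPointers.size()) - 1;
    std::vector<int> level(n, 0);
    for (int i = 0; i < n; ++i) {
        for (int p = rowPointers[i]; p < rowPointers[i + 1]; ++p) {
            const int j = colIndices[p];
            if (j < i) { // strictly lower part only
                level[i] = std::max(level[i], level[j] + 1);
            }
        }
    }
    return level; // number of colors == max(level) + 1 (for n > 0)
}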
@ -118,44 +117,52 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
invDiagVals.resize(mat->Nb * bs * bs);
#if CHOW_PATEL
Lmat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
Umat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
Lmat = std::make_unique<BlockedMatrix<Scalar>>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
Umat = std::make_unique<BlockedMatrix<Scalar>>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
#endif
s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * bs * bs * mat->Nb);
s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1));
s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb);
s.rowIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned) * LUmat->Nb);
#if CHOW_PATEL
s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * bs * bs * Lmat->nnzbs);
s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * bs * bs * Lmat->nnzbs);
s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
#else
s.LUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * LUmat->nnzbs);
s.LUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * bs * bs * LUmat->nnzbs);
s.LUcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->nnzbs);
s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1));
#endif
events.resize(3);
err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals.data(), nullptr, &events[0]);
err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0,
mat->Nb * sizeof(Scalar) * bs * bs,
invDiagVals.data(), nullptr, &events[0]);
rowsPerColorPrefix.resize(numColors + 1); // resize value-initializes the new entries to 0

for (int i = 0; i < numColors; ++i) {
rowsPerColorPrefix[i + 1] = rowsPerColorPrefix[i] + rowsPerColor[i];
}
err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0,
(numColors + 1) * sizeof(int),
rowsPerColorPrefix.data(), nullptr, &events[1]);
if (opencl_ilu_parallel) {
err |= queue->enqueueWriteBuffer(s.rowIndices, CL_FALSE, 0, Nb * sizeof(unsigned), fromOrder.data(), nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(s.rowIndices, CL_FALSE, 0,
Nb * sizeof(unsigned), fromOrder.data(),
nullptr, &events[2]);
} else {
// fromOrder is not initialized, so use something else to fill s.rowIndices
// s.rowIndices[i] == i must hold, since every rowidx is mapped to itself (i.e. no actual mapping)
// rowsPerColorPrefix is misused here, it contains an increasing sequence (0, 1, 2, ...)
err |= queue->enqueueWriteBuffer(s.rowIndices, CL_FALSE, 0, Nb * sizeof(unsigned), rowsPerColorPrefix.data(), nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(s.rowIndices, CL_FALSE, 0,
Nb * sizeof(unsigned),
rowsPerColorPrefix.data(), nullptr, &events[2]);
}
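// Worked example (illustrative, not part of this patch): with opencl_ilu_parallel == false
// every row is its own color, so rowsPerColor == {1, 1, ..., 1} and the prefix sum built
// above becomes rowsPerColorPrefix == {0, 1, 2, ..., Nb}. E.g. for Nb == 4:
//   rowsPerColor = {1,1,1,1}  ->  rowsPerColorPrefix = {0,1,2,3,4},
// whose first Nb entries {0,1,2,3} are exactly the identity mapping s.rowIndices[i] == i
// required here, which is why the prefix array can double as the row-index buffer.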
cl::WaitForEvents(events);
@ -168,17 +175,15 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
return true;
}
template <unsigned int block_size>
bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
template<class Scalar, unsigned int block_size>
bool BILU0<Scalar,block_size>::create_preconditioner(BlockedMatrix<Scalar>* mat)
{
return create_preconditioner(mat, nullptr);
}
template <unsigned int block_size>
bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat)
template<class Scalar, unsigned int block_size>
bool BILU0<Scalar,block_size>::
create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
{
const unsigned int bs = block_size;
@ -186,7 +191,8 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
// TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
Timer t_copy;
memcpy(LUmat->nnzValues, matToDecompose->nnzValues, sizeof(double) * bs * bs * matToDecompose->nnzbs);
memcpy(LUmat->nnzValues, matToDecompose->nnzValues,
sizeof(Scalar) * bs * bs * matToDecompose->nnzbs);
if (verbosity >= 3){
std::ostringstream out;
@ -205,7 +211,9 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
Timer t_copyToGpu;
events.resize(1);
queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]);
queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0,
LUmat->nnzbs * bs * bs * sizeof(Scalar),
LUmat->nnzValues, nullptr, &events[0]);
std::call_once(pattern_uploaded, [&](){
// find the positions of each diagonal block
@ -213,14 +221,18 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
int rowStart = LUmat->rowPointers[row];
int rowEnd = LUmat->rowPointers[row+1];
auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row);
auto candidate = std::find(LUmat->colIndices + rowStart,
LUmat->colIndices + rowEnd, row);
assert(candidate != LUmat->colIndices + rowEnd);
diagIndex[row] = candidate - LUmat->colIndices;
}
events.resize(4);
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int),
diagIndex.data(), nullptr, &events[1]);
queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int),
LUmat->colIndices, nullptr, &events[2]);
queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int),
LUmat->rowPointers, nullptr, &events[3]);
});
cl::WaitForEvents(events);
@ -242,11 +254,12 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
const unsigned int firstRow = rowsPerColorPrefix[color];
const unsigned int lastRow = rowsPerColorPrefix[color + 1];
if (verbosity >= 5) {
out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
out << "color " << color << ": " << firstRow << " - " << lastRow
<< " = " << lastRow - firstRow << "\n";
}
OpenclKernels::ILU_decomp(firstRow, lastRow, s.rowIndices,
s.LUvals, s.LUcols, s.LUrows, s.diagIndex,
s.invDiagVals, rowsPerColor[color], block_size);
OpenclKernels<Scalar>::ILU_decomp(firstRow, lastRow, s.rowIndices,
s.LUvals, s.LUcols, s.LUrows, s.diagIndex,
s.invDiagVals, rowsPerColor[color], block_size);
}
if (verbosity >= 3) {
@ -259,43 +272,42 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
return true;
} // end create_preconditioner()
// kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
// however, if individual kernel calls are timed, waiting for events is needed
// behavior on other GPUs is untested
template <unsigned int block_size>
void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
template<class Scalar, unsigned int block_size>
void BILU0<Scalar,block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
{
const double relaxation = 0.9;
const Scalar relaxation = 0.9;
cl::Event event;
Timer t_apply;
for (int color = 0; color < numColors; ++color) {
#if CHOW_PATEL
OpenclKernels::ILU_apply1(s.rowIndices, s.Lvals, s.Lcols, s.Lrows,
s.diagIndex, y, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
OpenclKernels<Scalar>::ILU_apply1(s.rowIndices, s.Lvals, s.Lcols, s.Lrows,
s.diagIndex, y, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
#else
OpenclKernels::ILU_apply1(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
s.diagIndex, y, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
OpenclKernels<Scalar>::ILU_apply1(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
s.diagIndex, y, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
#endif
}
for (int color = numColors - 1; color >= 0; --color) {
#if CHOW_PATEL
OpenclKernels::ILU_apply2(s.rowIndices, s.Uvals, s.Ucols, s.Urows,
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
OpenclKernels<Scalar>::ILU_apply2(s.rowIndices, s.Uvals, s.Ucols, s.Urows,
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
#else
OpenclKernels::ILU_apply2(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
OpenclKernels<Scalar>::ILU_apply2(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
#endif
}
// apply relaxation
OpenclKernels::scale(x, relaxation, N);
OpenclKernels<Scalar>::scale(x, relaxation, N);
if (verbosity >= 4) {
std::ostringstream out;
@ -304,20 +316,14 @@ void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
}
}
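// Sketch (illustrative, not part of this patch): color by color, the two kernel loops above
// implement the usual ILU0 application x = omega * U^{-1} (L^{-1} y) with omega = 0.9.
// A scalar (block_size == 1) CPU restatement of the same two sweeps, assuming the factors are
// stored in one CSR matrix LU with unit L-diagonal and invDiag holding the inverted U-diagonal:
#include <vector>

void iluApply(const std::vector<int>& rowPtr, const std::vector<int>& col,
              const std::vector<double>& LU, const std::vector<int>& diagIdx,
              const std::vector<double>& invDiag,
              const std::vector<double>& y, std::vector<double>& x, double omega = 0.9)
{
    const int n = static_cast<int>(rowPtr.size()) - 1;
    for (int i = 0; i < n; ++i) {               // forward sweep: L z = y (unit diagonal)
        double s = y[i];
        for (int p = rowPtr[i]; p < diagIdx[i]; ++p)
            s -= LU[p] * x[col[p]];
        x[i] = s;
    }
    for (int i = n - 1; i >= 0; --i) {          // backward sweep: U x = z
        double s = x[i];
        for (int p = diagIdx[i] + 1; p < rowPtr[i + 1]; ++p)
            s -= LU[p] * x[col[p]];
        x[i] = invDiag[i] * s;
    }
    for (double& xi : x)                        // relaxation, as in OpenclKernels::scale
        xi *= omega;
}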
#define INSTANCE_TYPE(T) \
template class BILU0<T,1>; \
template class BILU0<T,2>; \
template class BILU0<T,3>; \
template class BILU0<T,4>; \
template class BILU0<T,5>; \
template class BILU0<T,6>;
INSTANCE_TYPE(double)
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template class BILU0<n>;
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator

View File

@ -29,18 +29,15 @@
#include <opm/simulators/linalg/bda/opencl/ChowPatelIlu.hpp>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
/// This class implements a Blocked ILU0 preconditioner
/// The decomposition is done on GPU, using exact decomposition, or ChowPatel decomposition
/// The preconditioner is applied via two exact triangular solves
template <unsigned int block_size>
class BILU0 : public Preconditioner<block_size>
template<class Scalar, unsigned int block_size>
class BILU0 : public Preconditioner<Scalar,block_size>
{
typedef Preconditioner<block_size> Base;
using Base = Preconditioner<Scalar,block_size>;
using Base::N;
using Base::Nb;
@ -53,11 +50,11 @@ class BILU0 : public Preconditioner<block_size>
using Base::err;
private:
std::unique_ptr<BlockedMatrix> LUmat = nullptr;
std::unique_ptr<BlockedMatrix<Scalar>> LUmat{};
#if CHOW_PATEL
std::unique_ptr<BlockedMatrix> Lmat = nullptr, Umat = nullptr;
std::unique_ptr<BlockedMatrix<Scalar>> Lmat{}, Umat{};
#endif
std::vector<double> invDiagVals;
std::vector<Scalar> invDiagVals;
std::vector<int> diagIndex;
std::vector<int> rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel
std::vector<int> rowsPerColorPrefix; // the prefix sum of rowsPerColor
@ -67,7 +64,7 @@ private:
bool opencl_ilu_parallel;
typedef struct {
struct GPU_storage {
cl::Buffer invDiagVals; // nnz values of diagonal blocks of the matrix, inverted
cl::Buffer diagIndex; // index of diagonal block of each row, used to differentiate between lower and upper triangular part
cl::Buffer rowsPerColor; // number of rows for every color
@ -80,7 +77,7 @@ private:
#else
cl::Buffer LUvals, LUcols, LUrows;
#endif
} GPU_storage;
};
GPU_storage s;
@ -93,21 +90,25 @@ public:
BILU0(bool opencl_ilu_parallel, int verbosity);
// analysis, extract parallelism if specified
bool analyze_matrix(BlockedMatrix *mat) override;
bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
bool analyze_matrix(BlockedMatrix<Scalar>* mat) override;
bool analyze_matrix(BlockedMatrix<Scalar>* mat,
BlockedMatrix<Scalar>* jacMat) override;
// ilu_decomposition
bool create_preconditioner(BlockedMatrix *mat) override;
bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
bool create_preconditioner(BlockedMatrix<Scalar>* mat) override;
bool create_preconditioner(BlockedMatrix<Scalar>* mat,
BlockedMatrix<Scalar>* jacMat) override;
// apply preconditioner, x = prec(y)
// via Lz = y
// and Ux = z
void apply(const cl::Buffer& y, cl::Buffer& x) override;
std::tuple<std::vector<int>, std::vector<int>, std::vector<int>> get_preconditioner_structure()
std::tuple<std::vector<int>, std::vector<int>, std::vector<int>>
get_preconditioner_structure()
{
return {{LUmat->rowPointers, LUmat->rowPointers + (Nb + 1)}, {LUmat->colIndices, LUmat->colIndices + nnzb}, diagIndex};
return {{LUmat->rowPointers, LUmat->rowPointers + (Nb + 1)},
{LUmat->colIndices, LUmat->colIndices + nnzb}, diagIndex};
}
std::pair<cl::Buffer, cl::Buffer> get_preconditioner_data()
@ -120,8 +121,6 @@ public:
}
};
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
#endif

View File

@ -34,26 +34,25 @@
#include <sstream>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
using Opm::OpmLog;
using Dune::Timer;
template <unsigned int block_size>
BISAI<block_size>::BISAI(bool opencl_ilu_parallel_, int verbosity_) :
Preconditioner<block_size>(verbosity_)
template<class Scalar, unsigned int block_size>
BISAI<Scalar,block_size>::BISAI(bool opencl_ilu_parallel_, int verbosity_)
: Base(verbosity_)
{
#if CHOW_PATEL
OPM_THROW(std::logic_error, "Error --linear-solver=isai cannot be used if ChowPatelIlu is used, probably defined by CMake\n");
#endif
bilu0 = std::make_unique<BILU0<block_size> >(opencl_ilu_parallel_, verbosity_);
bilu0 = std::make_unique<BILU0<Scalar,block_size>>(opencl_ilu_parallel_, verbosity_);
}
template <unsigned int block_size>
void BISAI<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_)
template<class Scalar, unsigned int block_size>
void BISAI<Scalar,block_size>::
setOpencl(std::shared_ptr<cl::Context>& context_,
std::shared_ptr<cl::CommandQueue>& queue_)
{
context = context_;
queue = queue_;
@ -61,7 +60,9 @@ void BISAI<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::s
bilu0->setOpencl(context, queue);
}
std::vector<int> buildCsrToCscOffsetMap(std::vector<int> colPointers, std::vector<int> rowIndices){
std::vector<int>
buildCsrToCscOffsetMap(std::vector<int> colPointers, std::vector<int> rowIndices)
{
std::vector<int> aux(colPointers); // colPointers must be copied to this vector
std::vector<int> csrToCscOffsetMap(rowIndices.size()); // map must have the same size as the indices vector
@ -77,14 +78,15 @@ std::vector<int> buildCsrToCscOffsetMap(std::vector<int> colPointers, std::vecto
return csrToCscOffsetMap;
}
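// Worked example (illustrative, not part of this patch), following the contract documented in
// the header further down: entry i of the CSR value array lives at position
// csrToCscOffsetMap[i] of the CSC value array. For a 2x2 block pattern with all four blocks
// present, the CSR order is (0,0),(0,1),(1,0),(1,1) and the CSC order is (0,0),(1,0),(0,1),(1,1),
// so csrToCscOffsetMap == {0, 2, 1, 3}.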
template <unsigned int block_size>
bool BISAI<block_size>::analyze_matrix(BlockedMatrix *mat)
template<class Scalar, unsigned int block_size>
bool BISAI<Scalar,block_size>::analyze_matrix(BlockedMatrix<Scalar>* mat)
{
return analyze_matrix(mat, nullptr);
}
template <unsigned int block_size>
bool BISAI<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat)
template<class Scalar, unsigned int block_size>
bool BISAI<Scalar,block_size>::
analyze_matrix(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
{
const unsigned int bs = block_size;
auto *m = mat;
@ -105,21 +107,22 @@ bool BISAI<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
}
}
template <unsigned int block_size>
void BISAI<block_size>::buildLowerSubsystemsStructures(){
template<class Scalar, unsigned int block_size>
void BISAI<Scalar,block_size>::buildLowerSubsystemsStructures()
{
lower.subsystemPointers.assign(Nb + 1, 0);
Dune::Timer t_buildLowerSubsystemsStructures;
for(int tcol = 0; tcol < Nb; tcol++){
for (int tcol = 0; tcol < Nb; tcol++) {
int frow = diagIndex[tcol] + 1;
int lrow = colPointers[tcol + 1];
int nx = lrow - frow;
int nv = 0;
for(int sweep = 0; sweep < nx - 1; sweep++){
for(int xid = sweep + 1; xid < nx; xid++){
for(int ptr = diagIndex[rowIndices[frow + sweep]] + 1; ptr < colPointers[rowIndices[frow + sweep + 1]]; ptr++){
for (int sweep = 0; sweep < nx - 1; sweep++) {
for (int xid = sweep + 1; xid < nx; xid++) {
for (int ptr = diagIndex[rowIndices[frow + sweep]] + 1; ptr < colPointers[rowIndices[frow + sweep + 1]]; ptr++) {
if(rowIndices[ptr] == rowIndices[frow + xid]){
lower.nzIndices.push_back(csrToCscOffsetMap[ptr]);
lower.knownRhsIndices.push_back(csrToCscOffsetMap[frow + sweep]);
@ -133,29 +136,31 @@ void BISAI<block_size>::buildLowerSubsystemsStructures(){
lower.subsystemPointers[tcol + 1] = lower.subsystemPointers[tcol] + nv;
}
if(verbosity >= 4){
if (verbosity >= 4) {
std::ostringstream out;
out << "BISAI buildLowerSubsystemsStructures time: " << t_buildLowerSubsystemsStructures.stop() << " s";
out << "BISAI buildLowerSubsystemsStructures time: "
<< t_buildLowerSubsystemsStructures.stop() << " s";
OpmLog::info(out.str());
}
}
template <unsigned int block_size>
void BISAI<block_size>::buildUpperSubsystemsStructures(){
template<class Scalar, unsigned int block_size>
void BISAI<Scalar,block_size>::buildUpperSubsystemsStructures()
{
upper.subsystemPointers.assign(Nb + 1, 0);
Dune::Timer t_buildUpperSubsystemsStructures;
for(int tcol = 0; tcol < Nb; tcol++){
for (int tcol = 0; tcol < Nb; tcol++) {
int frow = colPointers[tcol];
int lrow = diagIndex[tcol];
int nx = lrow - frow + 1;
int nv = 0;
for(int sweep = 0; sweep < nx - 1; sweep++){
for(int xid = 0; xid < nx; xid++){
for(int ptr = colPointers[rowIndices[lrow - sweep]]; ptr < diagIndex[rowIndices[lrow - sweep]]; ptr++){
if(rowIndices[ptr] == rowIndices[lrow - xid]){
for (int sweep = 0; sweep < nx - 1; sweep++) {
for (int xid = 0; xid < nx; xid++) {
for (int ptr = colPointers[rowIndices[lrow - sweep]]; ptr < diagIndex[rowIndices[lrow - sweep]]; ptr++) {
if (rowIndices[ptr] == rowIndices[lrow - xid]) {
upper.nzIndices.push_back(csrToCscOffsetMap[ptr]);
upper.knownRhsIndices.push_back(csrToCscOffsetMap[lrow - sweep]);
upper.unknownRhsIndices.push_back(csrToCscOffsetMap[lrow - xid]);
@ -168,15 +173,17 @@ void BISAI<block_size>::buildUpperSubsystemsStructures(){
upper.subsystemPointers[tcol + 1] = upper.subsystemPointers[tcol] + nv;
}
if(verbosity >= 4){
if (verbosity >= 4) {
std::ostringstream out;
out << "BISAI buildUpperSubsystemsStructures time: " << t_buildUpperSubsystemsStructures.stop() << " s";
out << "BISAI buildUpperSubsystemsStructures time: "
<< t_buildUpperSubsystemsStructures.stop() << " s";
OpmLog::info(out.str());
}
}
template <unsigned int block_size>
bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat)
template<class Scalar, unsigned int block_size>
bool BISAI<Scalar,block_size>::
create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
{
const unsigned int bs = block_size;
@ -199,48 +206,93 @@ bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
buildLowerSubsystemsStructures();
buildUpperSubsystemsStructures();
d_colPointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * colPointers.size());
d_rowIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * rowIndices.size());
d_csrToCscOffsetMap = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * csrToCscOffsetMap.size());
d_diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * diagIndex.size());
d_invLvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * nnzb * bs * bs);
d_invUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * nnzb * bs * bs);
d_invL_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * Nb * bs);
d_lower.subsystemPointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * lower.subsystemPointers.size());
d_upper.subsystemPointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * upper.subsystemPointers.size());
d_colPointers = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(int) * colPointers.size());
d_rowIndices = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(int) * rowIndices.size());
d_csrToCscOffsetMap = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(int) * csrToCscOffsetMap.size());
d_diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(int) * diagIndex.size());
d_invLvals = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(Scalar) * nnzb * bs * bs);
d_invUvals = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(Scalar) * nnzb * bs * bs);
d_invL_x = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(Scalar) * Nb * bs);
d_lower.subsystemPointers = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(int) * lower.subsystemPointers.size());
d_upper.subsystemPointers = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(int) * upper.subsystemPointers.size());
if(!lower.nzIndices.empty()){ // knownRhsIndices and unknownRhsIndices will also be empty if nzIndices is empty
d_lower.nzIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * lower.nzIndices.size());
d_lower.knownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * lower.knownRhsIndices.size());
d_lower.unknownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * lower.unknownRhsIndices.size());
if (!lower.nzIndices.empty()) { // knownRhsIndices and unknownRhsIndices will also be empty if nzIndices is empty
d_lower.nzIndices = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(int) * lower.nzIndices.size());
d_lower.knownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(int) * lower.knownRhsIndices.size());
d_lower.unknownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(int) * lower.unknownRhsIndices.size());
}
if(!upper.nzIndices.empty()){ // knownRhsIndices and unknownRhsIndices will also be empty if nzIndices is empty
d_upper.nzIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * upper.nzIndices.size());
d_upper.knownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * upper.knownRhsIndices.size());
d_upper.unknownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * upper.unknownRhsIndices.size());
if (!upper.nzIndices.empty()) { // knownRhsIndices and unknownRhsIndices will also be empty if nzIndices is empty
d_upper.nzIndices = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(int) * upper.nzIndices.size());
d_upper.knownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(int) * upper.knownRhsIndices.size());
d_upper.unknownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(int) * upper.unknownRhsIndices.size());
}
events.resize(6);
err = queue->enqueueWriteBuffer(d_colPointers, CL_FALSE, 0, colPointers.size() * sizeof(int), colPointers.data(), nullptr, &events[0]);
err |= queue->enqueueWriteBuffer(d_rowIndices, CL_FALSE, 0, rowIndices.size() * sizeof(int), rowIndices.data(), nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(d_csrToCscOffsetMap, CL_FALSE, 0, csrToCscOffsetMap.size() * sizeof(int), csrToCscOffsetMap.data(), nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(d_diagIndex, CL_FALSE, 0, diagIndex.size() * sizeof(int), diagIndex.data(), nullptr, &events[3]);
err |= queue->enqueueWriteBuffer(d_lower.subsystemPointers, CL_FALSE, 0, sizeof(int) * lower.subsystemPointers.size(), lower.subsystemPointers.data(), nullptr, &events[4]);
err |= queue->enqueueWriteBuffer(d_upper.subsystemPointers, CL_FALSE, 0, sizeof(int) * upper.subsystemPointers.size(), upper.subsystemPointers.data(), nullptr, &events[5]);
err = queue->enqueueWriteBuffer(d_colPointers, CL_FALSE, 0,
colPointers.size() * sizeof(int),
colPointers.data(), nullptr, &events[0]);
err |= queue->enqueueWriteBuffer(d_rowIndices, CL_FALSE, 0,
rowIndices.size() * sizeof(int),
rowIndices.data(), nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(d_csrToCscOffsetMap, CL_FALSE, 0,
csrToCscOffsetMap.size() * sizeof(int),
csrToCscOffsetMap.data(), nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(d_diagIndex, CL_FALSE, 0,
diagIndex.size() * sizeof(int),
diagIndex.data(), nullptr, &events[3]);
err |= queue->enqueueWriteBuffer(d_lower.subsystemPointers, CL_FALSE, 0,
sizeof(int) * lower.subsystemPointers.size(),
lower.subsystemPointers.data(), nullptr, &events[4]);
err |= queue->enqueueWriteBuffer(d_upper.subsystemPointers, CL_FALSE, 0,
sizeof(int) * upper.subsystemPointers.size(),
upper.subsystemPointers.data(), nullptr, &events[5]);
if(!lower.nzIndices.empty()){
if (!lower.nzIndices.empty()) {
events.resize(events.size() + 3);
err |= queue->enqueueWriteBuffer(d_lower.nzIndices, CL_FALSE, 0, sizeof(int) * lower.nzIndices.size(), lower.nzIndices.data(), nullptr, &events[events.size() - 3]);
err |= queue->enqueueWriteBuffer(d_lower.knownRhsIndices, CL_FALSE, 0, sizeof(int) * lower.knownRhsIndices.size(), lower.knownRhsIndices.data(), nullptr, &events[events.size() - 2]);
err |= queue->enqueueWriteBuffer(d_lower.unknownRhsIndices, CL_FALSE, 0, sizeof(int) * lower.unknownRhsIndices.size(), lower.unknownRhsIndices.data(), nullptr, &events[events.size() - 1]);
err |= queue->enqueueWriteBuffer(d_lower.nzIndices, CL_FALSE, 0,
sizeof(int) * lower.nzIndices.size(),
lower.nzIndices.data(), nullptr,
&events[events.size() - 3]);
err |= queue->enqueueWriteBuffer(d_lower.knownRhsIndices, CL_FALSE, 0,
sizeof(int) * lower.knownRhsIndices.size(),
lower.knownRhsIndices.data(), nullptr,
&events[events.size() - 2]);
err |= queue->enqueueWriteBuffer(d_lower.unknownRhsIndices, CL_FALSE, 0,
sizeof(int) * lower.unknownRhsIndices.size(),
lower.unknownRhsIndices.data(), nullptr,
&events[events.size() - 1]);
}
if(!upper.nzIndices.empty()){
if (!upper.nzIndices.empty()) {
events.resize(events.size() + 3);
err |= queue->enqueueWriteBuffer(d_upper.nzIndices, CL_FALSE, 0, sizeof(int) * upper.nzIndices.size(), upper.nzIndices.data(), nullptr, &events[events.size() - 3]);
err |= queue->enqueueWriteBuffer(d_upper.knownRhsIndices, CL_FALSE, 0, sizeof(int) * upper.knownRhsIndices.size(), upper.knownRhsIndices.data(), nullptr, &events[events.size() - 2]);
err |= queue->enqueueWriteBuffer(d_upper.unknownRhsIndices, CL_FALSE, 0, sizeof(int) * upper.unknownRhsIndices.size(), upper.unknownRhsIndices.data(), nullptr, &events[events.size() - 1]);
err |= queue->enqueueWriteBuffer(d_upper.nzIndices, CL_FALSE,
0, sizeof(int) * upper.nzIndices.size(),
upper.nzIndices.data(), nullptr,
&events[events.size() - 3]);
err |= queue->enqueueWriteBuffer(d_upper.knownRhsIndices, CL_FALSE, 0,
sizeof(int) * upper.knownRhsIndices.size(),
upper.knownRhsIndices.data(), nullptr,
&events[events.size() - 2]);
err |= queue->enqueueWriteBuffer(d_upper.unknownRhsIndices, CL_FALSE, 0,
sizeof(int) * upper.unknownRhsIndices.size(),
upper.unknownRhsIndices.data(), nullptr,
&events[events.size() - 1]);
}
cl::WaitForEvents(events);
@ -255,16 +307,24 @@ bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
std::tie(d_LUvals, d_invDiagVals) = bilu0->get_preconditioner_data();
events.resize(2);
err = queue->enqueueFillBuffer(d_invLvals, 0, 0, sizeof(double) * nnzb * bs * bs, nullptr, &events[0]);
err |= queue->enqueueFillBuffer(d_invUvals, 0, 0, sizeof(double) * nnzb * bs * bs, nullptr, &events[1]);
err = queue->enqueueFillBuffer(d_invLvals, 0, 0,
sizeof(Scalar) * nnzb * bs * bs, nullptr, &events[0]);
err |= queue->enqueueFillBuffer(d_invUvals, 0, 0,
sizeof(Scalar) * nnzb * bs * bs, nullptr, &events[1]);
cl::WaitForEvents(events);
events.clear();
OpenclKernels::isaiL(d_diagIndex, d_colPointers, d_csrToCscOffsetMap, d_lower.subsystemPointers, d_lower.nzIndices, d_lower.unknownRhsIndices, d_lower.knownRhsIndices, d_LUvals, d_invLvals, Nb);
OpenclKernels::isaiU(d_diagIndex, d_colPointers, d_rowIndices, d_csrToCscOffsetMap, d_upper.subsystemPointers, d_upper.nzIndices, d_upper.unknownRhsIndices, d_upper.knownRhsIndices, d_LUvals,
OpenclKernels<Scalar>::isaiL(d_diagIndex, d_colPointers, d_csrToCscOffsetMap,
d_lower.subsystemPointers, d_lower.nzIndices,
d_lower.unknownRhsIndices, d_lower.knownRhsIndices,
d_LUvals, d_invLvals, Nb);
OpenclKernels<Scalar>::isaiU(d_diagIndex, d_colPointers, d_rowIndices,
d_csrToCscOffsetMap, d_upper.subsystemPointers,
d_upper.nzIndices, d_upper.unknownRhsIndices,
d_upper.knownRhsIndices, d_LUvals,
d_invDiagVals, d_invUvals, Nb);
if(verbosity >= 4){
if (verbosity >= 4) {
std::ostringstream out;
out << "BISAI createPreconditioner time: " << t_preconditioner.stop() << " s";
OpmLog::info(out.str());
@ -273,33 +333,34 @@ bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
return true;
}
template <unsigned int block_size>
bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat)
template<class Scalar, unsigned int block_size>
bool BISAI<Scalar,block_size>::
create_preconditioner(BlockedMatrix<Scalar>* mat)
{
return create_preconditioner(mat, nullptr);
}
template <unsigned int block_size>
void BISAI<block_size>::apply(const cl::Buffer& x, cl::Buffer& y){
template<class Scalar, unsigned int block_size>
void BISAI<Scalar,block_size>::apply(const cl::Buffer& x, cl::Buffer& y)
{
const unsigned int bs = block_size;
OpenclKernels::spmv(d_invLvals, d_rowIndices, d_colPointers, x, d_invL_x, Nb, bs, true, true); // application of isaiL is a simple spmv with addition
// (to compensate for the unitary diagonal that is not
// included in isaiL, for simplicity)
OpenclKernels::spmv(d_invUvals, d_rowIndices, d_colPointers, d_invL_x, y, Nb, bs); // application of isaiU is a simple spmv
OpenclKernels<Scalar>::spmv(d_invLvals, d_rowIndices, d_colPointers,
x, d_invL_x, Nb, bs, true, true); // application of isaiL is a simple spmv with addition
// (to compensate for the unitary diagonal that is not
// included in isaiL, for simplicity)
OpenclKernels<Scalar>::spmv(d_invUvals, d_rowIndices, d_colPointers,
d_invL_x, y, Nb, bs); // application of isaiU is a simple spmv
}
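// Sketch (illustrative, not part of this patch): in exact arithmetic the two spmv calls above
// apply y = M_U * (x + M_L * x), where M_L is assumed to hold the strictly lower part of the
// approximate inverse of L (its unit diagonal is implied, hence the "spmv with addition") and
// M_U the approximate inverse of U including its inverted diagonal. A dense restatement:
#include <vector>

std::vector<double> isaiApply(const std::vector<std::vector<double>>& ML, // strictly lower, no diagonal
                              const std::vector<std::vector<double>>& MU, // upper incl. diagonal
                              const std::vector<double>& x)
{
    const std::size_t n = x.size();
    std::vector<double> z(x), y(n, 0.0);
    for (std::size_t i = 0; i < n; ++i)          // z = x + ML * x
        for (std::size_t j = 0; j < n; ++j)
            z[i] += ML[i][j] * x[j];
    for (std::size_t i = 0; i < n; ++i)          // y = MU * z
        for (std::size_t j = 0; j < n; ++j)
            y[i] += MU[i][j] * z[j];
    return y;
}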
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template class BISAI<n>;
#define INSTANCE_TYPE(T) \
template class BISAI<T,1>; \
template class BISAI<T,2>; \
template class BISAI<T,3>; \
template class BISAI<T,4>; \
template class BISAI<T,5>; \
template class BISAI<T,6>;
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
INSTANCE_TYPE(double)
#undef INSTANTIATE_BDA_FUNCTIONS
}
}
} // namespace Opm::Accelerator

View File

@ -26,19 +26,16 @@
#include <opm/simulators/linalg/bda/opencl/BILU0.hpp>
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
class BlockedMatrix;
template<class Scalar> class BlockedMatrix;
/// This class implements a Blocked version of the Incomplete Sparse Approximate Inverse (ISAI) preconditioner.
/// Inspired by the paper "Incomplete Sparse Approximate Inverses for Parallel Preconditioning" by Anzt et. al.
template <unsigned int block_size>
class BISAI : public Preconditioner<block_size>
template<class Scalar, unsigned int block_size>
class BISAI : public Preconditioner<Scalar,block_size>
{
typedef Preconditioner<block_size> Base;
using Base = Preconditioner<Scalar,block_size>;
using Base::N;
using Base::Nb;
@ -57,8 +54,8 @@ private:
std::vector<int> rowIndices;
std::vector<int> diagIndex;
std::vector<int> csrToCscOffsetMap;
std::vector<double> invLvals;
std::vector<double> invUvals;
std::vector<Scalar> invLvals;
std::vector<Scalar> invUvals;
cl::Buffer d_colPointers;
cl::Buffer d_rowIndices;
@ -71,10 +68,10 @@ private:
cl::Buffer d_invL_x;
bool opencl_ilu_parallel;
std::unique_ptr<BILU0<block_size> > bilu0;
std::unique_ptr<BILU0<Scalar,block_size>> bilu0;
/// Struct that holds the structure of the small subsystems for each column
typedef struct{
struct subsystemStructure {
/// This vector holds the cumulative sum for the number of non-zero blocks for each subsystem.
/// Works similarly to row and column pointers for the CSR and CSC matrix representations.
std::vector<int> subsystemPointers;
@ -88,15 +85,15 @@ private:
std::vector<int> knownRhsIndices;
/// This vector holds the indices of the unknown values of the right hand sides of the subsystems.
std::vector<int> unknownRhsIndices;
} subsystemStructure;
};
/// GPU version of subsystemStructure
typedef struct{
struct subsystemStructureGPU {
cl::Buffer subsystemPointers;
cl::Buffer nzIndices;
cl::Buffer knownRhsIndices;
cl::Buffer unknownRhsIndices;
} subsystemStructureGPU;
};
subsystemStructure lower, upper;
subsystemStructureGPU d_lower, d_upper;
@ -113,15 +110,18 @@ public:
BISAI(bool opencl_ilu_parallel, int verbosity);
// set own Opencl variables, but also that of the bilu0 preconditioner
void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue) override;
void setOpencl(std::shared_ptr<cl::Context>& context,
std::shared_ptr<cl::CommandQueue>& queue) override;
// analysis, extract parallelism
bool analyze_matrix(BlockedMatrix *mat) override;
bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
bool analyze_matrix(BlockedMatrix<Scalar>* mat) override;
bool analyze_matrix(BlockedMatrix<Scalar>* mat,
BlockedMatrix<Scalar>* jacMat) override;
// ilu_decomposition
bool create_preconditioner(BlockedMatrix *mat) override;
bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
bool create_preconditioner(BlockedMatrix<Scalar>* mat) override;
bool create_preconditioner(BlockedMatrix<Scalar>* mat,
BlockedMatrix<Scalar>* jacMat) override;
// apply preconditioner, x = prec(y)
void apply(const cl::Buffer& y, cl::Buffer& x) override;
@ -132,7 +132,6 @@ public:
/// in the csrToCscOffsetMap[i]-th position in the CSC representation.
std::vector<int> buildCsrToCscOffsetMap(std::vector<int> colPointers, std::vector<int> rowIndices);
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
#endif

View File

@ -34,37 +34,32 @@
#include <opm/simulators/linalg/bda/opencl/OpenclMatrix.hpp>
#include <opm/simulators/linalg/bda/opencl/openclKernels.hpp>
namespace Opm::Accelerator {
namespace Opm
{
namespace Accelerator
{
using Opm::OpmLog;
using Dune::Timer;
template <unsigned int block_size>
CPR<block_size>::CPR(bool opencl_ilu_parallel_, int verbosity_) :
Preconditioner<block_size>(verbosity_), opencl_ilu_parallel(opencl_ilu_parallel_)
template<class Scalar, unsigned int block_size>
CPR<Scalar,block_size>::CPR(bool opencl_ilu_parallel_, int verbosity_)
: Base(verbosity_)
, opencl_ilu_parallel(opencl_ilu_parallel_)
{
bilu0 = std::make_unique<BILU0<block_size> >(opencl_ilu_parallel, verbosity_);
bilu0 = std::make_unique<BILU0<Scalar,block_size> >(opencl_ilu_parallel, verbosity_);
diagIndices.resize(1);
}
template <unsigned int block_size>
void CPR<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
template<class Scalar, unsigned int block_size>
void CPR<Scalar,block_size>::
setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_)
{
context = context_;
queue = queue_;
bilu0->setOpencl(context, queue);
}
template <unsigned int block_size>
bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_) {
template<class Scalar, unsigned int block_size>
bool CPR<Scalar,block_size>::analyze_matrix(BlockedMatrix<Scalar>* mat_)
{
this->Nb = mat_->Nb;
this->nnzb = mat_->nnzbs;
this->N = Nb * block_size;
@ -75,8 +70,10 @@ bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_) {
return success;
}
template <unsigned int block_size>
bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_, BlockedMatrix *jacMat) {
template<class Scalar, unsigned int block_size>
bool CPR<Scalar,block_size>::
analyze_matrix(BlockedMatrix<Scalar>* mat_, BlockedMatrix<Scalar>* jacMat)
{
this->Nb = mat_->Nb;
this->nnzb = mat_->nnzbs;
this->N = Nb * block_size;
@ -88,8 +85,10 @@ bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_, BlockedMatrix *jacMat)
return success;
}
template <unsigned int block_size>
bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_, BlockedMatrix *jacMat) {
template<class Scalar, unsigned int block_size>
bool CPR<Scalar,block_size>::
create_preconditioner(BlockedMatrix<Scalar>* mat_, BlockedMatrix<Scalar>* jacMat)
{
Dune::Timer t_bilu0;
bool result = bilu0->create_preconditioner(mat_, jacMat);
if (verbosity >= 3) {
@ -108,8 +107,10 @@ bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_, BlockedMatrix *
return result;
}
template <unsigned int block_size>
bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_) {
template<class Scalar, unsigned int block_size>
bool CPR<Scalar,block_size>::
create_preconditioner(BlockedMatrix<Scalar>* mat_)
{
Dune::Timer t_bilu0;
bool result = bilu0->create_preconditioner(mat_);
if (verbosity >= 3) {
@ -128,26 +129,30 @@ bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_) {
return result;
}
// return the absolute value of the N elements for which the absolute value is highest
double get_absmax(const double *data, const int N) {
return std::abs(*std::max_element(data, data + N, [](double a, double b){return std::fabs(a) < std::fabs(b);}));
template<class Scalar>
Scalar get_absmax(const Scalar* data, const int N)
{
return std::abs(*std::max_element(data, data + N,
[](Scalar a, Scalar b)
{ return std::fabs(a) < std::fabs(b); }));
}
// solve A^T * x = b
void solve_transposed_3x3(const double *A, const double *b, double *x) {
template<class Scalar>
void solve_transposed_3x3(const Scalar* A, const Scalar* b, Scalar* x)
{
const int B = 3;
// from dune-common/densematrix.hh, but transposed, so replace [r*B+c] with [r+c*B]
double t4 = A[0+0*B] * A[1+1*B];
double t6 = A[0+0*B] * A[1+2*B];
double t8 = A[0+1*B] * A[1+0*B];
double t10 = A[0+2*B] * A[1+0*B];
double t12 = A[0+1*B] * A[2+0*B];
double t14 = A[0+2*B] * A[2+0*B];
Scalar t4 = A[0+0*B] * A[1+1*B];
Scalar t6 = A[0+0*B] * A[1+2*B];
Scalar t8 = A[0+1*B] * A[1+0*B];
Scalar t10 = A[0+2*B] * A[1+0*B];
Scalar t12 = A[0+1*B] * A[2+0*B];
Scalar t14 = A[0+2*B] * A[2+0*B];
double d = (t4*A[2+2*B]-t6*A[2+1*B]-t8*A[2+2*B]+
t10*A[2+1*B]+t12*A[1+2*B]-t14*A[1+1*B]); //determinant
Scalar d = (t4*A[2+2*B]-t6*A[2+1*B]-t8*A[2+2*B]+
t10*A[2+1*B]+t12*A[1+2*B]-t14*A[1+1*B]); // determinant
x[0] = (b[0]*A[1+1*B]*A[2+2*B] - b[0]*A[2+1*B]*A[1+2*B]
- b[1] *A[0+1*B]*A[2+2*B] + b[1]*A[2+1*B]*A[0+2*B]
@ -162,44 +167,49 @@ void solve_transposed_3x3(const double *A, const double *b, double *x) {
+ A[2+0*B] *A[0+1*B]*b[1] - A[2+0*B]*A[1+1*B]*b[0]) / d;
}
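// Sketch (illustrative, not part of this patch): a quick sanity check of the routine above.
// With A = I (row-major, B = 3) the transposed solve must return b unchanged; for a general A
// one can verify A^T * x == b directly, as below. The helper name is hypothetical.
#include <cassert>
#include <cmath>

void check_solve_transposed_3x3()
{
    const double A[9] = {2, 0, 0,
                         1, 3, 0,
                         0, 1, 4};   // row-major 3x3
    const double b[3] = {1, 2, 3};
    double x[3];
    solve_transposed_3x3(A, b, x);   // solves A^T x = b
    for (int r = 0; r < 3; ++r) {
        double s = 0.0;
        for (int c = 0; c < 3; ++c)
            s += A[c * 3 + r] * x[c];   // (A^T)[r][c] == A[c][r]
        assert(std::fabs(s - b[r]) < 1e-12);
    }
}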
template <unsigned int block_size>
void CPR<block_size>::init_opencl_buffers() {
template<class Scalar, unsigned int block_size>
void CPR<Scalar, block_size>::init_opencl_buffers()
{
d_Amatrices.reserve(num_levels);
d_Rmatrices.reserve(num_levels - 1);
d_invDiags.reserve(num_levels - 1);
for (Matrix& m : Amatrices) {
for (Matrix<Scalar>& m : Amatrices) {
d_Amatrices.emplace_back(context.get(), m.N, m.M, m.nnzs, 1);
}
for (Matrix& m : Rmatrices) {
for (Matrix<Scalar>& m : Rmatrices) {
d_Rmatrices.emplace_back(context.get(), m.N, m.M, m.nnzs, 1);
d_f.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.N);
d_u.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.N);
d_f.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.N);
d_u.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.N);
d_PcolIndices.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(int) * m.M);
d_invDiags.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.M); // create a cl::Buffer
d_t.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.M);
d_invDiags.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.M); // create a cl::Buffer
d_t.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.M);
}
d_weights = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_rs = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_mat = std::make_unique<OpenclMatrix>(context.get(), Nb, Nb, nnzb, block_size);
d_coarse_y = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * Nb);
d_coarse_x = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * Nb);
d_weights = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_rs = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_mat = std::make_unique<OpenclMatrix<Scalar>>(context.get(), Nb, Nb, nnzb, block_size);
d_coarse_y = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * Nb);
d_coarse_x = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * Nb);
}
template <unsigned int block_size>
void CPR<block_size>::opencl_upload() {
template<class Scalar, unsigned int block_size>
void CPR<Scalar,block_size>::opencl_upload()
{
d_mat->upload(queue.get(), mat);
err = CL_SUCCESS;
events.resize(2 * Rmatrices.size() + 1);
err |= queue->enqueueWriteBuffer(*d_weights, CL_FALSE, 0, sizeof(double) * N, weights.data(), nullptr, &events[0]);
err |= queue->enqueueWriteBuffer(*d_weights, CL_FALSE, 0,
sizeof(Scalar) * N, weights.data(), nullptr, &events[0]);
for (unsigned int i = 0; i < Rmatrices.size(); ++i) {
d_Amatrices[i].upload(queue.get(), &Amatrices[i]);
err |= queue->enqueueWriteBuffer(d_invDiags[i], CL_FALSE, 0, sizeof(double) * Amatrices[i].N, invDiags[i].data(), nullptr, &events[2*i+1]);
err |= queue->enqueueWriteBuffer(d_PcolIndices[i], CL_FALSE, 0, sizeof(int) * Amatrices[i].N, PcolIndices[i].data(), nullptr, &events[2*i+2]);
err |= queue->enqueueWriteBuffer(d_invDiags[i], CL_FALSE, 0,
sizeof(Scalar) * Amatrices[i].N, invDiags[i].data(),
nullptr, &events[2*i+1]);
err |= queue->enqueueWriteBuffer(d_PcolIndices[i], CL_FALSE, 0,
sizeof(int) * Amatrices[i].N, PcolIndices[i].data(),
nullptr, &events[2*i+2]);
}
cl::WaitForEvents(events);
events.clear();
@ -212,9 +222,10 @@ void CPR<block_size>::opencl_upload() {
}
}
template <unsigned int block_size>
void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
template<class Scalar, unsigned int block_size>
void CPR<Scalar,block_size>::
create_preconditioner_amg(BlockedMatrix<Scalar>* mat_)
{
this->mat = mat_;
coarse_vals.resize(nnzb);
@ -222,8 +233,8 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
coarse_y.resize(Nb);
weights.resize(N);
try{
double rhs[] = {0, 0, 0};
try {
Scalar rhs[] = {0, 0, 0};
rhs[pressure_idx] = 1;
// find diagonal index for each row
@ -241,12 +252,12 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
// calculate weights for each row
for (int row = 0; row < Nb; ++row) {
// solve to find weights
double *row_weights = weights.data() + block_size * row; // weights for this row
Scalar* row_weights = weights.data() + block_size * row; // weights for this row
solve_transposed_3x3(mat->nnzValues + block_size * block_size * diagIndices[0][row], rhs, row_weights);
// normalize weights for this row
double abs_max = get_absmax(row_weights, block_size);
for(unsigned int i = 0; i < block_size; i++){
Scalar abs_max = get_absmax(row_weights, block_size);
for (unsigned int i = 0; i < block_size; i++) {
row_weights[i] /= abs_max;
}
}
@ -257,9 +268,9 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
int start = mat->rowPointers[row];
int end = mat->rowPointers[row + 1];
for (int idx = start; idx < end; ++idx) {
double *block = mat->nnzValues + idx * block_size * block_size;
double *row_weights = weights.data() + block_size * row;
double value = 0.0;
Scalar* block = mat->nnzValues + idx * block_size * block_size;
Scalar* row_weights = weights.data() + block_size * row;
Scalar value = 0.0;
for (unsigned int i = 0; i < block_size; ++i) {
value += block[block_size * i + pressure_idx] * row_weights[i];
}
@ -276,10 +287,10 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
if (recalculate_aggregates) {
dune_coarse = std::make_unique<DuneMat>(Nb, Nb, nnzb, DuneMat::row_wise);
typedef DuneMat::CreateIterator Iter;
using Iter = typename DuneMat::CreateIterator;
// setup sparsity pattern
for(Iter row = dune_coarse->createbegin(); row != dune_coarse->createend(); ++row){
for (Iter row = dune_coarse->createbegin(); row != dune_coarse->createend(); ++row) {
int start = mat->rowPointers[row.index()];
int end = mat->rowPointers[row.index() + 1];
for (int idx = start; idx < end; ++idx) {
@ -302,7 +313,7 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
Dune::Amg::SequentialInformation seqinfo;
dune_amg = std::make_unique<DuneAmg>(dune_op, Dune::stackobject_to_shared_ptr(seqinfo));
Opm::PropertyTree property_tree;
PropertyTree property_tree;
property_tree.put("alpha", 0.333333333333);
// The matrix has a symmetric sparsity pattern, but the values are not symmetric
@ -315,7 +326,7 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
num_pre_smooth_steps = c.getNoPreSmoothSteps();
num_post_smooth_steps = c.getNoPostSmoothSteps();
dune_amg->build<OverlapFlags>(c);
dune_amg->template build<OverlapFlags>(c);
analyzeHierarchy();
analyzeAggregateMaps();
@ -351,10 +362,10 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
}
}
template <unsigned int block_size>
void CPR<block_size>::analyzeHierarchy() {
const DuneAmg::ParallelMatrixHierarchy& matrixHierarchy = dune_amg->matrices();
template<class Scalar, unsigned int block_size>
void CPR<Scalar,block_size>::analyzeHierarchy()
{
const typename DuneAmg::ParallelMatrixHierarchy& matrixHierarchy = dune_amg->matrices();
// store coarsest AMG level in umfpack format, also performs LU decomposition
umfpack.setMatrix((*matrixHierarchy.coarsest()).getmat());
@ -372,8 +383,8 @@ void CPR<block_size>::analyzeHierarchy() {
// matrixIter.dereference() returns MatrixAdapter
// matrixIter.dereference().getmat() returns BCRSMat
DuneAmg::ParallelMatrixHierarchy::ConstIterator matrixIter = matrixHierarchy.finest();
for(int level = 0; level < num_levels; ++matrixIter, ++level) {
typename DuneAmg::ParallelMatrixHierarchy::ConstIterator matrixIter = matrixHierarchy.finest();
for (int level = 0; level < num_levels; ++matrixIter, ++level) {
const auto& A = matrixIter.dereference().getmat();
level_sizes[level] = A.N();
diagIndices[level].reserve(A.N());
@ -395,38 +406,38 @@ void CPR<block_size>::analyzeHierarchy() {
}
}
Opm::BdaBridge<DuneMat, DuneVec, 1>::copySparsityPatternFromISTL(A, Amatrices.back().rowPointers, Amatrices.back().colIndices);
BdaBridge<DuneMat, DuneVec, 1>::copySparsityPatternFromISTL(A, Amatrices.back().rowPointers,
Amatrices.back().colIndices);
// compute inverse diagonal values for current level
invDiags.emplace_back(A.N());
for (unsigned int row = 0; row < A.N(); ++row) {
invDiags.back()[row] = 1 / Amatrices.back().nnzValues[diagIndices[level][row]];
invDiags.back()[row] = 1.0 / Amatrices.back().nnzValues[diagIndices[level][row]];
}
}
}
template <unsigned int block_size>
void CPR<block_size>::analyzeAggregateMaps() {
template<class Scalar, unsigned int block_size>
void CPR<Scalar,block_size>::analyzeAggregateMaps()
{
PcolIndices.resize(num_levels - 1);
Rmatrices.clear();
const DuneAmg::AggregatesMapList& aggregatesMaps = dune_amg->aggregatesMaps();
const typename DuneAmg::AggregatesMapList& aggregatesMaps = dune_amg->aggregatesMaps();
DuneAmg::AggregatesMapList::const_iterator mapIter = aggregatesMaps.begin();
for(int level = 0; level < num_levels - 1; ++mapIter, ++level) {
DuneAmg::AggregatesMap *map = *mapIter;
typename DuneAmg::AggregatesMapList::const_iterator mapIter = aggregatesMaps.begin();
for (int level = 0; level < num_levels - 1; ++mapIter, ++level) {
typename DuneAmg::AggregatesMap* map = *mapIter;
Rmatrices.emplace_back(level_sizes[level+1], level_sizes[level], level_sizes[level]);
std::fill(Rmatrices.back().nnzValues.begin(), Rmatrices.back().nnzValues.end(), 1.0);
// get indices for each row of P and R
std::vector<std::vector<unsigned> > indicesR(level_sizes[level+1]);
std::vector<std::vector<unsigned>> indicesR(level_sizes[level+1]);
PcolIndices[level].resize(level_sizes[level]);
using AggregateIterator = DuneAmg::AggregatesMap::const_iterator;
for(AggregateIterator ai = map->begin(); ai != map->end(); ++ai){
using AggregateIterator = typename DuneAmg::AggregatesMap::const_iterator;
for (AggregateIterator ai = map->begin(); ai != map->end(); ++ai) {
if (*ai != DuneAmg::AggregatesMap::ISOLATED) {
const long int diff = ai - map->begin();
PcolIndices[level][diff] = *ai;
@ -446,19 +457,20 @@ void CPR<block_size>::analyzeAggregateMaps() {
}
}
template <unsigned int block_size>
void CPR<block_size>::amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &x) {
OpenclMatrix *A = &d_Amatrices[level];
OpenclMatrix *R = &d_Rmatrices[level];
template<class Scalar, unsigned int block_size>
void CPR<Scalar,block_size>::amg_cycle_gpu(const int level, cl::Buffer& y, cl::Buffer& x)
{
OpenclMatrix<Scalar>* A = &d_Amatrices[level];
OpenclMatrix<Scalar>* R = &d_Rmatrices[level];
int Ncur = A->Nb;
if (level == num_levels - 1) {
// solve coarsest level
std::vector<double> h_y(Ncur), h_x(Ncur, 0);
std::vector<Scalar> h_y(Ncur), h_x(Ncur, 0);
events.resize(1);
err = queue->enqueueReadBuffer(y, CL_FALSE, 0, sizeof(double) * Ncur, h_y.data(), nullptr, &events[0]);
err = queue->enqueueReadBuffer(y, CL_FALSE, 0,
sizeof(Scalar) * Ncur, h_y.data(), nullptr, &events[0]);
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
@ -470,7 +482,8 @@ void CPR<block_size>::amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &
umfpack.apply(h_x.data(), h_y.data());
events.resize(1);
err = queue->enqueueWriteBuffer(x, CL_FALSE, 0, sizeof(double) * Ncur, h_x.data(), nullptr, &events[0]);
err = queue->enqueueWriteBuffer(x, CL_FALSE, 0,
sizeof(Scalar) * Ncur, h_x.data(), nullptr, &events[0]);
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
@ -486,34 +499,37 @@ void CPR<block_size>::amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &
cl::Buffer& u = d_u[level]; // u was 0-initialized earlier
// presmooth
double jacobi_damping = 0.65; // default value in amgcl: 0.72
for (unsigned i = 0; i < num_pre_smooth_steps; ++i){
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
Scalar jacobi_damping = 0.65; // default value in amgcl: 0.72
for (unsigned i = 0; i < num_pre_smooth_steps; ++i) {
OpenclKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
OpenclKernels<Scalar>::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
}
// move to coarser level
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
OpenclKernels::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true);
OpenclKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
OpenclKernels<Scalar>::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true);
amg_cycle_gpu(level + 1, f, u);
OpenclKernels::prolongate_vector(u, x, d_PcolIndices[level], Ncur);
OpenclKernels<Scalar>::prolongate_vector(u, x, d_PcolIndices[level], Ncur);
// postsmooth
for (unsigned i = 0; i < num_post_smooth_steps; ++i){
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
for (unsigned i = 0; i < num_post_smooth_steps; ++i) {
OpenclKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers,
x, y, t, Ncur, 1);
OpenclKernels<Scalar>::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
}
}
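
As a reading aid for amg_cycle_gpu above, here is a minimal CPU-only sketch of the same V-cycle structure (damped-Jacobi pre-smoothing, restriction of the residual, recursive coarse solve, prolongation of the correction, damped-Jacobi post-smoothing). The Level struct and helper names are hypothetical stand-ins for the per-level OpenclMatrix data and kernels; the coarsest-level direct (UMFPACK) solve is replaced by a single diagonal sweep, and piecewise-constant aggregation is assumed for R and P, in line with the Rmatrices/PcolIndices set up in create_preconditioner_amg and analyzeAggregateMaps.

#include <vector>

// Hypothetical scalar CSR level, mirroring the data amg_cycle_gpu reads per level.
struct Level {
    std::vector<double> Avals;        // CSR values of A
    std::vector<int> Acols, Arows;    // CSR pattern of A
    std::vector<double> invDiag;      // 1/diag(A), as in invDiags
    std::vector<int> Pcol;            // aggregate index per fine row, as in PcolIndices
    int N = 0;                        // rows on this level
    int Ncoarse = 0;                  // rows on the next coarser level
};

// t = y - A*x
static void csr_residual(const Level& L, const std::vector<double>& x,
                         const std::vector<double>& y, std::vector<double>& t)
{
    for (int row = 0; row < L.N; ++row) {
        double sum = 0.0;
        for (int idx = L.Arows[row]; idx < L.Arows[row + 1]; ++idx) {
            sum += L.Avals[idx] * x[L.Acols[idx]];
        }
        t[row] = y[row] - sum;
    }
}

static void v_cycle(const std::vector<Level>& levels, int level,
                    const std::vector<double>& y, std::vector<double>& x,
                    unsigned pre_steps, unsigned post_steps, double damping = 0.65)
{
    const Level& L = levels[level];
    if (level + 1 == static_cast<int>(levels.size())) {
        // coarsest level: stand-in for the direct solve
        for (int i = 0; i < L.N; ++i) { x[i] = L.invDiag[i] * y[i]; }
        return;
    }
    std::vector<double> t(L.N), f(L.Ncoarse, 0.0), u(L.Ncoarse, 0.0);
    for (unsigned s = 0; s < pre_steps; ++s) {      // damped Jacobi pre-smoothing
        csr_residual(L, x, y, t);
        for (int i = 0; i < L.N; ++i) { x[i] += damping * L.invDiag[i] * t[i]; }
    }
    csr_residual(L, x, y, t);                       // restrict residual: f = R*t
    for (int i = 0; i < L.N; ++i) { f[L.Pcol[i]] += t[i]; }
    v_cycle(levels, level + 1, f, u, pre_steps, post_steps, damping);
    for (int i = 0; i < L.N; ++i) { x[i] += u[L.Pcol[i]]; }   // prolongate correction
    for (unsigned s = 0; s < post_steps; ++s) {     // damped Jacobi post-smoothing
        csr_residual(L, x, y, t);
        for (int i = 0; i < L.N; ++i) { x[i] += damping * L.invDiag[i] * t[i]; }
    }
}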
// x = prec(y)
template <unsigned int block_size>
void CPR<block_size>::apply_amg(const cl::Buffer& y, cl::Buffer& x) {
template<class Scalar, unsigned int block_size>
void CPR<Scalar,block_size>::apply_amg(const cl::Buffer& y, cl::Buffer& x)
{
// 0-initialize u and x vectors
events.resize(d_u.size() + 1);
err = queue->enqueueFillBuffer(*d_coarse_x, 0, 0, sizeof(double) * Nb, nullptr, &events[0]);
err = queue->enqueueFillBuffer(*d_coarse_x, 0, 0,
sizeof(Scalar) * Nb, nullptr, &events[0]);
for (unsigned int i = 0; i < d_u.size(); ++i) {
err |= queue->enqueueFillBuffer(d_u[i], 0, 0, sizeof(double) * Rmatrices[i].N, nullptr, &events[i + 1]);
err |= queue->enqueueFillBuffer(d_u[i], 0, 0,
sizeof(Scalar) * Rmatrices[i].N, nullptr, &events[i + 1]);
}
cl::WaitForEvents(events);
events.clear();
@ -522,16 +538,18 @@ void CPR<block_size>::apply_amg(const cl::Buffer& y, cl::Buffer& x) {
OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
}
OpenclKernels::residual(d_mat->nnzValues, d_mat->colIndices, d_mat->rowPointers, x, y, *d_rs, Nb, block_size);
OpenclKernels::full_to_pressure_restriction(*d_rs, *d_weights, *d_coarse_y, Nb);
OpenclKernels<Scalar>::residual(d_mat->nnzValues, d_mat->colIndices,
d_mat->rowPointers, x, y, *d_rs, Nb, block_size);
OpenclKernels<Scalar>::full_to_pressure_restriction(*d_rs, *d_weights, *d_coarse_y, Nb);
amg_cycle_gpu(0, *d_coarse_y, *d_coarse_x);
OpenclKernels::add_coarse_pressure_correction(*d_coarse_x, x, pressure_idx, Nb);
OpenclKernels<Scalar>::add_coarse_pressure_correction(*d_coarse_x, x, pressure_idx, Nb);
}
template <unsigned int block_size>
void CPR<block_size>::apply(const cl::Buffer& y, cl::Buffer& x) {
template<class Scalar, unsigned int block_size>
void CPR<Scalar,block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
{
Dune::Timer t_bilu0;
bilu0->apply(y, x);
if (verbosity >= 4) {
@ -549,20 +567,14 @@ void CPR<block_size>::apply(const cl::Buffer& y, cl::Buffer& x) {
}
}
#define INSTANCE_TYPE(T) \
template class CPR<T,1>; \
template class CPR<T,2>; \
template class CPR<T,3>; \
template class CPR<T,4>; \
template class CPR<T,5>; \
template class CPR<T,6>;
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template class CPR<n>;
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace Accelerator
} // namespace Opm
INSTANCE_TYPE(double)
} // namespace Opm::Accelerator
View File
@ -33,18 +33,15 @@
#include <opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
class BlockedMatrix;
template<class Scalar> class BlockedMatrix;
/// This class implements a Constrained Pressure Residual (CPR) preconditioner
template <unsigned int block_size>
class CPR : public Preconditioner<block_size>
template<class Scalar, unsigned int block_size>
class CPR : public Preconditioner<Scalar,block_size>
{
typedef Preconditioner<block_size> Base;
using Base = Preconditioner<Scalar,block_size>;
using Base::N;
using Base::Nb;
@ -58,25 +55,25 @@ class CPR : public Preconditioner<block_size>
private:
int num_levels;
std::vector<double> weights, coarse_vals, coarse_x, coarse_y;
std::vector<Matrix> Amatrices, Rmatrices; // scalar matrices that represent the AMG hierarchy
std::vector<OpenclMatrix> d_Amatrices, d_Rmatrices; // scalar matrices that represent the AMG hierarchy
std::vector<Scalar> weights, coarse_vals, coarse_x, coarse_y;
std::vector<Matrix<Scalar>> Amatrices, Rmatrices; // scalar matrices that represent the AMG hierarchy
std::vector<OpenclMatrix<Scalar>> d_Amatrices, d_Rmatrices; // scalar matrices that represent the AMG hierarchy
std::vector<std::vector<int> > PcolIndices; // prolongation does not need a full matrix, only store colIndices
std::vector<cl::Buffer> d_PcolIndices;
std::vector<std::vector<double> > invDiags; // inverse of diagonal of Amatrices
std::vector<std::vector<Scalar>> invDiags; // inverse of diagonal of Amatrices
std::vector<cl::Buffer> d_invDiags;
std::vector<cl::Buffer> d_t, d_f, d_u; // intermediate vectors used during amg cycle
std::unique_ptr<cl::Buffer> d_rs; // use before extracting the pressure
std::unique_ptr<cl::Buffer> d_weights; // the quasiimpes weights, used to extract pressure
std::unique_ptr<OpenclMatrix> d_mat; // stores blocked matrix
std::unique_ptr<OpenclMatrix<Scalar>> d_mat; // stores blocked matrix
std::unique_ptr<cl::Buffer> d_coarse_y, d_coarse_x; // stores the scalar vectors
std::once_flag opencl_buffers_allocated; // only allocate OpenCL Buffers once
std::unique_ptr<BILU0<block_size> > bilu0; // Blocked ILU0 preconditioner
BlockedMatrix *mat = nullptr; // input matrix, blocked
std::unique_ptr<BILU0<Scalar,block_size>> bilu0; // Blocked ILU0 preconditioner
BlockedMatrix<Scalar>* mat = nullptr; // input matrix, blocked
using DuneMat = Dune::BCRSMatrix<Dune::FieldMatrix<double, 1, 1> >;
using DuneVec = Dune::BlockVector<Dune::FieldVector<double, 1> >;
using DuneMat = Dune::BCRSMatrix<Dune::FieldMatrix<Scalar, 1, 1> >;
using DuneVec = Dune::BlockVector<Dune::FieldVector<Scalar, 1> >;
using MatrixOperator = Dune::MatrixAdapter<DuneMat, DuneVec, DuneVec>;
using DuneAmg = Dune::Amg::MatrixHierarchy<MatrixOperator, Dune::Amg::SequentialInformation>;
std::unique_ptr<DuneAmg> dune_amg;
@ -91,7 +88,7 @@ private:
unsigned num_pre_smooth_steps; // number of Jacobi smooth steps before restriction
unsigned num_post_smooth_steps; // number of Jacobi smooth steps after prolongation
std::unique_ptr<openclSolverBackend<1> > coarse_solver; // coarse solver is scalar
std::unique_ptr<openclSolverBackend<Scalar,1>> coarse_solver; // coarse solver is scalar
bool opencl_ilu_parallel; // whether ILU0 operation should be parallelized
// Analyze the AMG hierarchy build by Dune
@ -112,32 +109,35 @@ private:
void amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &x);
void create_preconditioner_amg(BlockedMatrix *mat);
void create_preconditioner_amg(BlockedMatrix<Scalar>* mat);
public:
CPR(bool opencl_ilu_parallel, int verbosity);
bool analyze_matrix(BlockedMatrix *mat) override;
bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
bool analyze_matrix(BlockedMatrix<Scalar>* mat) override;
bool analyze_matrix(BlockedMatrix<Scalar>* mat,
BlockedMatrix<Scalar>* jacMat) override;
// set own Opencl variables, but also that of the bilu0 preconditioner
void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue) override;
void setOpencl(std::shared_ptr<cl::Context>& context,
std::shared_ptr<cl::CommandQueue>& queue) override;
// applies blocked ilu0
// also applies amg for pressure component
void apply(const cl::Buffer& y, cl::Buffer& x) override;
bool create_preconditioner(BlockedMatrix *mat) override;
bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
bool create_preconditioner(BlockedMatrix<Scalar>* mat) override;
bool create_preconditioner(BlockedMatrix<Scalar>* mat,
BlockedMatrix<Scalar>* jacMat) override;
};
// solve A^T * x = b
// A should represent a 3x3 matrix
// x and b are vectors with 3 elements
void solve_transposed_3x3(const double *A, const double *b, double *x);
template<class Scalar>
void solve_transposed_3x3(const Scalar* A, const Scalar* b, Scalar* x);
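
For reference, one possible implementation of this helper is Cramer's rule applied to the transposed block; the sketch below is illustrative only and may differ from the implementation in the repository. In create_preconditioner_amg the right-hand side b is a unit vector in the pressure component, and the resulting row weights are afterwards normalized by their largest absolute entry.

// Solve A^T * x = b for a row-major 3x3 block A; assumes the block is non-singular.
template<class Scalar>
void solve_transposed_3x3_sketch(const Scalar* A, const Scalar* b, Scalar* x)
{
    // The rows of A^T are the columns of A.
    const Scalar t[9] = { A[0], A[3], A[6],
                          A[1], A[4], A[7],
                          A[2], A[5], A[8] };
    auto det3 = [](const Scalar* m) {
        return m[0] * (m[4] * m[8] - m[5] * m[7])
             - m[1] * (m[3] * m[8] - m[5] * m[6])
             + m[2] * (m[3] * m[7] - m[4] * m[6]);
    };
    const Scalar det = det3(t);
    for (int col = 0; col < 3; ++col) {
        Scalar m[9];
        for (int i = 0; i < 9; ++i) { m[i] = t[i]; }
        // replace column 'col' of A^T by b (Cramer's rule)
        m[col] = b[0]; m[3 + col] = b[1]; m[6 + col] = b[2];
        x[col] = det3(m) / det;
    }
}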
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
#endif
View File
@ -31,12 +31,19 @@ namespace Opm
namespace Accelerator
{
void OpenclMatrix::upload(cl::CommandQueue *queue, double *vals, int *cols, int *rows) {
template<class Scalar>
void OpenclMatrix<Scalar>::upload(cl::CommandQueue* queue,
Scalar* vals, int* cols, int* rows)
{
std::vector<cl::Event> events(3);
cl_int err = queue->enqueueWriteBuffer(nnzValues, CL_FALSE, 0, sizeof(double) * block_size * block_size * nnzbs, vals, nullptr, &events[0]);
err |= queue->enqueueWriteBuffer(colIndices, CL_FALSE, 0, sizeof(int) * nnzbs, cols, nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(rowPointers, CL_FALSE, 0, sizeof(int) * (Nb + 1), rows, nullptr, &events[2]);
cl_int err = queue->enqueueWriteBuffer(nnzValues, CL_FALSE, 0,
sizeof(Scalar) * block_size * block_size * nnzbs,
vals, nullptr, &events[0]);
err |= queue->enqueueWriteBuffer(colIndices, CL_FALSE, 0, sizeof(int) * nnzbs,
cols, nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(rowPointers, CL_FALSE, 0, sizeof(int) * (Nb + 1),
rows, nullptr, &events[2]);
cl::WaitForEvents(events);
events.clear();
@ -46,7 +53,9 @@ void OpenclMatrix::upload(cl::CommandQueue *queue, double *vals, int *cols, int
}
}
void OpenclMatrix::upload(cl::CommandQueue *queue, Matrix *matrix) {
template<class Scalar>
void OpenclMatrix<Scalar>::upload(cl::CommandQueue* queue, Matrix<Scalar>* matrix)
{
if (block_size != 1) {
OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to OpenclMatrix with different block_size");
}
@ -54,7 +63,9 @@ void OpenclMatrix::upload(cl::CommandQueue *queue, Matrix *matrix) {
upload(queue, matrix->nnzValues.data(), matrix->colIndices.data(), matrix->rowPointers.data());
}
void OpenclMatrix::upload(cl::CommandQueue *queue, BlockedMatrix *matrix) {
template<class Scalar>
void OpenclMatrix<Scalar>::upload(cl::CommandQueue* queue, BlockedMatrix<Scalar>* matrix)
{
if (matrix->block_size != block_size) {
OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to OpenclMatrix with different block_size");
}
@ -62,5 +73,7 @@ void OpenclMatrix::upload(cl::CommandQueue *queue, BlockedMatrix *matrix) {
upload(queue, matrix->nnzValues, matrix->colIndices, matrix->rowPointers);
}
template class OpenclMatrix<double>;
} // namespace Accelerator
} // namespace Opm
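
A short usage sketch for the now-templated class: build a small scalar CSR matrix on the host and copy it to the device. A valid cl::Context and cl::CommandQueue are assumed to already exist (as they do inside the solver backend); the names context and queue are placeholders and error handling is omitted.

// Sketch only: 'context' and 'queue' are assumed to be valid OpenCL objects.
// 2x2 scalar CSR matrix [[4, 1], [0, 3]] -> Nb = Mb = 2, nnzbs = 3, block_size = 1
std::vector<double> vals = {4.0, 1.0, 3.0};
std::vector<int>    cols = {0, 1, 1};
std::vector<int>    rows = {0, 2, 3};

Opm::Accelerator::OpenclMatrix<double> d_A(&context, 2, 2, 3, 1);
d_A.upload(&queue, vals.data(), cols.data(), rows.data());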
View File
@ -29,28 +29,30 @@ namespace Opm
namespace Accelerator
{
class Matrix;
class BlockedMatrix;
template<class Scalar> class Matrix;
template<class Scalar> class BlockedMatrix;
/// This struct resembles a csr matrix, only doubles are supported
/// The matrix data is stored in OpenCL Buffers
class OpenclMatrix {
template<class Scalar>
class OpenclMatrix
{
public:
OpenclMatrix(cl::Context *context, int Nb_, int Mb_, int nnzbs_, unsigned int block_size_)
: Nb(Nb_),
Mb(Mb_),
nnzbs(nnzbs_),
block_size(block_size_)
{
nnzValues = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * block_size * block_size * nnzbs);
nnzValues = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(Scalar) * block_size * block_size * nnzbs);
colIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzbs);
rowPointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
}
void upload(cl::CommandQueue *queue, double *vals, int *cols, int *rows);
void upload(cl::CommandQueue *queue, Matrix *matrix);
void upload(cl::CommandQueue *queue, BlockedMatrix *matrix);
void upload(cl::CommandQueue* queue, Scalar* vals, int* cols, int* rows);
void upload(cl::CommandQueue* queue, Matrix<Scalar>* matrix);
void upload(cl::CommandQueue* queue, BlockedMatrix<Scalar>* matrix);
cl::Buffer nnzValues;
cl::Buffer colIndices;
View File
@ -30,61 +30,58 @@
#include <memory>
#include <string>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
template <unsigned int block_size>
void Preconditioner<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
template<class Scalar, unsigned int block_size>
void Preconditioner<Scalar,block_size>::
setOpencl(std::shared_ptr<cl::Context>& context_,
std::shared_ptr<cl::CommandQueue>& queue_)
{
context = context_;
queue = queue_;
}
template <unsigned int block_size>
std::unique_ptr<Preconditioner<block_size>>
Preconditioner<block_size>::create(Type type, bool opencl_ilu_parallel, int verbosity)
template<class Scalar, unsigned int block_size>
std::unique_ptr<Preconditioner<Scalar,block_size>>
Preconditioner<Scalar,block_size>::create(Type type, bool opencl_ilu_parallel, int verbosity)
{
switch (type ) {
case Type::BILU0:
return std::make_unique<BILU0<block_size> >(opencl_ilu_parallel, verbosity);
return std::make_unique<BILU0<Scalar,block_size>>(opencl_ilu_parallel, verbosity);
case Type::CPR:
return std::make_unique<CPR<block_size> >(opencl_ilu_parallel, verbosity);
return std::make_unique<CPR<Scalar,block_size>>(opencl_ilu_parallel, verbosity);
case Type::BISAI:
return std::make_unique<BISAI<block_size> >(opencl_ilu_parallel, verbosity);
return std::make_unique<BISAI<Scalar,block_size>>(opencl_ilu_parallel, verbosity);
}
OPM_THROW(std::logic_error,
"Invalid preconditioner type " + std::to_string(static_cast<int>(type)));
}
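
A hedged usage sketch of this factory: the caller fixes the scalar type and block size as template arguments and selects the variant at run time, receiving an owning pointer to the chosen implementation. The argument values below are placeholders.

using Prec = Opm::Accelerator::Preconditioner<double, 3>;
auto prec = Prec::create(Prec::Type::CPR,
                         /*opencl_ilu_parallel=*/true,
                         /*verbosity=*/1);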
template <unsigned int block_size>
bool Preconditioner<block_size>::analyze_matrix(BlockedMatrix *mat, [[maybe_unused]] BlockedMatrix *jacMat) {
template<class Scalar, unsigned int block_size>
bool Preconditioner<Scalar,block_size>::
analyze_matrix(BlockedMatrix<Scalar>* mat,
[[maybe_unused]] BlockedMatrix<Scalar>* jacMat)
{
return analyze_matrix(mat);
}
template <unsigned int block_size>
bool Preconditioner<block_size>::create_preconditioner(BlockedMatrix *mat, [[maybe_unused]] BlockedMatrix *jacMat) {
template<class Scalar, unsigned int block_size>
bool Preconditioner<Scalar,block_size>::
create_preconditioner(BlockedMatrix<Scalar>* mat,
[[maybe_unused]] BlockedMatrix<Scalar>* jacMat)
{
return create_preconditioner(mat);
}
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template std::unique_ptr<Preconditioner<n> > Preconditioner<n>::create(Type, bool, int); \
template void Preconditioner<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&); \
template bool Preconditioner<n>::analyze_matrix(BlockedMatrix *, BlockedMatrix *); \
template bool Preconditioner<n>::create_preconditioner(BlockedMatrix *, BlockedMatrix *);
#define INSTANCE_TYPE(T) \
template class Preconditioner<T,1>; \
template class Preconditioner<T,2>; \
template class Preconditioner<T,3>; \
template class Preconditioner<T,4>; \
template class Preconditioner<T,5>; \
template class Preconditioner<T,6>;
INSTANCE_TYPE(double)
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} //namespace Accelerator
} //namespace Opm
} // namespace Opm::Accelerator
View File
@ -24,17 +24,13 @@
#include <memory>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
class BlockedMatrix;
template<class Scalar> class BlockedMatrix;
template <unsigned int block_size>
template<class Scalar, unsigned int block_size>
class Preconditioner
{
protected:
int N = 0; // number of rows of the matrix
int Nb = 0; // number of blockrows of the matrix
@ -65,7 +61,8 @@ public:
virtual ~Preconditioner() = default;
// nested Preconditioners might need to override this
virtual void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue);
virtual void setOpencl(std::shared_ptr<cl::Context>& context,
std::shared_ptr<cl::CommandQueue>& queue);
// apply preconditioner, x = prec(y)
virtual void apply(const cl::Buffer& y, cl::Buffer& x) = 0;
@ -73,16 +70,17 @@ public:
// analyze matrix, e.g. the sparsity pattern
// probably only called once
// the version with two params can be overloaded, if not, it will default to using the one param version
virtual bool analyze_matrix(BlockedMatrix *mat) = 0;
virtual bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat);
virtual bool analyze_matrix(BlockedMatrix<Scalar>* mat) = 0;
virtual bool analyze_matrix(BlockedMatrix<Scalar>* mat,
BlockedMatrix<Scalar>* jacMat);
// create/update preconditioner, probably used every linear solve
// the version with two params can be overloaded, if not, it will default to using the one param version
virtual bool create_preconditioner(BlockedMatrix *mat) = 0;
virtual bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat);
virtual bool create_preconditioner(BlockedMatrix<Scalar>* mat) = 0;
virtual bool create_preconditioner(BlockedMatrix<Scalar>* mat,
BlockedMatrix<Scalar>* jacMat);
};
} //namespace Accelerator
} //namespace Opm
} // namespace Opm::Accelerator
#endif
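
The comments above note that the two-parameter analyze_matrix/create_preconditioner fall back to the one-parameter versions unless a subclass overrides them. A self-contained sketch of that defaulting pattern, with hypothetical names (Mat, PrecBase, SimplePrec) rather than the OPM types:

#include <iostream>

struct Mat {};   // hypothetical stand-in for BlockedMatrix<Scalar>

struct PrecBase {
    virtual ~PrecBase() = default;
    virtual bool analyze_matrix(Mat* mat) = 0;
    virtual bool analyze_matrix(Mat* mat, Mat* /*jacMat*/)
    { return analyze_matrix(mat); }            // default: ignore jacMat
};

struct SimplePrec : PrecBase {
    bool analyze_matrix(Mat* mat) override { return mat != nullptr; }
    using PrecBase::analyze_matrix;            // keep the two-argument overload visible
};

int main()
{
    Mat A, J;
    SimplePrec p;
    std::cout << p.analyze_matrix(&A, &J) << '\n';   // prints 1, forwarded to the 1-arg version
}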
View File
@ -18,52 +18,71 @@
*/
#include <config.h>
#include <cmath>
#include <sstream>
#include <opm/simulators/linalg/bda/opencl/openclKernels.hpp>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <dune/common/timer.hh>
#include <opm/simulators/linalg/bda/opencl/openclKernels.hpp>
#include <opm/simulators/linalg/bda/opencl/ChowPatelIlu.hpp> // defines CHOW_PATEL
namespace Opm
{
namespace Accelerator
{
#include <cmath>
#include <sstream>
namespace Opm::Accelerator {
using Opm::OpmLog;
using Dune::Timer;
// define static variables and kernels
int OpenclKernels::verbosity;
cl::CommandQueue *OpenclKernels::queue;
std::vector<double> OpenclKernels::tmp;
bool OpenclKernels::initialized = false;
std::size_t OpenclKernels::preferred_workgroup_size_multiple = 0;
template<class Scalar> int OpenclKernels<Scalar>::verbosity;
template<class Scalar> cl::CommandQueue* OpenclKernels<Scalar>::queue;
template<class Scalar> std::vector<Scalar> OpenclKernels<Scalar>::tmp;
template<class Scalar> bool OpenclKernels<Scalar>::initialized = false;
template<class Scalar> std::size_t OpenclKernels<Scalar>::preferred_workgroup_size_multiple = 0;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::dot_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::norm_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > OpenclKernels::axpy_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > OpenclKernels::scale_k;
std::unique_ptr<cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels::vmul_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > OpenclKernels::custom_k;
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels::full_to_pressure_restriction_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > OpenclKernels::add_coarse_pressure_correction_k;
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > OpenclKernels::prolongate_vector_k;
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels::spmv_blocked_k;
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels::spmv_blocked_add_k;
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_k;
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_noreset_k;
std::unique_ptr<residual_blocked_kernel_type> OpenclKernels::residual_blocked_k;
std::unique_ptr<residual_kernel_type> OpenclKernels::residual_k;
std::unique_ptr<ilu_apply1_kernel_type> OpenclKernels::ILU_apply1_k;
std::unique_ptr<ilu_apply2_kernel_type> OpenclKernels::ILU_apply2_k;
std::unique_ptr<stdwell_apply_kernel_type> OpenclKernels::stdwell_apply_k;
std::unique_ptr<ilu_decomp_kernel_type> OpenclKernels::ilu_decomp_k;
std::unique_ptr<isaiL_kernel_type> OpenclKernels::isaiL_k;
std::unique_ptr<isaiU_kernel_type> OpenclKernels::isaiU_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels<Scalar>::dot_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels<Scalar>::norm_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const Scalar, cl::Buffer&, const unsigned int> > OpenclKernels<Scalar>::axpy_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const Scalar, const unsigned int> > OpenclKernels<Scalar>::scale_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<const Scalar, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels<Scalar>::vmul_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const Scalar, const Scalar, const unsigned int> > OpenclKernels<Scalar>::custom_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels<Scalar>::full_to_pressure_restriction_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > OpenclKernels<Scalar>::add_coarse_pressure_correction_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > OpenclKernels<Scalar>::prolongate_vector_k;
template<class Scalar>
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels<Scalar>::spmv_blocked_k;
template<class Scalar>
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels<Scalar>::spmv_blocked_add_k;
template<class Scalar>
std::unique_ptr<spmv_kernel_type> OpenclKernels<Scalar>::spmv_k;
template<class Scalar>
std::unique_ptr<spmv_kernel_type> OpenclKernels<Scalar>::spmv_noreset_k;
template<class Scalar>
std::unique_ptr<residual_blocked_kernel_type> OpenclKernels<Scalar>::residual_blocked_k;
template<class Scalar>
std::unique_ptr<residual_kernel_type> OpenclKernels<Scalar>::residual_k;
template<class Scalar>
std::unique_ptr<ilu_apply1_kernel_type> OpenclKernels<Scalar>::ILU_apply1_k;
template<class Scalar>
std::unique_ptr<ilu_apply2_kernel_type> OpenclKernels<Scalar>::ILU_apply2_k;
template<class Scalar>
std::unique_ptr<stdwell_apply_kernel_type> OpenclKernels<Scalar>::stdwell_apply_k;
template<class Scalar>
std::unique_ptr<ilu_decomp_kernel_type> OpenclKernels<Scalar>::ilu_decomp_k;
template<class Scalar>
std::unique_ptr<isaiL_kernel_type> OpenclKernels<Scalar>::isaiL_k;
template<class Scalar>
std::unique_ptr<isaiU_kernel_type> OpenclKernels<Scalar>::isaiU_k;
// divide A by B, and round up: return (int)ceil(A/B)
unsigned int ceilDivision(const unsigned int A, const unsigned int B)
@ -71,7 +90,10 @@ unsigned int ceilDivision(const unsigned int A, const unsigned int B)
return A / B + (A % B > 0);
}
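
For example, ceilDivision(1000, 256) = 1000/256 + (1000 % 256 > 0) = 3 + 1 = 4, i.e. four work groups of 256 items are needed to cover 1000 elements.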
void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::vector<cl::Device>& devices, int verbosity_)
template<class Scalar>
void OpenclKernels<Scalar>::init(cl::Context *context,
cl::CommandQueue *queue_,
std::vector<cl::Device>& devices, int verbosity_)
{
if (initialized) {
OpmLog::debug("Warning OpenclKernels is already initialized");
@ -118,10 +140,10 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
// actually creating the kernels
dot_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1")));
norm_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
axpy_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
scale_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, const unsigned int>(cl::Kernel(program, "scale")));
vmul_k.reset(new cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "vmul")));
custom_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
axpy_k.reset(new cl::KernelFunctor<cl::Buffer&, const Scalar, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
scale_k.reset(new cl::KernelFunctor<cl::Buffer&, const Scalar, const unsigned int>(cl::Kernel(program, "scale")));
vmul_k.reset(new cl::KernelFunctor<const Scalar, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "vmul")));
custom_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const Scalar, const Scalar, const unsigned int>(cl::Kernel(program, "custom")));
full_to_pressure_restriction_k.reset(new cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "full_to_pressure_restriction")));
add_coarse_pressure_correction_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int>(cl::Kernel(program, "add_coarse_pressure_correction")));
prolongate_vector_k.reset(new cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int>(cl::Kernel(program, "prolongate_vector")));
@ -146,20 +168,21 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
initialized = true;
} // end get_opencl_kernels()
double OpenclKernels::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
template<class Scalar>
Scalar OpenclKernels<Scalar>::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
{
const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
Timer t_dot;
tmp.resize(num_work_groups);
cl::Event event = (*dot_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in1, in2, out, N, cl::Local(lmem_per_work_group));
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp.data());
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(Scalar) * num_work_groups, tmp.data());
double gpu_sum = 0.0;
Scalar gpu_sum = 0.0;
for (unsigned int i = 0; i < num_work_groups; ++i) {
gpu_sum += tmp[i];
}
@ -174,20 +197,21 @@ double OpenclKernels::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int
return gpu_sum;
}
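
The dot_1 kernel produces one partial sum per work group; the final accumulation over tmp happens on the host, as the loop above shows. A CPU-only sketch of the same two-stage pattern, with a hypothetical helper and the fixed group size of 256 used in the kernel launch:

#include <algorithm>
#include <cstddef>
#include <vector>

// Assumes a.size() == b.size().
double two_stage_dot(const std::vector<double>& a, const std::vector<double>& b)
{
    const std::size_t group = 256;
    const std::size_t n = a.size();
    const std::size_t groups = n / group + (n % group > 0);   // ceilDivision
    std::vector<double> partial(groups, 0.0);
    for (std::size_t g = 0; g < groups; ++g) {                 // stage 1: per-group sums (GPU kernel)
        const std::size_t end = std::min(n, (g + 1) * group);
        for (std::size_t i = g * group; i < end; ++i) {
            partial[g] += a[i] * b[i];
        }
    }
    double sum = 0.0;
    for (double p : partial) { sum += p; }                     // stage 2: host-side finalization
    return sum;
}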
double OpenclKernels::norm(cl::Buffer& in, cl::Buffer& out, int N)
template<class Scalar>
Scalar OpenclKernels<Scalar>::norm(cl::Buffer& in, cl::Buffer& out, int N)
{
const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
Timer t_norm;
tmp.resize(num_work_groups);
cl::Event event = (*norm_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, N, cl::Local(lmem_per_work_group));
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp.data());
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(Scalar) * num_work_groups, tmp.data());
double gpu_norm = 0.0;
Scalar gpu_norm = 0.0;
for (unsigned int i = 0; i < num_work_groups; ++i) {
gpu_norm += tmp[i];
}
@ -203,7 +227,8 @@ double OpenclKernels::norm(cl::Buffer& in, cl::Buffer& out, int N)
return gpu_norm;
}
void OpenclKernels::axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N)
template<class Scalar>
void OpenclKernels<Scalar>::axpy(cl::Buffer& in, const Scalar a, cl::Buffer& out, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
@ -220,7 +245,8 @@ void OpenclKernels::axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N)
}
}
void OpenclKernels::scale(cl::Buffer& in, const double a, int N)
template<class Scalar>
void OpenclKernels<Scalar>::scale(cl::Buffer& in, const Scalar a, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
@ -237,7 +263,8 @@ void OpenclKernels::scale(cl::Buffer& in, const double a, int N)
}
}
void OpenclKernels::vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
template<class Scalar>
void OpenclKernels<Scalar>::vmul(const Scalar alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
@ -254,8 +281,9 @@ void OpenclKernels::vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, c
}
}
void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r,
const double omega, const double beta, int N)
template<class Scalar>
void OpenclKernels<Scalar>::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r,
const Scalar omega, const Scalar beta, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
@ -272,7 +300,8 @@ void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r,
}
}
void OpenclKernels::full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb)
template<class Scalar>
void OpenclKernels<Scalar>::full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
@ -289,7 +318,8 @@ void OpenclKernels::full_to_pressure_restriction(const cl::Buffer& fine_y, cl::B
}
}
void OpenclKernels::add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb)
template<class Scalar>
void OpenclKernels<Scalar>::add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
@ -306,7 +336,8 @@ void OpenclKernels::add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buf
}
}
void OpenclKernels::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N)
template<class Scalar>
void OpenclKernels<Scalar>::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
@ -323,32 +354,33 @@ void OpenclKernels::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, con
}
}
void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
const cl::Buffer& x, cl::Buffer& b, int Nb,
unsigned int block_size, bool reset, bool add)
template<class Scalar>
void OpenclKernels<Scalar>::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
const cl::Buffer& x, cl::Buffer& b, int Nb,
unsigned int block_size, bool reset, bool add)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
Timer t_spmv;
cl::Event event;
if (block_size > 1) {
if (add) {
event = (*spmv_blocked_add_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
} else {
event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
}
} else {
if (reset) {
event = (*spmv_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
} else {
event = (*spmv_noreset_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
}
}
@ -360,23 +392,24 @@ void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
}
}
void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
cl::Buffer& x, const cl::Buffer& rhs,
cl::Buffer& out, int Nb, unsigned int block_size)
template<class Scalar>
void OpenclKernels<Scalar>::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
cl::Buffer& x, const cl::Buffer& rhs,
cl::Buffer& out, int Nb, unsigned int block_size)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
Timer t_residual;
cl::Event event;
if (block_size > 1) {
event = (*residual_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, Nb, x, rhs, out, block_size, cl::Local(lmem_per_work_group));
vals, cols, rows, Nb, x, rhs, out, block_size, cl::Local(lmem_per_work_group));
} else {
event = (*residual_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, Nb, x, rhs, out, cl::Local(lmem_per_work_group));
vals, cols, rows, Nb, x, rhs, out, cl::Local(lmem_per_work_group));
}
if (verbosity >= 4) {
@ -387,22 +420,23 @@ void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& row
}
}
void OpenclKernels::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
cl::Buffer& rows, cl::Buffer& diagIndex,
const cl::Buffer& y, cl::Buffer& x,
cl::Buffer& rowsPerColor, int color,
int rowsThisColor, unsigned int block_size)
template<class Scalar>
void OpenclKernels<Scalar>::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
cl::Buffer& rows, cl::Buffer& diagIndex,
const cl::Buffer& y, cl::Buffer& x,
cl::Buffer& rowsPerColor, int color,
int rowsThisColor, unsigned int block_size)
{
const unsigned int work_group_size = preferred_workgroup_size_multiple;
const unsigned int num_work_groups = rowsThisColor;
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
Timer t_ilu_apply1;
cl::Event event = (*ILU_apply1_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
rowIndices, vals, cols, rows, diagIndex,
y, x, rowsPerColor, color, block_size,
cl::Local(lmem_per_work_group));
rowIndices, vals, cols, rows, diagIndex,
y, x, rowsPerColor, color, block_size,
cl::Local(lmem_per_work_group));
if (verbosity >= 5) {
event.wait();
@ -412,22 +446,23 @@ void OpenclKernels::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buf
}
}
void OpenclKernels::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
cl::Buffer& rows, cl::Buffer& diagIndex,
cl::Buffer& invDiagVals, cl::Buffer& x,
cl::Buffer& rowsPerColor, int color,
int rowsThisColor, unsigned int block_size)
template<class Scalar>
void OpenclKernels<Scalar>::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
cl::Buffer& rows, cl::Buffer& diagIndex,
cl::Buffer& invDiagVals, cl::Buffer& x,
cl::Buffer& rowsPerColor, int color,
int rowsThisColor, unsigned int block_size)
{
const unsigned int work_group_size = preferred_workgroup_size_multiple;
const unsigned int num_work_groups = rowsThisColor;
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
Timer t_ilu_apply2;
cl::Event event = (*ILU_apply2_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
rowIndices, vals, cols, rows, diagIndex,
invDiagVals, x, rowsPerColor, color, block_size,
cl::Local(lmem_per_work_group));
rowIndices, vals, cols, rows, diagIndex,
invDiagVals, x, rowsPerColor, color, block_size,
cl::Local(lmem_per_work_group));
if (verbosity >= 5) {
event.wait();
@ -437,23 +472,24 @@ void OpenclKernels::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buf
}
}
void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices,
cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
cl::Buffer& diagIndex, cl::Buffer& invDiagVals,
int rowsThisColor, unsigned int block_size)
template<class Scalar>
void OpenclKernels<Scalar>::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices,
cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
cl::Buffer& diagIndex, cl::Buffer& invDiagVals,
int rowsThisColor, unsigned int block_size)
{
const unsigned int work_group_size = 128;
const unsigned int num_work_groups = rowsThisColor;
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int num_hwarps_per_group = work_group_size / 16;
const unsigned int lmem_per_work_group = num_hwarps_per_group * block_size * block_size * sizeof(double); // each block needs a pivot
const unsigned int lmem_per_work_group = num_hwarps_per_group * block_size * block_size * sizeof(Scalar); // each block needs a pivot
Timer t_ilu_decomp;
cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
firstRow, lastRow, rowIndices,
vals, cols, rows,
invDiagVals, diagIndex, rowsThisColor,
cl::Local(lmem_per_work_group));
firstRow, lastRow, rowIndices,
vals, cols, rows,
invDiagVals, diagIndex, rowsThisColor,
cl::Local(lmem_per_work_group));
if (verbosity >= 4) {
event.wait();
@ -463,19 +499,20 @@ void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices
}
}
void OpenclKernels::apply_stdwells(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells)
template<class Scalar>
void OpenclKernels<Scalar>::apply_stdwells(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells)
{
const unsigned int work_group_size = 32;
const unsigned int total_work_items = num_std_wells * work_group_size;
const unsigned int lmem1 = sizeof(double) * work_group_size;
const unsigned int lmem2 = sizeof(double) * dim_wells;
const unsigned int lmem1 = sizeof(Scalar) * work_group_size;
const unsigned int lmem2 = sizeof(Scalar) * dim_wells;
Timer t_apply_stdwells;
cl::Event event = (*stdwell_apply_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells, d_val_pointers_ocl,
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells, d_val_pointers_ocl,
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
if (verbosity >= 4) {
event.wait();
@ -485,8 +522,9 @@ void OpenclKernels::apply_stdwells(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_
}
}
void OpenclKernels::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& mapping, cl::Buffer& nvc,
cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals, cl::Buffer& invLvals, unsigned int Nb)
template<class Scalar>
void OpenclKernels<Scalar>::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& mapping, cl::Buffer& nvc,
cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals, cl::Buffer& invLvals, unsigned int Nb)
{
const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
@ -494,7 +532,7 @@ void OpenclKernels::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu
Timer t_isaiL;
cl::Event event = (*isaiL_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
diagIndex, colPointers, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invLvals, Nb);
diagIndex, colPointers, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invLvals, Nb);
if (verbosity >= 4) {
event.wait();
@ -504,9 +542,10 @@ void OpenclKernels::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu
}
}
void OpenclKernels::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& rowIndices, cl::Buffer& mapping,
cl::Buffer& nvc, cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals,
cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb)
template<class Scalar>
void OpenclKernels<Scalar>::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& rowIndices, cl::Buffer& mapping,
cl::Buffer& nvc, cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals,
cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb)
{
const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
@ -514,7 +553,7 @@ void OpenclKernels::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu
Timer t_isaiU;
cl::Event event = (*isaiU_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
diagIndex, colPointers, rowIndices, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invDiagVals, invUvals, Nb);
diagIndex, colPointers, rowIndices, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invDiagVals, invUvals, Nb);
if (verbosity >= 4) {
event.wait();
@ -524,5 +563,6 @@ void OpenclKernels::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu
}
}
} // namespace Accelerator
} // namespace Opm
template class OpenclKernels<double>;
} // namespace Opm::Accelerator
View File
@ -26,10 +26,7 @@
#include <opm/simulators/linalg/bda/opencl/opencl.hpp>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
using spmv_blocked_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
const cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
@ -54,21 +51,22 @@ using isaiL_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer
using isaiU_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>;
template<class Scalar>
class OpenclKernels
{
private:
static int verbosity;
static cl::CommandQueue *queue;
static std::vector<double> tmp; // used as tmp CPU buffer for dot() and norm()
static std::vector<Scalar> tmp; // used as tmp CPU buffer for dot() and norm()
static bool initialized;
static std::size_t preferred_workgroup_size_multiple; // stores CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > scale_k;
static std::unique_ptr<cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > vmul_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const Scalar, cl::Buffer&, const unsigned int> > axpy_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const Scalar, const unsigned int> > scale_k;
static std::unique_ptr<cl::KernelFunctor<const Scalar, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > vmul_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const Scalar, const Scalar, const unsigned int> > custom_k;
static std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > full_to_pressure_restriction_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > add_coarse_pressure_correction_k;
static std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > prolongate_vector_k;
@ -117,12 +115,12 @@ public:
static void init(cl::Context *context, cl::CommandQueue *queue, std::vector<cl::Device>& devices, int verbosity);
static double dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
static double norm(cl::Buffer& in, cl::Buffer& out, int N);
static void axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N);
static void scale(cl::Buffer& in, const double a, int N);
static void vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
static void custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N);
static Scalar dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
static Scalar norm(cl::Buffer& in, cl::Buffer& out, int N);
static void axpy(cl::Buffer& in, const Scalar a, cl::Buffer& out, int N);
static void scale(cl::Buffer& in, const Scalar a, int N);
static void vmul(const Scalar alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
static void custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const Scalar omega, const Scalar beta, int N);
static void full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb);
static void add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb);
static void prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N);
@ -150,7 +148,40 @@ public:
cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb);
};
} // namespace Accelerator
} // namespace Opm
#if CHOW_PATEL
#define DECLARE_ILU(T) \
template<> const std::string OpenclKernels<T>::ILU_apply1_str; \
template<> const std::string OpenclKernels<T>::ILU_apply2_str;
#else
#define DECLARE_ILU(T) \
template<> const std::string OpenclKernels<T>::ILU_apply1_fm_str; \
template<> const std::string OpenclKernels<T>::ILU_apply2_fm_str;
#endif
#define DECLARE_INSTANCE(T) \
DECLARE_ILU(T) \
template<> const std::string OpenclKernels<T>::axpy_str; \
template<> const std::string OpenclKernels<T>::scale_str; \
template<> const std::string OpenclKernels<T>::vmul_str; \
template<> const std::string OpenclKernels<T>::dot_1_str; \
template<> const std::string OpenclKernels<T>::norm_str; \
template<> const std::string OpenclKernels<T>::custom_str; \
template<> const std::string OpenclKernels<T>::full_to_pressure_restriction_str; \
template<> const std::string OpenclKernels<T>::add_coarse_pressure_correction_str; \
template<> const std::string OpenclKernels<T>::prolongate_vector_str; \
template<> const std::string OpenclKernels<T>::spmv_blocked_str; \
template<> const std::string OpenclKernels<T>::spmv_blocked_add_str; \
template<> const std::string OpenclKernels<T>::spmv_str; \
template<> const std::string OpenclKernels<T>::spmv_noreset_str; \
template<> const std::string OpenclKernels<T>::residual_blocked_str; \
template<> const std::string OpenclKernels<T>::residual_str; \
template<> const std::string OpenclKernels<T>::stdwell_apply_str; \
template<> const std::string OpenclKernels<T>::ILU_decomp_str; \
template<> const std::string OpenclKernels<T>::isaiL_str; \
template<> const std::string OpenclKernels<T>::isaiU_str;
DECLARE_INSTANCE(double)
} // namespace Opm::Accelerator
#endif
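
The DECLARE_INSTANCE block above only declares explicit specializations of the per-type kernel-source strings; the definitions holding the actual OpenCL source text are provided elsewhere (in a generated source file). A reduced, self-contained sketch of that declare/define pattern follows; the class, member name and kernel body are hypothetical and only illustrate the mechanism, not the project's real axpy source.

#include <iostream>
#include <string>

template<class Scalar>
struct Kernels {
    static const std::string axpy_src;
};

// Header-style part: declaration only (no initializer), as in DECLARE_INSTANCE(double).
template<> const std::string Kernels<double>::axpy_src;

// Source-style part: the definition that would normally be generated into a .cpp file.
template<> const std::string Kernels<double>::axpy_src = R"(
__kernel void axpy(__global const double* in, const double a,
                   __global double* out, const unsigned int N)
{
    const unsigned int i = get_global_id(0);
    if (i < N) out[i] += a * in[i];
}
)";

int main()
{
    std::cout << Kernels<double>::axpy_src;   // at runtime the kernel source is ordinary string data
}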
View File
@ -37,41 +37,50 @@
// otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
#define COPY_ROW_BY_ROW 0
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
using Opm::OpmLog;
using Dune::Timer;
template <unsigned int block_size>
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, bool opencl_ilu_parallel_, std::string linsolver) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_parallel(opencl_ilu_parallel_) {
template<class Scalar, unsigned int block_size>
openclSolverBackend<Scalar,block_size>::
openclSolverBackend(int verbosity_,
int maxit_,
Scalar tolerance_,
unsigned int platformID_,
unsigned int deviceID_,
bool opencl_ilu_parallel_,
std::string linsolver)
: Base(verbosity_, maxit_, tolerance_, platformID_, deviceID_)
, opencl_ilu_parallel(opencl_ilu_parallel_)
{
bool use_cpr, use_isai;
if (linsolver.compare("ilu0") == 0) {
if (linsolver == "ilu0") {
use_cpr = false;
use_isai = false;
} else if (linsolver.compare("cpr_quasiimpes") == 0) {
} else if (linsolver == "cpr_quasiimpes") {
use_cpr = true;
use_isai = false;
} else if (linsolver.compare("isai") == 0) {
} else if (linsolver == "isai") {
use_cpr = false;
use_isai = true;
} else if (linsolver.compare("cpr_trueimpes") == 0) {
OPM_THROW(std::logic_error, "Error openclSolver does not support --linerar-solver=cpr_trueimpes");
} else if (linsolver == "cpr_trueimpes") {
OPM_THROW(std::logic_error, "Error openclSolver does not support "
"--linear-solver=cpr_trueimpes");
} else {
OPM_THROW(std::logic_error, "Error unknown value for argument --linear-solver, " + linsolver);
}
using PreconditionerType = typename Preconditioner<block_size>::Type;
using PreconditionerType = Preconditioner<Scalar,block_size>;
if (use_cpr) {
prec = Preconditioner<block_size>::create(PreconditionerType::CPR, opencl_ilu_parallel, verbosity);
prec = PreconditionerType::create(PreconditionerType::Type::CPR,
opencl_ilu_parallel, verbosity);
} else if (use_isai) {
prec = Preconditioner<block_size>::create(PreconditionerType::BISAI, opencl_ilu_parallel, verbosity);
prec = PreconditionerType::create(PreconditionerType::Type::BISAI,
opencl_ilu_parallel, verbosity);
} else {
prec = Preconditioner<block_size>::create(PreconditionerType::BILU0, opencl_ilu_parallel, verbosity);
prec = PreconditionerType::create(PreconditionerType::Type::BILU0,
opencl_ilu_parallel, verbosity);
}
std::ostringstream out;
@ -103,7 +112,7 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
out.clear();
if (platforms.size() <= platformID) {
OPM_THROW(std::logic_error, "Error chosen too high OpenCL platform ID");
OPM_THROW(std::logic_error, "Error: Invalid OpenCL platform ID selected");
} else {
std::string platform_info;
out << "Chosen:\n";
@ -119,7 +128,8 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
platforms[platformID].getDevices(CL_DEVICE_TYPE_ALL, &devices);
if (devices.empty()) {
OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL devices are found");
OPM_THROW(std::logic_error, "Error openclSolver is selected but "
"no OpenCL devices are found");
}
out << "Found " << devices.size() << " OpenCL devices" << "\n";
@ -203,8 +213,7 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
context = std::make_shared<cl::Context>(devices[0]);
queue.reset(new cl::CommandQueue(*context, devices[0], 0, &err));
OpenclKernels::init(context.get(), queue.get(), devices, verbosity);
OpenclKernels<Scalar>::init(context.get(), queue.get(), devices, verbosity);
} catch (const cl::Error& error) {
std::ostringstream oss;
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
@ -217,26 +226,33 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
}
}
template <unsigned int block_size>
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, bool opencl_ilu_parallel_) :
BdaSolver<block_size>(verbosity_, maxit_, tolerance_), opencl_ilu_parallel(opencl_ilu_parallel_)
template<class Scalar, unsigned int block_size>
openclSolverBackend<Scalar,block_size>::
openclSolverBackend(int verbosity_, int maxit_,
Scalar tolerance_, bool opencl_ilu_parallel_)
: Base(verbosity_, maxit_, tolerance_)
, opencl_ilu_parallel(opencl_ilu_parallel_)
{
// prec = std::make_unique<BILU0<block_size> >(opencl_ilu_parallel, verbosity_);
// cpr = std::make_unique<CPR<block_size> >(verbosity_, opencl_ilu_parallel, /*use_amg=*/false);
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
template<class Scalar, unsigned int block_size>
void openclSolverBackend<Scalar,block_size>::
setOpencl(std::shared_ptr<cl::Context>& context_,
std::shared_ptr<cl::CommandQueue>& queue_)
{
context = context_;
queue = queue_;
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
template<class Scalar, unsigned int block_size>
void openclSolverBackend<Scalar,block_size>::
gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res)
{
float it;
double rho, rhop, beta, alpha, omega, tmp1, tmp2;
double norm, norm_0;
Scalar rho, rhop, beta, alpha, omega, tmp1, tmp2;
Scalar norm, norm_0;
Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
@ -246,15 +262,15 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
// set initial values
events.resize(5);
queue->enqueueFillBuffer(d_p, 0, 0, sizeof(double) * N, nullptr, &events[0]);
queue->enqueueFillBuffer(d_v, 0, 0, sizeof(double) * N, nullptr, &events[1]);
queue->enqueueFillBuffer(d_p, 0, 0, sizeof(Scalar) * N, nullptr, &events[0]);
queue->enqueueFillBuffer(d_v, 0, 0, sizeof(Scalar) * N, nullptr, &events[1]);
rho = 1.0;
alpha = 1.0;
omega = 1.0;
queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(double) * N, nullptr, &events[2]);
queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(double) * N, nullptr, &events[3]);
queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &events[4]);
queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(Scalar) * N, nullptr, &events[2]);
queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(Scalar) * N, nullptr, &events[3]);
queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(Scalar) * N, nullptr, &events[4]);
cl::WaitForEvents(events);
events.clear();
@ -263,7 +279,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueue[Fill|Copy]Buffer error");
}
norm = OpenclKernels::norm(d_r, d_tmp, N);
norm = OpenclKernels<Scalar>::norm(d_r, d_tmp, N);
norm_0 = norm;
if (verbosity > 1) {
@ -277,11 +293,11 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
}
for (it = 0.5; it < maxit; it += 0.5) {
rhop = rho;
rho = OpenclKernels::dot(d_rw, d_r, d_tmp, N);
rho = OpenclKernels<Scalar>::dot(d_rw, d_r, d_tmp, N);
if (it > 1) {
beta = (rho / rhop) * (alpha / omega);
OpenclKernels::custom(d_p, d_v, d_r, omega, beta, N);
OpenclKernels<Scalar>::custom(d_p, d_v, d_r, omega, beta, N);
}
if (verbosity >= 3) {
queue->finish();
@ -298,7 +314,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
}
// v = A * pw
OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
OpenclKernels<Scalar>::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
if (verbosity >= 3) {
queue->finish();
t_spmv.stop();
@ -306,20 +322,20 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
}
// apply wellContributions
if(wellContribs.getNumWells() > 0){
static_cast<WellContributionsOCL&>(wellContribs).apply(d_pw, d_v);
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsOCL<Scalar>&>(wellContribs).apply(d_pw, d_v);
}
if(verbosity >= 3) {
if (verbosity >= 3) {
queue->finish();
t_well.stop();
t_rest.start();
}
tmp1 = OpenclKernels::dot(d_rw, d_v, d_tmp, N);
tmp1 = OpenclKernels<Scalar>::dot(d_rw, d_v, d_tmp, N);
alpha = rho / tmp1;
OpenclKernels::axpy(d_v, -alpha, d_r, N); // r = r - alpha * v
OpenclKernels::axpy(d_pw, alpha, d_x, N); // x = x + alpha * pw
norm = OpenclKernels::norm(d_r, d_tmp, N);
OpenclKernels<Scalar>::axpy(d_v, -alpha, d_r, N); // r = r - alpha * v
OpenclKernels<Scalar>::axpy(d_pw, alpha, d_x, N); // x = x + alpha * pw
norm = OpenclKernels<Scalar>::norm(d_r, d_tmp, N);
if (verbosity >= 3) {
queue->finish();
t_rest.stop();
@ -343,8 +359,8 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
}
// t = A * s
OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
if(verbosity >= 3){
OpenclKernels<Scalar>::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
if (verbosity >= 3) {
queue->finish();
t_spmv.stop();
t_well.start();
@ -352,7 +368,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
// apply wellContributions
if(wellContribs.getNumWells() > 0){
static_cast<WellContributionsOCL&>(wellContribs).apply(d_s, d_t);
static_cast<WellContributionsOCL<Scalar>&>(wellContribs).apply(d_s, d_t);
}
if (verbosity >= 3) {
queue->finish();
@ -360,12 +376,12 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
t_rest.start();
}
tmp1 = OpenclKernels::dot(d_t, d_r, d_tmp, N);
tmp2 = OpenclKernels::dot(d_t, d_t, d_tmp, N);
tmp1 = OpenclKernels<Scalar>::dot(d_t, d_r, d_tmp, N);
tmp2 = OpenclKernels<Scalar>::dot(d_t, d_t, d_tmp, N);
omega = tmp1 / tmp2;
OpenclKernels::axpy(d_s, omega, d_x, N); // x = x + omega * s
OpenclKernels::axpy(d_t, -omega, d_r, N); // r = r - omega * t
norm = OpenclKernels::norm(d_r, d_tmp, N);
OpenclKernels<Scalar>::axpy(d_s, omega, d_x, N); // x = x + omega * s
OpenclKernels<Scalar>::axpy(d_t, -omega, d_r, N); // r = r - omega * t
norm = OpenclKernels<Scalar>::norm(d_r, d_tmp, N);
if (verbosity >= 3) {
queue->finish();
t_rest.stop();
@ -382,7 +398,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
}
}
res.iterations = std::min(it, (float)maxit);
res.iterations = std::min(it, static_cast<float>(maxit));
res.reduction = norm / norm_0;
res.conv_rate = static_cast<double>(pow(res.reduction, 1.0 / it));
res.elapsed = t_total.stop();
@ -390,7 +406,8 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
if (verbosity > 0) {
std::ostringstream out;
out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
out << "=== converged: " << res.converged << ", conv_rate: "
<< res.conv_rate << ", time: " << res.elapsed <<
", time per iteration: " << res.elapsed / it << ", iterations: " << it;
OpmLog::info(out.str());
}
@ -405,9 +422,11 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
}
}
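For orientation, the loop implemented by gpu_pbicgstab above is the standard preconditioned BiCGStab recurrence. A host-side sketch with plain std::vector and caller-supplied matvec/preconditioner callbacks (illustrative only; no GPU buffers, well contributions or timing):

#include <cmath>
#include <cstddef>
#include <functional>
#include <vector>

using Vec = std::vector<double>;
using Op  = std::function<void(const Vec&, Vec&)>; // writes f(x) into the second argument

static double dot(const Vec& a, const Vec& b)
{
    double s = 0.0;
    for (std::size_t i = 0; i < a.size(); ++i) { s += a[i] * b[i]; }
    return s;
}

// Preconditioned BiCGStab for A*x = b with initial guess x0 = 0 (sketch, not the GPU code).
bool bicgstab(const Op& matvec, const Op& precond,
              const Vec& b, Vec& x, int maxit, double tolerance)
{
    const std::size_t N = b.size();
    x.assign(N, 0.0);
    Vec r = b, rw = b, p = b;                           // r0 = b - A*x0 = b
    Vec pw(N), v(N), s(N), t(N);
    double rho = 1.0, alpha = 1.0, omega = 1.0;
    const double norm_0 = std::sqrt(dot(r, r));

    for (int it = 1; it <= maxit; ++it) {
        const double rhop = rho;
        rho = dot(rw, r);
        if (it > 1) {
            const double beta = (rho / rhop) * (alpha / omega);
            for (std::size_t i = 0; i < N; ++i) {       // p = r + beta*(p - omega*v)
                p[i] = r[i] + beta * (p[i] - omega * v[i]);
            }
        }
        precond(p, pw);                                 // pw = M^-1 * p
        matvec(pw, v);                                  // v  = A * pw
        alpha = rho / dot(rw, v);
        for (std::size_t i = 0; i < N; ++i) { x[i] += alpha * pw[i]; r[i] -= alpha * v[i]; }
        if (std::sqrt(dot(r, r)) < tolerance * norm_0) { return true; }

        precond(r, s);                                  // s = M^-1 * r
        matvec(s, t);                                   // t = A * s
        omega = dot(t, r) / dot(t, t);
        for (std::size_t i = 0; i < N; ++i) { x[i] += omega * s[i]; r[i] -= omega * t[i]; }
        if (std::sqrt(dot(r, r)) < tolerance * norm_0) { return true; }
    }
    return false;
}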
template <unsigned int block_size>
void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix) {
template<class Scalar, unsigned int block_size>
void openclSolverBackend<Scalar,block_size>::
initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
{
this->Nb = matrix->Nb;
this->N = Nb * block_size;
this->nnzb = matrix->nnzbs;
@ -437,22 +456,21 @@ void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix>
mat = matrix;
jacMat = jacMatrix;
d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_rb = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_r = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_rw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_p = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_pw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_s = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_t = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_v = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_tmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_rb = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_r = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_rw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_p = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_pw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_s = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_t = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_v = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_tmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_Avals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * nnz);
d_Avals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * nnz);
d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
} catch (const cl::Error& error) {
std::ostringstream oss;
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
@ -467,8 +485,10 @@ void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix>
initialized = true;
} // end initialize()
template <unsigned int block_size>
void openclSolverBackend<block_size>::copy_system_to_gpu() {
template<class Scalar, unsigned int block_size>
void openclSolverBackend<Scalar,block_size>::
copy_system_to_gpu()
{
Timer t;
events.resize(5);
@ -476,18 +496,25 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
int sum = 0;
for (int i = 0; i < Nb; ++i) {
int size_row = mat->rowPointers[i + 1] - mat->rowPointers[i];
memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum,
size_row * sizeof(Scalar) * block_size * block_size);
sum += size_row * block_size * block_size;
}
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous.data(), nullptr, &events[0]);
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0,
sizeof(Scalar) * nnz, vals_contiguous.data(),
nullptr, &events[0]);
#else
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, mat->nnzValues, nullptr, &events[0]);
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0,
sizeof(Scalar) * nnz, mat->nnzValues, nullptr, &events[0]);
#endif
err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, mat->colIndices, nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), mat->rowPointers, nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, h_b, nullptr, &events[3]);
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[4]);
err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0,
sizeof(int) * nnzb, mat->colIndices, nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0,
sizeof(int) * (Nb + 1), mat->rowPointers, nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0,
sizeof(Scalar) * N, h_b, nullptr, &events[3]);
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(Scalar) * N, nullptr, &events[4]);
cl::WaitForEvents(events);
events.clear();
@ -504,8 +531,10 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
} // end copy_system_to_gpu()
// don't copy rowpointers and colindices, they stay the same
template <unsigned int block_size>
void openclSolverBackend<block_size>::update_system_on_gpu() {
template<class Scalar, unsigned int block_size>
void openclSolverBackend<Scalar,block_size>::
update_system_on_gpu()
{
Timer t;
events.resize(3);
@ -513,16 +542,21 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
int sum = 0;
for (int i = 0; i < Nb; ++i) {
int size_row = mat->rowPointers[i + 1] - mat->rowPointers[i];
memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum,
size_row * sizeof(Scalar) * block_size * block_size);
sum += size_row * block_size * block_size;
}
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous.data(), nullptr, &events[0]);
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0,
sizeof(Scalar) * nnz, vals_contiguous.data(),
nullptr, &events[0]);
#else
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, mat->nnzValues, nullptr, &events[0]);
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0,
sizeof(Scalar) * nnz, mat->nnzValues, nullptr, &events[0]);
#endif
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, h_b, nullptr, &events[1]);
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0,
sizeof(Scalar) * N, h_b, nullptr, &events[1]);
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(Scalar) * N, nullptr, &events[2]);
cl::WaitForEvents(events);
events.clear();
@ -538,9 +572,10 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
}
} // end update_system_on_gpu()
template <unsigned int block_size>
bool openclSolverBackend<block_size>::analyze_matrix() {
template<class Scalar, unsigned int block_size>
bool openclSolverBackend<Scalar,block_size>::
analyze_matrix()
{
Timer t;
bool success;
@ -560,9 +595,10 @@ bool openclSolverBackend<block_size>::analyze_matrix() {
return success;
} // end analyze_matrix()
template <unsigned int block_size>
void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
template<class Scalar, unsigned int block_size>
void openclSolverBackend<Scalar,block_size>::
update_system(Scalar* vals, Scalar* b)
{
Timer t;
mat->nnzValues = vals;
@ -575,9 +611,10 @@ void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
}
} // end update_system()
template <unsigned int block_size>
bool openclSolverBackend<block_size>::create_preconditioner() {
template<class Scalar, unsigned int block_size>
bool openclSolverBackend<Scalar,block_size>::
create_preconditioner()
{
Timer t;
bool result;
@ -594,9 +631,10 @@ bool openclSolverBackend<block_size>::create_preconditioner() {
return result;
} // end create_preconditioner()
template <unsigned int block_size>
void openclSolverBackend<block_size>::solve_system(WellContributions &wellContribs, BdaResult &res) {
template<class Scalar, unsigned int block_size>
void openclSolverBackend<Scalar,block_size>::
solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res)
{
Timer t;
// actually solve
@ -604,7 +642,8 @@ void openclSolverBackend<block_size>::solve_system(WellContributions &wellContri
gpu_pbicgstab(wellContribs, res);
} catch (const cl::Error& error) {
std::ostringstream oss;
oss << "openclSolverBackend::solve_system error: " << error.what() << "(" << error.err() << ")\n";
oss << "openclSolverBackend::solve_system error: " << error.what()
<< "(" << error.err() << ")\n";
oss << getErrorString(error.err());
// rethrow exception
OPM_THROW(std::logic_error, oss.str());
@ -618,17 +657,17 @@ void openclSolverBackend<block_size>::solve_system(WellContributions &wellContri
out << "openclSolver::solve_system(): " << t.stop() << " s";
OpmLog::info(out.str());
}
} // end solve_system()
// copy result to host memory
// caller must be sure that x is a valid array
template <unsigned int block_size>
void openclSolverBackend<block_size>::get_result(double *x) {
template<class Scalar, unsigned int block_size>
void openclSolverBackend<Scalar,block_size>::
get_result(Scalar* x)
{
Timer t;
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, x);
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(Scalar) * N, x);
if (verbosity > 2) {
std::ostringstream out;
@ -637,13 +676,13 @@ void openclSolverBackend<block_size>::get_result(double *x) {
}
} // end get_result()
template <unsigned int block_size>
SolverStatus openclSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
double *b,
std::shared_ptr<BlockedMatrix> jacMatrix,
WellContributions& wellContribs,
BdaResult &res)
template<class Scalar, unsigned int block_size>
SolverStatus openclSolverBackend<Scalar,block_size>::
solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res)
{
if (initialized == false) {
initialize(matrix, jacMatrix);
@ -668,21 +707,14 @@ SolverStatus openclSolverBackend<block_size>::solve_system(std::shared_ptr<Block
return SolverStatus::BDA_SOLVER_SUCCESS;
}
#define INSTANTIATE_TYPE(T) \
template class openclSolverBackend<T,1>; \
template class openclSolverBackend<T,2>; \
template class openclSolverBackend<T,3>; \
template class openclSolverBackend<T,4>; \
template class openclSolverBackend<T,5>; \
template class openclSolverBackend<T,6>;
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template openclSolverBackend<n>::openclSolverBackend( \
int, int, double, unsigned int, unsigned int, bool, std::string); \
template openclSolverBackend<n>::openclSolverBackend(int, int, double, bool); \
template void openclSolverBackend<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&);
INSTANTIATE_TYPE(double)
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator

View File

@ -27,16 +27,13 @@
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
/// This class implements an OpenCL-based ILU0-BiCGStab solver on GPU
template <unsigned int block_size>
class openclSolverBackend : public BdaSolver<block_size>
template<class Scalar, unsigned int block_size>
class openclSolverBackend : public BdaSolver<Scalar,block_size>
{
typedef BdaSolver<block_size> Base;
using Base = BdaSolver<Scalar,block_size>;
using Base::N;
using Base::Nb;
@ -50,8 +47,8 @@ class openclSolverBackend : public BdaSolver<block_size>
using Base::initialized;
private:
double *h_b = nullptr; // b vector, on host
std::vector<double> vals_contiguous; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
Scalar* h_b = nullptr; // b vector, on host
std::vector<Scalar> vals_contiguous; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
// OpenCL variables must be reusable, they are initialized in initialize()
cl::Buffer d_Avals, d_Acols, d_Arows; // matrix in BSR format on GPU
@ -63,12 +60,12 @@ private:
bool useJacMatrix = false;
std::unique_ptr<Preconditioner<block_size> > prec;
std::unique_ptr<Preconditioner<Scalar,block_size>> prec;
// can perform blocked ILU0 and AMG on pressure component
bool is_root; // allow for nested solvers, the root solver is called by BdaBridge
bool analysis_done = false;
std::shared_ptr<BlockedMatrix> mat = nullptr; // original matrix
std::shared_ptr<BlockedMatrix> jacMat = nullptr; // matrix for preconditioner
std::shared_ptr<BlockedMatrix<Scalar>> mat{}; // original matrix
std::shared_ptr<BlockedMatrix<Scalar>> jacMat{}; // matrix for preconditioner
bool opencl_ilu_parallel; // parallelize ILU operations (with level_scheduling)
std::vector<cl::Event> events;
cl_int err;
@ -76,12 +73,13 @@ private:
/// Solve linear system using ilu0-bicgstab
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
void gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res);
/// Initialize GPU and allocate memory
/// \param[in] matrix matrix A
/// \param[in] jacMatrix matrix for preconditioner
void initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix);
void initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);
/// Copy linear system to GPU
void copy_system_to_gpu();
@ -89,7 +87,7 @@ private:
/// Reassign pointers, in case the addresses of the Dune variables have changed
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] b input vector b, contains N values
void update_system(double *vals, double *b);
void update_system(Scalar* vals, Scalar* b);
/// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
void update_system_on_gpu();
@ -106,11 +104,11 @@ private:
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// could be empty
/// \param[inout] res summary of solver result
void solve_system(WellContributions &wellContribs, BdaResult &res);
void solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res);
public:
std::shared_ptr<cl::Context> context;
std::shared_ptr<cl::CommandQueue> queue;
std::shared_ptr<cl::Context> context{};
std::shared_ptr<cl::CommandQueue> queue{};
/// Construct an openclSolver
/// \param[in] linear_solver_verbosity verbosity of openclSolver
@ -121,11 +119,13 @@ public:
/// \param[in] opencl_ilu_parallel whether to parallelize the ILU decomposition and application in OpenCL with level_scheduling
/// \param[in] linsolver indicating the preconditioner, equal to the --linear-solver cmdline argument
/// only ilu0, cpr_quasiimpes and isai are supported
openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID,
bool opencl_ilu_parallel, std::string linsolver);
openclSolverBackend(int linear_solver_verbosity, int maxit, Scalar tolerance,
unsigned int platformID, unsigned int deviceID,
bool opencl_ilu_parallel, std::string linsolver);
/// For the CPR coarse solver
openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, bool opencl_ilu_parallel);
openclSolverBackend(int linear_solver_verbosity, int maxit,
Scalar tolerance, bool opencl_ilu_parallel);
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
/// \param[in] matrix matrix A
@ -134,8 +134,11 @@ public:
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
/// \return status code
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res) override;
/// Solve scalar linear system, for example a coarse system of an AMG preconditioner
/// Data is already on the GPU
@ -143,19 +146,16 @@ public:
/// Get result after linear solve, and perform postprocessing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void get_result(double *x) override;
void get_result(Scalar* x) override;
/// Set OpenCL objects
/// This class either creates them based on platformID and deviceID or receives them through this function
/// \param[in] context the opencl context to be used
/// \param[in] queue the opencl queue to be used
void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue);
void setOpencl(std::shared_ptr<cl::Context>& context,
std::shared_ptr<cl::CommandQueue>& queue);
}; // end class openclSolverBackend
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
#endif

View File

@ -25,93 +25,122 @@
#include <opm/simulators/linalg/bda/MultisegmentWellContribution.hpp>
namespace Opm
{
namespace Opm {
using Accelerator::OpenclKernels;
void WellContributionsOCL::setOpenCLEnv(cl::Context* context_, cl::CommandQueue* queue_) {
template<class Scalar>
void WellContributionsOCL<Scalar>::
setOpenCLEnv(cl::Context* context_, cl::CommandQueue* queue_)
{
this->context = context_;
this->queue = queue_;
}
void WellContributionsOCL::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y){
OpenclKernels::apply_stdwells(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
d_x, d_y, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
template<class Scalar>
void WellContributionsOCL<Scalar>::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y)
{
OpenclKernels<Scalar>::apply_stdwells(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl,
*d_Ccols_ocl, *d_Bcols_ocl,
d_x, d_y, this->dim, this->dim_wells,
*d_val_pointers_ocl, this->num_std_wells);
}
void WellContributionsOCL::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){
template<class Scalar>
void WellContributionsOCL<Scalar>::apply_mswells(cl::Buffer d_x, cl::Buffer d_y)
{
if (h_x.empty()) {
h_x.resize(N);
h_y.resize(N);
h_x.resize(this->N);
h_y.resize(this->N);
}
events.resize(2);
queue->enqueueReadBuffer(d_x, CL_FALSE, 0, sizeof(double) * N, h_x.data(), nullptr, &events[0]);
queue->enqueueReadBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y.data(), nullptr, &events[1]);
queue->enqueueReadBuffer(d_x, CL_FALSE, 0, sizeof(Scalar) * this->N,
h_x.data(), nullptr, &events[0]);
queue->enqueueReadBuffer(d_y, CL_FALSE, 0, sizeof(Scalar) * this->N,
h_y.data(), nullptr, &events[1]);
cl::WaitForEvents(events);
events.clear();
// actually apply MultisegmentWells
for (auto& well : multisegments) {
for (auto& well : this->multisegments) {
well->apply(h_x.data(), h_y.data());
}
// copy vector y from CPU to GPU
events.resize(1);
queue->enqueueWriteBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y.data(), nullptr, &events[0]);
queue->enqueueWriteBuffer(d_y, CL_FALSE, 0, sizeof(Scalar) * this->N,
h_y.data(), nullptr, &events[0]);
events[0].wait();
events.clear();
}
void WellContributionsOCL::apply(cl::Buffer d_x, cl::Buffer d_y){
if(num_std_wells > 0){
template<class Scalar>
void WellContributionsOCL<Scalar>::apply(cl::Buffer d_x, cl::Buffer d_y)
{
if (this->num_std_wells > 0) {
apply_stdwells(d_x, d_y);
}
if(num_ms_wells > 0){
if (this->num_ms_wells > 0) {
apply_mswells(d_x, d_y);
}
}
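For context, apply_stdwells launches the kernel that applies each standard well's Schur-complement contribution to y. Assuming the usual convention for these blocks (y -= C^T * (D^-1 * (B * x)), with B and C holding one dim_wells x dim block per perforated cell and D^-1 one dim_wells x dim_wells block per well), a dense host-side sketch of that product for a single well looks like this (all names hypothetical, not the API above):

#include <vector>

void apply_stdwell_host(const std::vector<double>& B,     // nblocks * dim_wells * dim, row-major blocks
                        const std::vector<double>& Dinv,  // dim_wells * dim_wells, already inverted
                        const std::vector<double>& C,     // nblocks * dim_wells * dim, row-major blocks
                        const std::vector<int>& cols,     // perforated cell indices
                        const std::vector<double>& x,     // ncells * dim
                        std::vector<double>& y,           // ncells * dim
                        int dim, int dim_wells)
{
    const int nblocks = static_cast<int>(cols.size());
    std::vector<double> z1(dim_wells, 0.0), z2(dim_wells, 0.0);

    // z1 = B * x (gather from the perforated cells)
    for (int b = 0; b < nblocks; ++b)
        for (int r = 0; r < dim_wells; ++r)
            for (int c = 0; c < dim; ++c)
                z1[r] += B[b * dim_wells * dim + r * dim + c] * x[cols[b] * dim + c];

    // z2 = D^-1 * z1
    for (int r = 0; r < dim_wells; ++r)
        for (int c = 0; c < dim_wells; ++c)
            z2[r] += Dinv[r * dim_wells + c] * z1[c];

    // y -= C^T * z2 (scatter back to the perforated cells)
    for (int b = 0; b < nblocks; ++b)
        for (int c = 0; c < dim; ++c)
            for (int r = 0; r < dim_wells; ++r)
                y[cols[b] * dim + c] -= C[b * dim_wells * dim + r * dim + c] * z2[r];
}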
void WellContributionsOCL::APIaddMatrix(MatrixType type,
int* colIndices,
double* values,
unsigned int val_size)
template<class Scalar>
void WellContributionsOCL<Scalar>::
APIaddMatrix(MatrixType type,
int* colIndices,
Scalar* values,
unsigned int val_size)
{
if (!allocated) {
if (!this->allocated) {
OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
}
switch (type) {
case MatrixType::C:
events.resize(2);
queue->enqueueWriteBuffer(*d_Cnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
queue->enqueueWriteBuffer(*d_Ccols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
queue->enqueueWriteBuffer(*d_Cnnzs_ocl, CL_FALSE,
sizeof(Scalar) * this->num_blocks_so_far * this->dim * this->dim_wells,
sizeof(Scalar) * val_size * this->dim * this->dim_wells,
values, nullptr, &events[0]);
queue->enqueueWriteBuffer(*d_Ccols_ocl, CL_FALSE,
sizeof(int) * this->num_blocks_so_far,
sizeof(int) * val_size, colIndices, nullptr, &events[1]);
cl::WaitForEvents(events);
events.clear();
break;
case MatrixType::D:
events.resize(1);
queue->enqueueWriteBuffer(*d_Dnnzs_ocl, CL_FALSE, sizeof(double) * num_std_wells_so_far * dim_wells * dim_wells, sizeof(double) * dim_wells * dim_wells, values, nullptr, &events[0]);
queue->enqueueWriteBuffer(*d_Dnnzs_ocl, CL_FALSE,
sizeof(Scalar) * this->num_std_wells_so_far * this->dim_wells * this->dim_wells,
sizeof(Scalar) * this->dim_wells * this->dim_wells,
values, nullptr, &events[0]);
events[0].wait();
events.clear();
break;
case MatrixType::B:
events.resize(2);
queue->enqueueWriteBuffer(*d_Bnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
queue->enqueueWriteBuffer(*d_Bcols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
queue->enqueueWriteBuffer(*d_Bnnzs_ocl, CL_FALSE,
sizeof(Scalar) * this->num_blocks_so_far * this->dim * this->dim_wells,
sizeof(Scalar) * val_size * this->dim * this->dim_wells,
values, nullptr, &events[0]);
queue->enqueueWriteBuffer(*d_Bcols_ocl, CL_FALSE,
sizeof(int) * this->num_blocks_so_far, sizeof(int) * val_size,
colIndices, nullptr, &events[1]);
cl::WaitForEvents(events);
events.clear();
val_pointers[num_std_wells_so_far] = num_blocks_so_far;
if (num_std_wells_so_far == num_std_wells - 1) {
val_pointers[num_std_wells] = num_blocks;
this->val_pointers[this->num_std_wells_so_far] = this->num_blocks_so_far;
if (this->num_std_wells_so_far == this->num_std_wells - 1) {
this->val_pointers[this->num_std_wells] = this->num_blocks;
events.resize(1);
queue->enqueueWriteBuffer(*d_val_pointers_ocl, CL_FALSE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers.data(), nullptr, &events[0]);
queue->enqueueWriteBuffer(*d_val_pointers_ocl, CL_FALSE, 0,
sizeof(unsigned int) * (this->num_std_wells + 1),
this->val_pointers.data(), nullptr, &events[0]);
events[0].wait();
events.clear();
}
@ -122,14 +151,21 @@ void WellContributionsOCL::APIaddMatrix(MatrixType type,
}
}
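As a concrete example of the offsets used above: with dim = 3, dim_wells = 4 and a well contributing val_size = 5 blocks, the MatrixType::C (and B) write starts at byte offset sizeof(Scalar) * num_blocks_so_far * 12 and covers 5 * 12 = 60 scalars, the MatrixType::D write starts at sizeof(Scalar) * num_std_wells_so_far * 16 and covers 16 scalars, and val_pointers[num_std_wells_so_far] records num_blocks_so_far so the kernels can locate each well's block range.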
void WellContributionsOCL::APIalloc()
template<class Scalar>
void WellContributionsOCL<Scalar>::APIalloc()
{
d_Cnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
d_Dnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
d_Bnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
d_Ccols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
d_Bcols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
d_val_pointers_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
d_Cnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE,
sizeof(Scalar) * this->num_blocks * this->dim * this->dim_wells);
d_Dnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE,
sizeof(Scalar) * this->num_std_wells * this->dim_wells * this->dim_wells);
d_Bnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE,
sizeof(Scalar) * this->num_blocks * this->dim * this->dim_wells);
d_Ccols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * this->num_blocks);
d_Bcols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * this->num_blocks);
d_val_pointers_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE,
sizeof(unsigned int) * (this->num_std_wells + 1));
}
} //namespace Opm
template class WellContributionsOCL<double>;
} // namespace Opm

View File

@ -29,10 +29,10 @@
#include <vector>
namespace Opm
{
namespace Opm {
class WellContributionsOCL : public WellContributions
template<class Scalar>
class WellContributionsOCL : public WellContributions<Scalar>
{
public:
void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
@ -45,7 +45,10 @@ protected:
/// Allocate memory for the StandardWells
void APIalloc() override;
void APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size) override;
using MatrixType = typename WellContributions<Scalar>::MatrixType;
void APIaddMatrix(MatrixType type, int* colIndices,
Scalar* values, unsigned int val_size) override;
cl::Context* context;
cl::CommandQueue* queue;
@ -55,10 +58,10 @@ protected:
std::unique_ptr<cl::Buffer> d_Ccols_ocl, d_Bcols_ocl;
std::unique_ptr<cl::Buffer> d_val_pointers_ocl;
std::vector<double> h_x;
std::vector<double> h_y;
std::vector<Scalar> h_x;
std::vector<Scalar> h_y;
};
} //namespace Opm
} // namespace Opm
#endif

View File

@ -47,27 +47,28 @@
#undef HIP_HAVE_CUDA_DEFINED
#endif
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
using Opm::OpmLog;
using Dune::Timer;
template <unsigned int block_size>
rocalutionSolverBackend<block_size>::rocalutionSolverBackend(int verbosity_, int maxit_, double tolerance_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_) {
template<class Scalar, unsigned int block_size>
rocalutionSolverBackend<Scalar,block_size>::
rocalutionSolverBackend(int verbosity_, int maxit_, Scalar tolerance_)
: Base(verbosity_, maxit_, tolerance_)
{
rocalution::init_rocalution();
rocalution::info_rocalution();
roc_solver = std::make_unique<rocalution::BiCGStab<rocalution::LocalMatrix<double>, rocalution::LocalVector<double>, double> >();
roc_prec = std::make_unique<rocalution::ILU<rocalution::LocalMatrix<double>, rocalution::LocalVector<double>, double> >();
using BCGS = rocalution::BiCGStab<Mat,Vec,Scalar>;
roc_solver = std::make_unique<BCGS>();
using ILU = rocalution::ILU<Mat,Vec,Scalar>;
roc_prec = std::make_unique<ILU>();
roc_solver->Verbose(0);
roc_solver->Init(/*abs_tol=*/1e-15, tolerance, /*divergence_tol=*/1e3, maxit);
}
template <unsigned int block_size>
rocalutionSolverBackend<block_size>::~rocalutionSolverBackend() {
template<class Scalar, unsigned int block_size>
rocalutionSolverBackend<Scalar,block_size>::~rocalutionSolverBackend()
{
// normally, these rocalution variables are destroyed after the destructor automatically,
// but sometimes it segfaults, both with test_rocalutionSolver and with an actual case
// release both variables here to prevent that segfault
@ -76,9 +77,10 @@ rocalutionSolverBackend<block_size>::~rocalutionSolverBackend() {
rocalution::stop_rocalution();
}
template <unsigned int block_size>
void rocalutionSolverBackend<block_size>::initialize(BlockedMatrix *matrix) {
template<class Scalar, unsigned int block_size>
void rocalutionSolverBackend<Scalar,block_size>::
initialize(BlockedMatrix<Scalar>* matrix)
{
this->Nb = matrix->Nb;
this->N = Nb * block_size;
this->nnzb = matrix->nnzbs;
@ -94,15 +96,16 @@ void rocalutionSolverBackend<block_size>::initialize(BlockedMatrix *matrix) {
initialized = true;
} // end initialize()
template <unsigned int block_size>
void rocalutionSolverBackend<block_size>::convert_matrix(BlockedMatrix *matrix) {
template<class Scalar, unsigned int block_size>
void rocalutionSolverBackend<Scalar,block_size>::
convert_matrix(BlockedMatrix<Scalar>* matrix)
{
Timer t;
for(int i = 0; i < Nb+1; ++i){
for (int i = 0; i < Nb+1; ++i) {
tmp_rowpointers[i] = matrix->rowPointers[i];
}
for(int i = 0; i < nnzb; ++i){
for (int i = 0; i < nnzb; ++i) {
tmp_colindices[i] = matrix->colIndices[i];
}
@ -112,7 +115,7 @@ void rocalutionSolverBackend<block_size>::convert_matrix(BlockedMatrix *matrix)
// BCSR_IND_BASE == 0: rocalution expects column-major
// BCSR_IND_BASE == 1: rocalution expects row-major
if (BCSR_IND_BASE == 0) {
for(int i = 0; i < nnzb; ++i){
for (int i = 0; i < nnzb; ++i) {
tmp_nnzvalues[i * block_size * block_size + 0] = matrix->nnzValues[i * block_size * block_size + 0];
tmp_nnzvalues[i * block_size * block_size + 1] = matrix->nnzValues[i * block_size * block_size + 3];
tmp_nnzvalues[i * block_size * block_size + 2] = matrix->nnzValues[i * block_size * block_size + 6];
@ -131,11 +134,12 @@ void rocalutionSolverBackend<block_size>::convert_matrix(BlockedMatrix *matrix)
}
}
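The index shuffle above hard-codes the row-major to column-major transposition of each block (shown here for the 3x3 case); an equivalent generic form for arbitrary block_size would be (a sketch, not the project's code):

// Transpose each block_size x block_size block from row-major storage (src)
// to column-major storage (dst); illustrative only.
template<class Scalar>
void transpose_blocks(const Scalar* src, Scalar* dst, int nnzb, int block_size)
{
    for (int i = 0; i < nnzb; ++i) {
        for (int r = 0; r < block_size; ++r) {
            for (int c = 0; c < block_size; ++c) {
                dst[i * block_size * block_size + c * block_size + r] =
                    src[i * block_size * block_size + r * block_size + c];
            }
        }
    }
}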
// copy result to host memory
// caller must be sure that x is a valid array
template <unsigned int block_size>
void rocalutionSolverBackend<block_size>::get_result(double *x) {
template<class Scalar, unsigned int block_size>
void rocalutionSolverBackend<Scalar,block_size>::
get_result(Scalar* x)
{
Timer t;
std::copy(h_x.begin(), h_x.end(), x);
@ -147,13 +151,13 @@ void rocalutionSolverBackend<block_size>::get_result(double *x) {
}
} // end get_result()
template <unsigned int block_size>
SolverStatus rocalutionSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
double *b,
[[maybe_unused]] std::shared_ptr<BlockedMatrix> jacMatrix,
[[maybe_unused]] WellContributions& wellContribs,
BdaResult &res)
template<class Scalar, unsigned int block_size>
SolverStatus rocalutionSolverBackend<Scalar,block_size>::
solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
[[maybe_unused]] std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
[[maybe_unused]] WellContributions<Scalar>& wellContribs,
BdaResult& res)
{
if (initialized == false) {
initialize(matrix.get());
@ -161,21 +165,20 @@ SolverStatus rocalutionSolverBackend<block_size>::solve_system(std::shared_ptr<B
tmp_rowpointers = new int[Nb+1];
tmp_colindices = new int[nnzb];
tmp_nnzvalues = new double[nnzb*block_size*block_size];
tmp_nnzvalues = new Scalar[nnzb*block_size*block_size];
convert_matrix(matrix.get());
rocalution::LocalVector<double> roc_x;
rocalution::LocalVector<double> roc_rhs;
rocalution::LocalMatrix<double> roc_mat;
Vec roc_x;
Vec roc_rhs;
Mat roc_mat;
// this also transfers ownership of the allocated memory to rocalution
// and sets the tmp_* pointers to nullptr
roc_mat.SetDataPtrBCSR(
&tmp_rowpointers,
&tmp_colindices,
&tmp_nnzvalues,
"matrix A", nnzb, Nb, Nb, block_size);
roc_mat.SetDataPtrBCSR(&tmp_rowpointers,
&tmp_colindices,
&tmp_nnzvalues,
"matrix A", nnzb, Nb, Nb, block_size);
roc_mat.MoveToAccelerator();
roc_x.MoveToAccelerator();
@ -196,7 +199,7 @@ SolverStatus rocalutionSolverBackend<block_size>::solve_system(std::shared_ptr<B
// so it just calls ILU::Build() every time
roc_solver->ReBuildNumeric();
double norm_0 = roc_rhs.Norm(); // since the initial guess is a vector with 0s, initial error is norm(b)
Scalar norm_0 = roc_rhs.Norm(); // since the initial guess is a vector with 0s, initial error is norm(b)
// actually solve
Dune::Timer t_solve;
@ -215,7 +218,6 @@ SolverStatus rocalutionSolverBackend<block_size>::solve_system(std::shared_ptr<B
res.conv_rate = static_cast<double>(pow(res.reduction, 1.0 / res.iterations));
res.converged = (roc_solver->GetSolverStatus() == 2);
// copy solution vector to host vector
// if roc_x could be reused, this should be removed here
// and roc_x should be directly copied into x in get_result()
@ -224,26 +226,25 @@ SolverStatus rocalutionSolverBackend<block_size>::solve_system(std::shared_ptr<B
if (verbosity >= 1) {
std::ostringstream out;
out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
", time per iteration: " << res.elapsed / res.iterations << ", iterations: " << res.iterations;
out << "=== converged: " << res.converged
<< ", conv_rate: " << res.conv_rate
<< ", time: " << res.elapsed <<
", time per iteration: " << res.elapsed / res.iterations
<< ", iterations: " << res.iterations;
OpmLog::info(out.str());
}
return SolverStatus::BDA_SOLVER_SUCCESS;
}
#define INSTANTIATE_TYPE(T) \
template class rocalutionSolverBackend<T,1>; \
template class rocalutionSolverBackend<T,2>; \
template class rocalutionSolverBackend<T,3>; \
template class rocalutionSolverBackend<T,4>; \
template class rocalutionSolverBackend<T,5>; \
template class rocalutionSolverBackend<T,6>;
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template rocalutionSolverBackend<n>::rocalutionSolverBackend(int, int, double);
INSTANTIATE_TYPE(double)
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator

View File

@ -31,17 +31,14 @@ template<class Scalar> class LocalMatrix;
template<class Scalar> class LocalVector;
}
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
/// This class implements a rocalution-based linear solver on GPU
/// It uses ilu0-bicgstab
template <unsigned int block_size>
class rocalutionSolverBackend : public BdaSolver<block_size>
template<class Scalar, unsigned int block_size>
class rocalutionSolverBackend : public BdaSolver<Scalar,block_size>
{
typedef BdaSolver<block_size> Base;
using Base = BdaSolver<Scalar,block_size>;
using Base::N;
using Base::Nb;
@ -55,31 +52,34 @@ class rocalutionSolverBackend : public BdaSolver<block_size>
using Base::initialized;
private:
std::vector<double> h_x; // store solution vector on host
std::vector<Scalar> h_x; // store solution vector on host
int *tmp_rowpointers; // store matrix on host, this pointer is given to and freed by rocalution
int *tmp_colindices; // store matrix on host, this pointer is given to and freed by rocalution
double *tmp_nnzvalues; // store matrix on host, this pointer is given to and freed by rocalution
Scalar* tmp_nnzvalues; // store matrix on host, this pointer is given to and freed by rocalution
std::unique_ptr<rocalution::ILU<rocalution::LocalMatrix<double>, rocalution::LocalVector<double>, double> > roc_prec;
std::unique_ptr<rocalution::BiCGStab<rocalution::LocalMatrix<double>, rocalution::LocalVector<double>, double> > roc_solver;
using Mat = rocalution::LocalMatrix<Scalar>;
using Vec = rocalution::LocalVector<Scalar>;
std::unique_ptr<rocalution::ILU<Mat,Vec,Scalar>> roc_prec;
std::unique_ptr<rocalution::BiCGStab<Mat,Vec,Scalar>> roc_solver;
/// Initialize sizes and allocate memory
/// \param[in] matrix matrix A
void initialize(BlockedMatrix *matrix);
void initialize(BlockedMatrix<Scalar>* matrix);
/// Convert matrix to rocalution format
/// copy matrix to raw pointers, which are given to and freed by rocalution
/// \param[in] matrix matrix A
void convert_matrix(BlockedMatrix *matrix);
void convert_matrix(BlockedMatrix<Scalar>* matrix);
public:
/// Construct a rocalutionSolver
/// also initialize rocalution library and rocalution variables
/// \param[in] linear_solver_verbosity verbosity of rocalutionSolver
/// \param[in] maxit maximum number of iterations for rocalutionSolver
/// \param[in] tolerance required relative tolerance for rocalutionSolver
rocalutionSolverBackend(int linear_solver_verbosity, int maxit, double tolerance);
rocalutionSolverBackend(int linear_solver_verbosity,
int maxit, Scalar tolerance);
/// Destroy a rocalutionSolver, and free memory
~rocalutionSolverBackend();
@ -91,17 +91,19 @@ public:
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
/// \return status code
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res) override;
/// Get result after linear solve, and perform postprocessing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void get_result(double *x) override;
void get_result(Scalar* x) override;
}; // end class rocalutionSolverBackend
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
#endif

View File

@ -93,20 +93,20 @@
extern std::shared_ptr<std::thread> copyThread;
#endif //HAVE_OPENMP
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
using Opm::OpmLog;
using Dune::Timer;
template <unsigned int block_size>
rocsparseSolverBackend<block_size>::rocsparseSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_) {
template<class Scalar, unsigned int block_size>
rocsparseSolverBackend<Scalar,block_size>::
rocsparseSolverBackend(int verbosity_, int maxit_, Scalar tolerance_,
unsigned int platformID_, unsigned int deviceID_)
: Base(verbosity_, maxit_, tolerance_, platformID_, deviceID_)
{
int numDevices = 0;
HIP_CHECK(hipGetDeviceCount(&numDevices));
if (static_cast<int>(deviceID) >= numDevices) {
OPM_THROW(std::runtime_error, "Error chosen too high HIP device ID");
OPM_THROW(std::runtime_error, "Invalid HIP device ID");
}
HIP_CHECK(hipSetDevice(deviceID));
@ -126,45 +126,45 @@ rocsparseSolverBackend<block_size>::rocsparseSolverBackend(int verbosity_, int m
ROCBLAS_CHECK(rocblas_set_stream(blas_handle, stream));
}
template <unsigned int block_size>
rocsparseSolverBackend<block_size>::~rocsparseSolverBackend() {
template<class Scalar, unsigned int block_size>
rocsparseSolverBackend<Scalar,block_size>::~rocsparseSolverBackend()
{
hipError_t hipstatus = hipStreamSynchronize(stream);
if(hipstatus != hipSuccess){
if (hipstatus != hipSuccess) {
OpmLog::error("Could not synchronize with hipStream");
}
hipstatus = hipStreamDestroy(stream);
if(hipstatus != hipSuccess){
if (hipstatus != hipSuccess) {
OpmLog::error("Could not destroy hipStream");
}
rocsparse_status status1 = rocsparse_destroy_handle(handle);
if(status1 != rocsparse_status_success){
if (status1 != rocsparse_status_success) {
OpmLog::error("Could not destroy rocsparse handle");
}
rocblas_status status2 = rocblas_destroy_handle(blas_handle);
if(status2 != rocblas_status_success){
if (status2 != rocblas_status_success) {
OpmLog::error("Could not destroy rocblas handle");
}
}
template <unsigned int block_size>
void rocsparseSolverBackend<block_size>::gpu_pbicgstab([[maybe_unused]] WellContributions& wellContribs,
BdaResult& res)
template<class Scalar, unsigned int block_size>
void rocsparseSolverBackend<Scalar,block_size>::
gpu_pbicgstab([[maybe_unused]] WellContributions<Scalar>& wellContribs,
BdaResult& res)
{
float it = 0.5;
double rho, rhop, beta, alpha, nalpha, omega, nomega, tmp1, tmp2;
double norm, norm_0;
double zero = 0.0;
double one = 1.0;
double mone = -1.0;
Scalar rho, rhop, beta, alpha, nalpha, omega, nomega, tmp1, tmp2;
Scalar norm, norm_0;
Scalar zero = 0.0;
Scalar one = 1.0;
Scalar mone = -1.0;
Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
// set stream here, the WellContributions object is destroyed every linear solve
// the number of wells can change every linear solve
if(wellContribs.getNumWells() > 0){
static_cast<WellContributionsRocsparse&>(wellContribs).setStream(stream);
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsRocsparse<Scalar>&>(wellContribs).setStream(stream);
}
// HIP_VERSION is defined as (HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH)
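// e.g. HIP 5.4.2 gives 5 * 10000000 + 4 * 100000 + 2 = 50400002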
@ -253,8 +253,8 @@ void rocsparseSolverBackend<block_size>::gpu_pbicgstab([[maybe_unused]] WellCont
}
// apply wellContributions
if(wellContribs.getNumWells() > 0){
static_cast<WellContributionsRocsparse&>(wellContribs).apply(d_pw, d_v);
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsRocsparse<Scalar>&>(wellContribs).apply(d_pw, d_v);
}
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
@ -312,15 +312,15 @@ void rocsparseSolverBackend<block_size>::gpu_pbicgstab([[maybe_unused]] WellCont
d_Avals, d_Arows, d_Acols, block_size,
d_s, &zero, d_t));
#endif
if(verbosity >= 3){
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
t_spmv.stop();
t_well.start();
}
// apply wellContributions
if(wellContribs.getNumWells() > 0){
static_cast<WellContributionsRocsparse&>(wellContribs).apply(d_s, d_t);
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsRocsparse<Scalar>&>(wellContribs).apply(d_s, d_t);
}
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
@ -360,8 +360,11 @@ void rocsparseSolverBackend<block_size>::gpu_pbicgstab([[maybe_unused]] WellCont
if (verbosity >= 1) {
std::ostringstream out;
out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
", time per iteration: " << res.elapsed / it << ", iterations: " << it;
out << "=== converged: " << res.converged
<< ", conv_rate: " << res.conv_rate
<< ", time: " << res.elapsed << \
", time per iteration: " << res.elapsed / it
<< ", iterations: " << it;
OpmLog::info(out.str());
}
if (verbosity >= 3) {
@ -375,9 +378,11 @@ void rocsparseSolverBackend<block_size>::gpu_pbicgstab([[maybe_unused]] WellCont
}
}
template <unsigned int block_size>
void rocsparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix) {
template<class Scalar, unsigned int block_size>
void rocsparseSolverBackend<Scalar,block_size>::
initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
{
this->Nb = matrix->Nb;
this->N = Nb * block_size;
this->nnzb = matrix->nnzbs;
@ -390,12 +395,14 @@ void rocsparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatri
}
std::ostringstream out;
out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnzb: " << nnzb << "\n";
out << "Initializing GPU, matrix size: "
<< Nb << " blockrows, nnzb: " << nnzb << "\n";
if (useJacMatrix) {
out << "Blocks in ILU matrix: " << jacMatrix->nnzbs << "\n";
}
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
out << "PlatformID: " << platformID << ", deviceID: " << deviceID << "\n";
out << "Maxit: " << maxit
<< std::scientific << ", tolerance: " << tolerance << "\n"
<< "PlatformID: " << platformID << ", deviceID: " << deviceID << "\n";
OpmLog::info(out.str());
out.str("");
out.clear();
@ -403,26 +410,26 @@ void rocsparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatri
mat = matrix;
jacMat = jacMatrix;
HIP_CHECK(hipMalloc((void**)&d_r, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_rw, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_p, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_pw, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_s, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_t, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_v, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_r, sizeof(Scalar) * N));
HIP_CHECK(hipMalloc((void**)&d_rw, sizeof(Scalar) * N));
HIP_CHECK(hipMalloc((void**)&d_p, sizeof(Scalar) * N));
HIP_CHECK(hipMalloc((void**)&d_pw, sizeof(Scalar) * N));
HIP_CHECK(hipMalloc((void**)&d_s, sizeof(Scalar) * N));
HIP_CHECK(hipMalloc((void**)&d_t, sizeof(Scalar) * N));
HIP_CHECK(hipMalloc((void**)&d_v, sizeof(Scalar) * N));
HIP_CHECK(hipMalloc((void**)&d_Arows, sizeof(rocsparse_int) * (Nb + 1)));
HIP_CHECK(hipMalloc((void**)&d_Acols, sizeof(rocsparse_int) * nnzb));
HIP_CHECK(hipMalloc((void**)&d_Avals, sizeof(double) * nnz));
HIP_CHECK(hipMalloc((void**)&d_x, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_b, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_Avals, sizeof(Scalar) * nnz));
HIP_CHECK(hipMalloc((void**)&d_x, sizeof(Scalar) * N));
HIP_CHECK(hipMalloc((void**)&d_b, sizeof(Scalar) * N));
if (useJacMatrix) {
HIP_CHECK(hipMalloc((void**)&d_Mrows, sizeof(rocsparse_int) * (Nb + 1)));
HIP_CHECK(hipMalloc((void**)&d_Mcols, sizeof(rocsparse_int) * nnzbs_prec));
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(double) * nnzbs_prec * block_size * block_size));
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(Scalar) * nnzbs_prec * block_size * block_size));
} else { // preconditioner matrix is same
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(double) * nnzbs_prec * block_size * block_size));
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(Scalar) * nnzbs_prec * block_size * block_size));
d_Mcols = d_Acols;
d_Mrows = d_Arows;
}
@ -430,26 +437,43 @@ void rocsparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatri
initialized = true;
} // end initialize()
template <unsigned int block_size>
void rocsparseSolverBackend<block_size>::copy_system_to_gpu(double *b) {
template<class Scalar, unsigned int block_size>
void rocsparseSolverBackend<Scalar,block_size>::
copy_system_to_gpu(Scalar *b)
{
Timer t;
HIP_CHECK(hipMemcpyAsync(d_Arows, mat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Acols, mat->colIndices, sizeof(rocsparse_int) * nnzb, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues, sizeof(double) * nnz, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemsetAsync(d_x, 0, sizeof(double) * N, stream));
HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Arows, mat->rowPointers,
sizeof(rocsparse_int) * (Nb + 1),
hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Acols, mat->colIndices,
sizeof(rocsparse_int) * nnzb,
hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues,
sizeof(Scalar) * nnz,
hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemsetAsync(d_x, 0, N * sizeof(Scalar), stream));
HIP_CHECK(hipMemcpyAsync(d_b, b, N * sizeof(Scalar),
hipMemcpyHostToDevice, stream));
if (useJacMatrix) {
#if HAVE_OPENMP
if(omp_get_max_threads() > 1)
copyThread->join();
if (omp_get_max_threads() > 1) {
copyThread->join();
}
#endif
HIP_CHECK(hipMemcpyAsync(d_Mrows, jacMat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Mcols, jacMat->colIndices, sizeof(rocsparse_int) * nnzbs_prec, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues, sizeof(double) * nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Mrows, jacMat->rowPointers,
sizeof(rocsparse_int) * (Nb + 1),
hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Mcols, jacMat->colIndices,
sizeof(rocsparse_int) * nnzbs_prec,
hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues,
sizeof(Scalar) * nnzbs_prec * block_size * block_size,
hipMemcpyHostToDevice, stream));
} else {
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(double) * nnz, hipMemcpyDeviceToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals,
sizeof(Scalar) * nnz, hipMemcpyDeviceToDevice, stream));
}
if (verbosity >= 3) {
@ -459,29 +483,36 @@ void rocsparseSolverBackend<block_size>::copy_system_to_gpu(double *b) {
std::ostringstream out;
out << "-----rocsparseSolver::copy_system_to_gpu(): " << t.elapsed() << " s\n";
out << "---rocsparseSolver::cum copy: " << c_copy << " s";
OpmLog::info(out.str());
OpmLog::info(out.str());
}
} // end copy_system_to_gpu()
// don't copy rowpointers and colindices, they stay the same
template <unsigned int block_size>
void rocsparseSolverBackend<block_size>::update_system_on_gpu(double *b) {
template<class Scalar, unsigned int block_size>
void rocsparseSolverBackend<Scalar,block_size>::
update_system_on_gpu(Scalar* b)
{
Timer t;
HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues, sizeof(double) * nnz, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemsetAsync(d_x, 0, sizeof(double) * N, stream));
HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues, sizeof(Scalar) * nnz,
hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemsetAsync(d_x, 0, N * sizeof(Scalar), stream));
HIP_CHECK(hipMemcpyAsync(d_b, b, N * sizeof(Scalar),
hipMemcpyHostToDevice, stream));
if (useJacMatrix) {
#if HAVE_OPENMP
if (omp_get_max_threads() > 1)
copyThread->join();
if (omp_get_max_threads() > 1) {
copyThread->join();
}
#endif
HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues, sizeof(double) * nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues,
sizeof(Scalar) * nnzbs_prec * block_size * block_size,
hipMemcpyHostToDevice, stream));
} else {
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(double) * nnz, hipMemcpyDeviceToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals,
sizeof(Scalar) * nnz, hipMemcpyDeviceToDevice, stream));
}
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
@ -493,8 +524,10 @@ void rocsparseSolverBackend<block_size>::update_system_on_gpu(double *b) {
}
} // end update_system_on_gpu()
template <unsigned int block_size>
bool rocsparseSolverBackend<block_size>::analyze_matrix() {
template<class Scalar, unsigned int block_size>
bool rocsparseSolverBackend<Scalar,block_size>::
analyze_matrix()
{
std::size_t d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
Timer t;
@ -523,7 +556,8 @@ bool rocsparseSolverBackend<block_size>::analyze_matrix() {
ROCSPARSE_CHECK(rocsparse_dbsrsv_buffer_size(handle, dir, operation, Nb, nnzbs_prec,
descr_U, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_U));
d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
d_bufferSize = std::max(d_bufferSize_M,
std::max(d_bufferSize_L, d_bufferSize_U));
HIP_CHECK(hipMalloc((void**)&d_buffer, d_bufferSize));
@ -571,9 +605,10 @@ bool rocsparseSolverBackend<block_size>::analyze_matrix() {
return true;
} // end analyze_matrix()
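Note that the analysis above still calls the double-precision rocsparse entry points (the d prefix in rocsparse_dbsrsv_buffer_size). If a float instantiation of this backend is added later, these calls would presumably need a small dispatch on Scalar. A hypothetical sketch, mirroring the buffer-size call shown above and not part of this commit:

// Hypothetical dispatch on Scalar; rocsparse ships _s and _d variants
// of the bsrsv routines. (Requires <type_traits> for std::is_same_v.)
if constexpr (std::is_same_v<Scalar, float>) {
    ROCSPARSE_CHECK(rocsparse_sbsrsv_buffer_size(handle, dir, operation, Nb, nnzbs_prec,
                                                 descr_U, d_Mvals, d_Mrows, d_Mcols,
                                                 block_size, ilu_info, &d_bufferSize_U));
} else {
    ROCSPARSE_CHECK(rocsparse_dbsrsv_buffer_size(handle, dir, operation, Nb, nnzbs_prec,
                                                 descr_U, d_Mvals, d_Mrows, d_Mcols,
                                                 block_size, ilu_info, &d_bufferSize_U));
}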
template <unsigned int block_size>
bool rocsparseSolverBackend<block_size>::create_preconditioner() {
template<class Scalar, unsigned int block_size>
bool rocsparseSolverBackend<Scalar,block_size>::
create_preconditioner()
{
Timer t;
bool result = true;
@ -598,9 +633,10 @@ bool rocsparseSolverBackend<block_size>::create_preconditioner() {
return result;
} // end create_preconditioner()
template <unsigned int block_size>
void rocsparseSolverBackend<block_size>::solve_system(WellContributions &wellContribs, BdaResult &res) {
template<class Scalar, unsigned int block_size>
void rocsparseSolverBackend<Scalar,block_size>::
solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res)
{
Timer t;
// actually solve
@ -612,17 +648,18 @@ void rocsparseSolverBackend<block_size>::solve_system(WellContributions &wellCon
out << "rocsparseSolver::solve_system(): " << t.stop() << " s";
OpmLog::info(out.str());
}
} // end solve_system()
// copy result to host memory
// caller must be sure that x is a valid array
template <unsigned int block_size>
void rocsparseSolverBackend<block_size>::get_result(double *x) {
template<class Scalar, unsigned int block_size>
void rocsparseSolverBackend<Scalar,block_size>::
get_result(Scalar* x)
{
Timer t;
HIP_CHECK(hipMemcpyAsync(x, d_x, sizeof(double) * N, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipMemcpyAsync(x, d_x, sizeof(Scalar) * N,
hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipStreamSynchronize(stream)); // always wait, caller might want to use x immediately
if (verbosity >= 3) {
@ -632,13 +669,13 @@ void rocsparseSolverBackend<block_size>::get_result(double *x) {
}
} // end get_result()
template <unsigned int block_size>
SolverStatus rocsparseSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
double *b,
std::shared_ptr<BlockedMatrix> jacMatrix,
WellContributions& wellContribs,
BdaResult &res)
template<class Scalar, unsigned int block_size>
SolverStatus rocsparseSolverBackend<Scalar,block_size>::
solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res)
{
if (initialized == false) {
initialize(matrix, jacMatrix);
@ -662,19 +699,14 @@ SolverStatus rocsparseSolverBackend<block_size>::solve_system(std::shared_ptr<Bl
return SolverStatus::BDA_SOLVER_SUCCESS;
}
#define INSTANTIATE_TYPE(T) \
template class rocsparseSolverBackend<T,1>; \
template class rocsparseSolverBackend<T,2>; \
template class rocsparseSolverBackend<T,3>; \
template class rocsparseSolverBackend<T,4>; \
template class rocsparseSolverBackend<T,5>; \
template class rocsparseSolverBackend<T,6>;
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template rocsparseSolverBackend<n>::rocsparseSolverBackend( \
int, int, double, unsigned int, unsigned int);
INSTANTIATE_TYPE(double)
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
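With the backend now templated on the scalar type, supporting single precision would in principle only take one more instantiation next to INSTANTIATE_TYPE(double). A hypothetical sketch, not part of this change and only meaningful once the rocsparse calls dispatch on Scalar:

// Hypothetical: single-precision instantiation of the rocsparse backend.
INSTANTIATE_TYPE(float)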

View File

@ -31,16 +31,13 @@
#include <hip/hip_version.h>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
/// This class implements a rocsparse-based ilu0-bicgstab solver on GPU
template <unsigned int block_size>
class rocsparseSolverBackend : public BdaSolver<block_size>
template<class Scalar, unsigned int block_size>
class rocsparseSolverBackend : public BdaSolver<Scalar,block_size>
{
typedef BdaSolver<block_size> Base;
using Base = BdaSolver<Scalar,block_size>;
using Base::N;
using Base::Nb;
@ -54,14 +51,13 @@ class rocsparseSolverBackend : public BdaSolver<block_size>
using Base::initialized;
private:
double c_copy = 0.0; // cumulative timer measuring the total time it takes to transfer the data to the GPU
bool useJacMatrix = false;
bool analysis_done = false;
std::shared_ptr<BlockedMatrix> mat = nullptr; // original matrix
std::shared_ptr<BlockedMatrix> jacMat = nullptr; // matrix for preconditioner
std::shared_ptr<BlockedMatrix<Scalar>> mat{}; // original matrix
std::shared_ptr<BlockedMatrix<Scalar>> jacMat{}; // matrix for preconditioner
int nnzbs_prec = 0; // number of nnz blocks in preconditioner matrix M
rocsparse_direction dir = rocsparse_direction_row;
@ -77,31 +73,31 @@ private:
rocsparse_int *d_Arows, *d_Mrows;
rocsparse_int *d_Acols, *d_Mcols;
double *d_Avals, *d_Mvals;
double *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
double *d_pw, *d_s, *d_t, *d_v;
Scalar *d_Avals, *d_Mvals;
Scalar *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
Scalar *d_pw, *d_s, *d_t, *d_v;
void *d_buffer; // buffer space, used by rocsparse ilu0 analysis
int ver;
char rev[64];
/// Solve linear system using ilu0-bicgstab
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
void gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res);
/// Initialize GPU and allocate memory
/// \param[in] matrix matrix A
/// \param[in] jacMatrix matrix for preconditioner
void initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix);
void initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);
/// Copy linear system to GPU
/// \param[in] b input vector, contains N values
void copy_system_to_gpu(double *b);
void copy_system_to_gpu(Scalar* b);
/// Update linear system on GPU
/// \param[in] b input vector, contains N values
void update_system_on_gpu(double *b);
void update_system_on_gpu(Scalar* b);
/// Analyze sparsity pattern to extract parallelism
/// \return true iff analysis was successful
@ -114,16 +110,20 @@ private:
/// Solve linear system
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
void solve_system(WellContributions &wellContribs, BdaResult &res);
void solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res);
public:
/// Construct a openclSolver
/// \param[in] linear_solver_verbosity verbosity of openclSolver
/// \param[in] maxit maximum number of iterations for openclSolver
/// \param[in] tolerance required relative tolerance for openclSolver
/// Construct a rocsparseSolver
/// \param[in] linear_solver_verbosity verbosity of rocsparseSolver
/// \param[in] maxit maximum number of iterations for rocsparseSolver
/// \param[in] tolerance required relative tolerance for rocsparseSolver
/// \param[in] platformID the OpenCL platform to be used
/// \param[in] deviceID the device to be used
rocsparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID);
rocsparseSolverBackend(int linear_solver_verbosity,
int maxit,
Scalar tolerance,
unsigned int platformID,
unsigned int deviceID);
/// For the CPR coarse solver
// rocsparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder);
@ -138,8 +138,11 @@ public:
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
/// \return status code
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res) override;
/// Solve scalar linear system, for example a coarse system of an AMG preconditioner
/// Data is already on the GPU
@ -147,13 +150,10 @@ public:
/// Get result after linear solve, and perform postprocessing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void get_result(double *x) override;
void get_result(Scalar* x) override;
}; // end class rocsparseSolverBackend
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
#endif

View File

@ -56,17 +56,17 @@ namespace Opm
#ifdef __HIP__
/// HIP kernel to apply the standard wellcontributions
__global__ void stdwell_apply(
const double *Cnnzs,
const double *Dnnzs,
const double *Bnnzs,
const unsigned *Ccols,
const unsigned *Bcols,
const double *x,
double *y,
const unsigned dim,
const unsigned dim_wells,
const unsigned *val_pointers)
template<class Scalar>
__global__ void stdwell_apply(const Scalar* Cnnzs,
const Scalar* Dnnzs,
const Scalar* Bnnzs,
const unsigned* Ccols,
const unsigned* Bcols,
const Scalar* x,
Scalar* y,
const unsigned dim,
const unsigned dim_wells,
const unsigned *val_pointers)
{
unsigned wgId = blockIdx.x;
unsigned wiId = threadIdx.x;
@ -76,16 +76,16 @@ __global__ void stdwell_apply(
unsigned numBlocksPerWarp = blockDim.x/valsPerBlock;
unsigned c = wiId % dim;
unsigned r = (wiId/dim) % dim_wells;
double temp;
Scalar temp;
extern __shared__ double localSum[];
double *z1 = localSum + gridDim.x;
double *z2 = z1 + dim_wells;
extern __shared__ Scalar localSum[];
Scalar* z1 = localSum + gridDim.x;
Scalar* z2 = z1 + dim_wells;
localSum[wiId] = 0;
if(wiId < numActiveWorkItems){
if (wiId < numActiveWorkItems) {
unsigned b = wiId/valsPerBlock + val_pointers[wgId];
while(b < valSize + val_pointers[wgId]){
while (b < valSize + val_pointers[wgId]) {
int colIdx = Bcols[b];
localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
b += numBlocksPerWarp;
@ -99,14 +99,14 @@ __global__ void stdwell_apply(
// 6 7 8 18 19 20
// 9 10 11 21 22 23
// workitem i will hold the sum of workitems i and i + valsPerBlock
if(wiId < valsPerBlock){
if (wiId < valsPerBlock){
for (unsigned i = 1; i < numBlocksPerWarp; ++i) {
localSum[wiId] += localSum[wiId + i*valsPerBlock];
}
}
if(c == 0 && wiId < valsPerBlock){
for(unsigned i = dim - 1; i > 0; --i){
if (c == 0 && wiId < valsPerBlock){
for (unsigned i = dim - 1; i > 0; --i) {
localSum[wiId] += localSum[wiId + i];
}
z1[r] = localSum[wiId];
@ -117,7 +117,7 @@ __global__ void stdwell_apply(
if(wiId < dim_wells){
temp = 0.0;
for(unsigned i = 0; i < dim_wells; ++i){
for (unsigned i = 0; i < dim_wells; ++i) {
temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
}
z2[wiId] = temp;
@ -125,10 +125,10 @@ __global__ void stdwell_apply(
__syncthreads();
if(wiId < dim*valSize){
if (wiId < dim*valSize){
temp = 0.0;
unsigned bb = wiId/dim + val_pointers[wgId];
for (unsigned j = 0; j < dim_wells; ++j){
for (unsigned j = 0; j < dim_wells; ++j) {
temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
}
@ -138,17 +138,26 @@ __global__ void stdwell_apply(
}
#endif
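One caveat if this kernel is ever instantiated for more than one Scalar type: writing extern __shared__ Scalar localSum[] declares the same dynamic shared-memory symbol with different element types, which compilers typically reject across instantiations. A common workaround, shown only as a hypothetical sketch and not part of this commit, is to declare untyped shared memory once and cast per instantiation:

// Hypothetical workaround for templated dynamic shared memory.
extern __shared__ unsigned char localMem[];
Scalar* localSum = reinterpret_cast<Scalar*>(localMem);
Scalar* z1 = localSum + gridDim.x;
Scalar* z2 = z1 + dim_wells;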
void WellContributionsRocsparse::apply_stdwells([[maybe_unused]] double *d_x,
[[maybe_unused]] double *d_y){
template<class Scalar>
void WellContributionsRocsparse<Scalar>::
apply_stdwells([[maybe_unused]] Scalar* d_x,
[[maybe_unused]] Scalar* d_y)
{
#ifdef __HIP__
unsigned gridDim = num_std_wells;
unsigned blockDim = 64;
unsigned shared_mem_size = (blockDim + 2 * dim_wells) * sizeof(double); // shared memory for localSum, z1 and z2
unsigned shared_mem_size = (blockDim + 2 * dim_wells) * sizeof(Scalar); // shared memory for localSum, z1 and z2
// dim3(N) will create a vector {N, 1, 1}
stdwell_apply<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(
d_Cnnzs_hip, d_Dnnzs_hip, d_Bnnzs_hip, d_Ccols_hip, d_Bcols_hip,
d_x, d_y, dim, dim_wells, d_val_pointers_hip
stdwell_apply<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(d_Cnnzs_hip,
d_Dnnzs_hip,
d_Bnnzs_hip,
d_Ccols_hip,
d_Bcols_hip,
d_x,
d_y,
dim,
dim_wells,
d_val_pointers_hip
);
HIP_CHECK(hipStreamSynchronize(stream));
#else
@ -156,67 +165,89 @@ void WellContributionsRocsparse::apply_stdwells([[maybe_unused]] double *d_x,
#endif
}
void WellContributionsRocsparse::apply_mswells(double *d_x, double *d_y){
template<class Scalar>
void WellContributionsRocsparse<Scalar>::
apply_mswells(Scalar* d_x, Scalar* d_y)
{
if (h_x.empty()) {
h_x.resize(N);
h_y.resize(N);
h_x.resize(this->N);
h_y.resize(this->N);
}
HIP_CHECK(hipMemcpyAsync(h_x.data(), d_x, sizeof(double) * N, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipMemcpyAsync(h_y.data(), d_y, sizeof(double) * N, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipMemcpyAsync(h_x.data(), d_x, sizeof(Scalar) * this->N, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipMemcpyAsync(h_y.data(), d_y, sizeof(Scalar) * this->N, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipStreamSynchronize(stream));
// actually apply MultisegmentWells
for (auto& well : multisegments) {
for (auto& well : this->multisegments) {
well->apply(h_x.data(), h_y.data());
}
// copy vector y from CPU to GPU
HIP_CHECK(hipMemcpyAsync(d_y, h_y.data(), sizeof(double) * N, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_y, h_y.data(), sizeof(Scalar) * this->N, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipStreamSynchronize(stream));
}
void WellContributionsRocsparse::apply(double *d_x, double *d_y){
if(num_std_wells > 0){
template<class Scalar>
void WellContributionsRocsparse<Scalar>::
apply(Scalar* d_x, Scalar* d_y)
{
if (this->num_std_wells > 0) {
apply_stdwells(d_x, d_y);
}
if(num_ms_wells > 0){
if (this->num_ms_wells > 0) {
apply_mswells(d_x, d_y);
}
}
void WellContributionsRocsparse::setStream(hipStream_t stream_){
template<class Scalar>
void WellContributionsRocsparse<Scalar>::setStream(hipStream_t stream_)
{
stream = stream_;
}
void WellContributionsRocsparse::APIaddMatrix(MatrixType type,
int* colIndices,
double* values,
unsigned int val_size)
template<class Scalar>
void WellContributionsRocsparse<Scalar>::
APIaddMatrix(MatrixType type,
int* colIndices,
Scalar* values,
unsigned int val_size)
{
if (!allocated) {
if (!this->allocated) {
OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
}
switch (type) {
case MatrixType::C:
HIP_CHECK(hipMemcpyAsync(d_Cnnzs_hip + num_blocks_so_far * dim * dim_wells, values, sizeof(d_Cnnzs_hip) * val_size * dim * dim_wells, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Ccols_hip + num_blocks_so_far, colIndices, sizeof(d_Ccols_hip) * val_size, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Cnnzs_hip + this->num_blocks_so_far * this->dim * this->dim_wells,
values, sizeof(d_Cnnzs_hip) * val_size * this->dim * this->dim_wells,
hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Ccols_hip + this->num_blocks_so_far, colIndices,
sizeof(d_Ccols_hip) * val_size,
hipMemcpyHostToDevice, stream));
break;
case MatrixType::D:
HIP_CHECK(hipMemcpyAsync(d_Dnnzs_hip + num_std_wells_so_far * dim_wells * dim_wells, values, sizeof(d_Dnnzs_hip) * dim_wells * dim_wells, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Dnnzs_hip + this->num_std_wells_so_far * this->dim_wells * this->dim_wells,
values, sizeof(d_Dnnzs_hip) * this->dim_wells * this->dim_wells,
hipMemcpyHostToDevice, stream));
break;
case MatrixType::B:
HIP_CHECK(hipMemcpyAsync(d_Bnnzs_hip + num_blocks_so_far * dim * dim_wells, values, sizeof(d_Bnnzs_hip) * val_size * dim * dim_wells, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Bcols_hip + num_blocks_so_far, colIndices, sizeof(d_Bcols_hip) * val_size, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Bnnzs_hip + this->num_blocks_so_far * this->dim * this->dim_wells,
values, sizeof(d_Bnnzs_hip) * val_size * this->dim * this->dim_wells,
hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Bcols_hip + this->num_blocks_so_far, colIndices,
sizeof(d_Bcols_hip) * val_size,
hipMemcpyHostToDevice, stream));
val_pointers[num_std_wells_so_far] = num_blocks_so_far;
if (num_std_wells_so_far == num_std_wells - 1) {
val_pointers[num_std_wells] = num_blocks;
HIP_CHECK(hipMemcpyAsync(d_val_pointers_hip, val_pointers.data(), sizeof(d_val_pointers_hip) * (num_std_wells + 1), hipMemcpyHostToDevice, stream));
this->val_pointers[this->num_std_wells_so_far] = this->num_blocks_so_far;
if (this->num_std_wells_so_far == this->num_std_wells - 1) {
this->val_pointers[this->num_std_wells] = this->num_blocks;
HIP_CHECK(hipMemcpyAsync(d_val_pointers_hip, this->val_pointers.data(),
sizeof(d_val_pointers_hip) * (this->num_std_wells + 1),
hipMemcpyHostToDevice, stream));
}
break;
@ -226,14 +257,21 @@ void WellContributionsRocsparse::APIaddMatrix(MatrixType type,
HIP_CHECK(hipStreamSynchronize(stream));
}
void WellContributionsRocsparse::APIalloc()
template<class Scalar>
void WellContributionsRocsparse<Scalar>::APIalloc()
{
HIP_CHECK(hipMalloc((void**)&d_Cnnzs_hip, sizeof(d_Cnnzs_hip) * num_blocks * dim * dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Dnnzs_hip, sizeof(d_Dnnzs_hip) * num_std_wells * dim_wells * dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Bnnzs_hip, sizeof(d_Bnnzs_hip) * num_blocks * dim * dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Ccols_hip, sizeof(d_Ccols_hip) * num_blocks));
HIP_CHECK(hipMalloc((void**)&d_Bcols_hip, sizeof(d_Bcols_hip) * num_blocks));
HIP_CHECK(hipMalloc((void**)&d_val_pointers_hip, sizeof(d_val_pointers_hip) * (num_std_wells + 1)));
HIP_CHECK(hipMalloc((void**)&d_Cnnzs_hip,
sizeof(d_Cnnzs_hip) * this->num_blocks * this->dim * this->dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Dnnzs_hip,
sizeof(d_Dnnzs_hip) * this->num_std_wells * this->dim_wells * this->dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Bnnzs_hip,
sizeof(d_Bnnzs_hip) * this->num_blocks * this->dim * this->dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Ccols_hip, sizeof(d_Ccols_hip) * this->num_blocks));
HIP_CHECK(hipMalloc((void**)&d_Bcols_hip, sizeof(d_Bcols_hip) * this->num_blocks));
HIP_CHECK(hipMalloc((void**)&d_val_pointers_hip,
sizeof(d_val_pointers_hip) * (this->num_std_wells + 1)));
}
} //namespace Opm
template class WellContributionsRocsparse<double>;
} // namespace Opm
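A possible follow-up, not part of this commit: the byte counts in APIalloc and APIaddMatrix use sizeof of the device pointers (for example sizeof(d_Cnnzs_hip)), which equals sizeof(double) on 64-bit hosts but would not equal sizeof(float). If a float instantiation is ever added, sizing by the element type would keep allocations and transfers exact; a hypothetical sketch:

// Hypothetical tightening: size buffers by the element type, not the pointer type.
HIP_CHECK(hipMalloc((void**)&d_Cnnzs_hip,
                    sizeof(Scalar) * this->num_blocks * this->dim * this->dim_wells));
HIP_CHECK(hipMalloc((void**)&d_val_pointers_hip,
                    sizeof(unsigned) * (this->num_std_wells + 1)));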

View File

@ -26,33 +26,35 @@
#include <vector>
namespace Opm {
namespace Opm
{
class WellContributionsRocsparse : public WellContributions
template<class Scalar>
class WellContributionsRocsparse : public WellContributions<Scalar>
{
private:
hipStream_t stream;
public:
void apply_stdwells(double *d_x, double *d_y);
void apply_mswells(double *d_x, double *d_y);
void apply(double *d_x, double *d_y);
void apply_stdwells(Scalar* d_x, Scalar* d_y);
void apply_mswells(Scalar* d_x, Scalar* d_y);
void apply(Scalar* d_x, Scalar* d_y);
void setStream(hipStream_t stream);
protected:
/// Allocate memory for the StandardWells
void APIalloc() override;
void APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size) override;
using MatrixType = typename WellContributions<Scalar>::MatrixType;
double *d_Cnnzs_hip, *d_Dnnzs_hip, *d_Bnnzs_hip;
void APIaddMatrix(MatrixType type, int* colIndices,
Scalar* values, unsigned int val_size) override;
Scalar *d_Cnnzs_hip, *d_Dnnzs_hip, *d_Bnnzs_hip;
unsigned *d_Ccols_hip, *d_Bcols_hip;
unsigned *d_val_pointers_hip;
std::vector<double> h_x;
std::vector<double> h_y;
std::vector<Scalar> h_x;
std::vector<Scalar> h_y;
};
} //namespace Opm

View File

@ -90,7 +90,7 @@ struct EnableTerminalOutput {
namespace Opm {
#if COMPILE_BDA_BRIDGE
class WellContributions;
template<class Scalar> class WellContributions;
#endif
/// Class for handling the blackoil well model.
@ -287,7 +287,7 @@ class WellContributions;
#if COMPILE_BDA_BRIDGE
// accumulate the contributions of all Wells in the WellContributions object
void getWellContributions(WellContributions& x) const;
void getWellContributions(WellContributions<Scalar>& x) const;
#endif
// apply well model with scaling of alpha

View File

@ -1568,7 +1568,7 @@ namespace Opm {
template<typename TypeTag>
void
BlackoilWellModel<TypeTag>::
getWellContributions(WellContributions& wellContribs) const
getWellContributions(WellContributions<Scalar>& wellContribs) const
{
// prepare for StandardWells
wellContribs.setBlockSize(StandardWell<TypeTag>::Indices::numEq, StandardWell<TypeTag>::numStaticWellEq);

View File

@ -202,7 +202,7 @@ recoverSolutionWell(const BVector& x, BVectorWell& xw) const
#if COMPILE_BDA_BRIDGE
template<class Scalar, int numWellEq, int numEq>
void MultisegmentWellEquations<Scalar,numWellEq,numEq>::
extract(WellContributions& wellContribs) const
extract(WellContributions<Scalar>& wellContribs) const
{
unsigned int Mb = duneB_.N(); // number of blockrows in duneB_, duneC_ and duneD_
unsigned int BnumBlocks = duneB_.nonzeroes();

View File

@ -39,7 +39,7 @@ namespace Opm
template<class Scalar, int numWellEq, int numEq> class MultisegmentWellEquationAccess;
template<class Scalar> class MultisegmentWellGeneric;
#if COMPILE_BDA_BRIDGE
class WellContributions;
template<class Scalar> class WellContributions;
#endif
template<class Scalar> class WellInterfaceGeneric;
template<class Scalar> class WellState;
@ -105,7 +105,7 @@ public:
#if COMPILE_BDA_BRIDGE
//! \brief Add the matrices of this well to the WellContributions object.
void extract(WellContributions& wellContribs) const;
void extract(WellContributions<Scalar>& wellContribs) const;
#endif
//! \brief Add the matrices of this well to the sparse matrix adapter.

View File

@ -198,7 +198,7 @@ recoverSolutionWell(const BVector& x, BVectorWell& xw) const
template<class Scalar, int numEq>
void StandardWellEquations<Scalar,numEq>::
extract(const int numStaticWellEq,
WellContributions& wellContribs) const
WellContributions<Scalar>& wellContribs) const
{
std::vector<int> colIndices;
std::vector<Scalar> nnzValues;
@ -216,7 +216,7 @@ extract(const int numStaticWellEq,
}
}
}
wellContribs.addMatrix(WellContributions::MatrixType::C,
wellContribs.addMatrix(WellContributions<Scalar>::MatrixType::C,
colIndices.data(), nnzValues.data(), duneC_.nonzeroes());
// invDuneD
@ -229,7 +229,7 @@ extract(const int numStaticWellEq,
nnzValues.emplace_back(invDuneD_[0][0][i][j]);
}
}
wellContribs.addMatrix(WellContributions::MatrixType::D,
wellContribs.addMatrix(WellContributions<Scalar>::MatrixType::D,
colIndices.data(), nnzValues.data(), 1);
// duneB
@ -245,7 +245,7 @@ extract(const int numStaticWellEq,
}
}
}
wellContribs.addMatrix(WellContributions::MatrixType::B,
wellContribs.addMatrix(WellContributions<Scalar>::MatrixType::B,
colIndices.data(), nnzValues.data(), duneB_.nonzeroes());
}
#endif

View File

@ -37,7 +37,7 @@ namespace Opm
template<class Scalar> class ParallelWellInfo;
template<class Scalar, int numEq> class StandardWellEquationAccess;
#if COMPILE_BDA_BRIDGE
class WellContributions;
template<class Scalar> class WellContributions;
#endif
template<class Scalar> class WellInterfaceGeneric;
template<class Scalar> class WellState;
@ -102,7 +102,7 @@ public:
#if COMPILE_BDA_BRIDGE
//! \brief Add the matrices of this well to the WellContributions object.
void extract(const int numStaticWellEq,
WellContributions& wellContribs) const;
WellContributions<Scalar>& wellContribs) const;
#endif
//! \brief Add the matrices of this well to the sparse matrix adapter.

View File

@ -38,7 +38,7 @@ class ConvergenceReport;
class DeferredLogger;
class Schedule;
class SummaryState;
class WellContributions;
template<class Scalar> class WellContributions;
template<class FluidSystem, class Indices> class WellInterfaceIndices;
template<class Scalar> class WellState;

View File

@ -272,7 +272,7 @@ computeBhpAtThpLimitProd(const std::function<std::vector<Scalar>(const Scalar)>&
"find bhp-point where production becomes non-zero for well " + well_.name());
return std::nullopt;
}
const std::array<Scalar, 2> range {controls.bhp_limit, *bhp_max};
const std::array<Scalar, 2> range {static_cast<Scalar>(controls.bhp_limit), *bhp_max};
return this->computeBhpAtThpLimit(frates, fbhp, range, deferred_logger);
}
@ -518,9 +518,9 @@ computeBhpAtThpLimitInjImpl(const std::function<std::vector<Scalar>(const Scalar
// Get the flo samples, add extra samples at low rates and bhp
// limit point if necessary.
std::vector<Scalar> flo_samples = table.getFloAxis();
std::vector<double> flo_samples = table.getFloAxis();
if (flo_samples[0] > 0.0) {
const Scalar f0 = flo_samples[0];
const double f0 = flo_samples[0];
flo_samples.insert(flo_samples.begin(), { f0/20.0, f0/10.0, f0/5.0, f0/2.0 });
}
const Scalar flo_bhp_limit = flo(frates(controls.bhp_limit));
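The flo samples stay double here because the table axis is stored in double; converting them into Scalar up front would be an alternative. A hypothetical sketch, assuming getFloAxis() keeps returning a vector of double (not part of this change):

// Hypothetical alternative: copy the flo axis into the Scalar type once.
const auto& flo_axis = table.getFloAxis();
std::vector<Scalar> flo_samples(flo_axis.begin(), flo_axis.end());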

View File

@ -123,7 +123,7 @@ testCusparseSolver(Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz>& bridge, Matrix<bz
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("cusparse", false);
auto wellContribs = Opm::WellContributions<double>::create("cusparse", false);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory
bridge.solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result);
@ -138,7 +138,7 @@ testCusparseSolverJacobi(Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz>& bridge, Mat
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("cusparse", false);
auto wellContribs = Opm::WellContributions<double>::create("cusparse", false);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory
auto mat3 = matrix; // another deep copy, to make sure Jacobi matrix memory is different

View File

@ -120,7 +120,7 @@ testOpenclSolver(Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz>& bridge, Matrix<bz>&
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("opencl", false);
auto wellContribs = Opm::WellContributions<double>::create("opencl", false);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory
bridge.solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result);
@ -135,7 +135,7 @@ testOpenclSolverJacobi(Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz>& bridge, Matri
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("opencl", false);
auto wellContribs = Opm::WellContributions<double>::create("opencl", false);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory
auto mat3 = matrix; // another deep copy, to make sure Jacobi matrix memory is different

View File

@ -96,7 +96,7 @@ testRocalutionSolver(const boost::property_tree::ptree& prm, Matrix<bz>& matrix,
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create(accelerator_mode, true);
auto wellContribs = Opm::WellContributions<double>::create(accelerator_mode, true);
std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> > bridge;
try {
bridge = std::make_unique<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >(accelerator_mode,

View File

@ -127,7 +127,7 @@ testRocsparseSolver(std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("rocsparse", true);
auto wellContribs = Opm::WellContributions<double>::create("rocsparse", true);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory
bridge->solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result);
@ -142,7 +142,7 @@ testRocsparseSolverJacobi(std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>,
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("rocsparse", true);
auto wellContribs = Opm::WellContributions<double>::create("rocsparse", true);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory
auto mat3 = matrix; // another deep copy, to make sure Jacobi matrix memory is different