diff --git a/opm/simulators/linalg/bda/BdaBridge.cpp b/opm/simulators/linalg/bda/BdaBridge.cpp
index 366e3e486..9d69b23a9 100644
--- a/opm/simulators/linalg/bda/BdaBridge.cpp
+++ b/opm/simulators/linalg/bda/BdaBridge.cpp
@@ -73,7 +73,8 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
     if (accelerator_mode.compare("cusparse") == 0) {
 #if HAVE_CUDA
         use_gpu = true;
-        backend.reset(new Opm::Accelerator::cusparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, deviceID));
+        using CU = Accelerator::cusparseSolverBackend<double,block_size>;
+        backend = std::make_unique<CU>(linear_solver_verbosity, maxit, tolerance, deviceID);
 #else
         OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake");
 #endif
diff --git a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu
index d4d579bcf..bf6f5090a 100644
--- a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu
+++ b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu
@@ -44,23 +44,18 @@
 extern std::shared_ptr<std::thread> copyThread;
 #endif // HAVE_OPENMP
 
-namespace Opm
-{
-namespace Accelerator
-{
+namespace Opm::Accelerator {
 
-using Opm::OpmLog;
 using Dune::Timer;
 
 const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
 const cusparseOperation_t operation  = CUSPARSE_OPERATION_NON_TRANSPOSE;
 const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;
 
-
-template <unsigned int block_size>
-cusparseSolverBackend<block_size>::
+template<class Scalar, unsigned int block_size>
+cusparseSolverBackend<Scalar, block_size>::
 cusparseSolverBackend(int verbosity_, int maxit_,
-                      double tolerance_, unsigned int deviceID_)
+                      Scalar tolerance_, unsigned int deviceID_)
     : Base(verbosity_, maxit_, tolerance_, deviceID_)
 {
     // initialize CUDA device, stream and libraries
@@ -70,7 +65,8 @@ cusparseSolverBackend(int verbosity_, int maxit_,
     cudaGetDeviceProperties(&props, deviceID);
     cudaCheckLastError("Could not get device properties");
     std::ostringstream out;
-    out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor;
+    out << "Name GPU: " << props.name << ", Compute Capability: "
+        << props.major << "." << props.minor;
     OpmLog::info(out.str());
 
     cudaStreamCreate(&stream);
@@ -87,28 +83,29 @@ cusparseSolverBackend(int verbosity_, int maxit_,
     cudaCheckLastError("Could not set stream to cusparse");
 }
 
-template <unsigned int block_size>
-cusparseSolverBackend<block_size>::~cusparseSolverBackend() {
+template<class Scalar, unsigned int block_size>
+cusparseSolverBackend<Scalar,block_size>::~cusparseSolverBackend()
+{
     finalize();
 }
 
-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::
-gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::
+gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res)
 {
     Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
     int n = N;
-    double rho = 1.0, rhop;
-    double alpha, nalpha, beta;
-    double omega, nomega, tmp1, tmp2;
-    double norm, norm_0;
-    double zero = 0.0;
-    double one  = 1.0;
-    double mone = -1.0;
+    Scalar rho = 1.0, rhop;
+    Scalar alpha, nalpha, beta;
+    Scalar omega, nomega, tmp1, tmp2;
+    Scalar norm, norm_0;
+    Scalar zero = 0.0;
+    Scalar one  = 1.0;
+    Scalar mone = -1.0;
     float it;
 
     if (wellContribs.getNumWells() > 0) {
-        static_cast<WellContributionsCuda<double>&>(wellContribs).setCudaStream(stream);
+        static_cast<WellContributionsCuda<Scalar>&>(wellContribs).setCudaStream(stream);
     }
 
     cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
@@ -152,7 +149,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
 
         // apply wellContributions
         if (wellContribs.getNumWells() > 0) {
-            static_cast<WellContributionsCuda<double>&>(wellContribs).apply(d_pw, d_v);
+            static_cast<WellContributionsCuda<Scalar>&>(wellContribs).apply(d_pw, d_v);
         }
 
         cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
@@ -183,7 +180,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
 
         // apply wellContributions
         if (wellContribs.getNumWells() > 0) {
-            static_cast<WellContributionsCuda<double>&>(wellContribs).apply(d_s, d_t);
+            static_cast<WellContributionsCuda<Scalar>&>(wellContribs).apply(d_s, d_t);
         }
 
         cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
@@ -195,7 +192,6 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
 
         cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
 
-
         if (norm < tolerance * norm_0) {
             break;
         }
@@ -215,16 +211,17 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
 
     if (verbosity > 0) {
         std::ostringstream out;
-        out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
-            ", time per iteration: " << res.elapsed / it << ", iterations: " << it;
+        out << "=== converged: " << res.converged << ", conv_rate: "
+            << res.conv_rate << ", time: " << res.elapsed
+            << ", time per iteration: " << res.elapsed / it << ", iterations: " << it;
         OpmLog::info(out.str());
     }
 }
 
-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::
-initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
-           std::shared_ptr<BlockedMatrix<double>> jacMatrix)
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::
+initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+           std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
 {
     this->Nb = matrix->Nb;
     this->N = Nb * block_size;
@@ -239,46 +236,49 @@ initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
     }
 
     std::ostringstream out;
-    out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnz: " << nnzb << " blocks\n";
+    out << "Initializing GPU, matrix size: " << Nb
+        << " blockrows, nnz: " << nnzb << " blocks\n";
     if (useJacMatrix) {
         out << "Blocks in ILU matrix: " << nnzbs_prec << "\n";
     }
-    out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
+    out << "Maxit: " << maxit << std::scientific
+        << ", tolerance: " << tolerance << "\n";
     OpmLog::info(out.str());
 
-    cudaMalloc((void**)&d_x, sizeof(double) * N);
-    cudaMalloc((void**)&d_b, sizeof(double) * N);
-    cudaMalloc((void**)&d_r, sizeof(double) * N);
-    cudaMalloc((void**)&d_rw, sizeof(double) * N);
-    cudaMalloc((void**)&d_p, sizeof(double) * N);
-    cudaMalloc((void**)&d_pw, sizeof(double) * N);
-    cudaMalloc((void**)&d_s, sizeof(double) * N);
-    cudaMalloc((void**)&d_t, sizeof(double) * N);
-    cudaMalloc((void**)&d_v, sizeof(double) * N);
-    cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
+    cudaMalloc((void**)&d_x, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_b, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_r, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_rw, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_p, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_pw, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_s, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_t, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_v, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_bVals, sizeof(Scalar) * nnz);
     cudaMalloc((void**)&d_bCols, sizeof(int) * nnzb);
     cudaMalloc((void**)&d_bRows, sizeof(int) * (Nb + 1));
     if (useJacMatrix) {
-        cudaMalloc((void**)&d_mVals, sizeof(double) * nnzbs_prec * block_size * block_size);
+        cudaMalloc((void**)&d_mVals, sizeof(Scalar) * nnzbs_prec * block_size * block_size);
         cudaMalloc((void**)&d_mCols, sizeof(int) * nnzbs_prec);
         cudaMalloc((void**)&d_mRows, sizeof(int) * (Nb + 1));
     } else {
-        cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
+        cudaMalloc((void**)&d_mVals, sizeof(Scalar) * nnz);
         d_mCols = d_bCols;
         d_mRows = d_bRows;
     }
     cudaCheckLastError("Could not allocate enough memory on GPU");
 
 #if COPY_ROW_BY_ROW
-    cudaMallocHost((void**)&vals_contiguous, sizeof(double) * nnz);
+    cudaMallocHost((void**)&vals_contiguous, sizeof(Scalar) * nnz);
     cudaCheckLastError("Could not allocate pinned memory");
 #endif
 
     initialized = true;
 } // end initialize()
 
-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::finalize() {
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::finalize()
+{
     if (initialized) {
         cudaFree(d_x);
         cudaFree(d_b);
@@ -314,44 +314,54 @@ void cusparseSolverBackend<block_size>::finalize() {
     }
 } // end finalize()
 
-
-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::
-copy_system_to_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
-                   double *b,
-                   std::shared_ptr<BlockedMatrix<double>> jacMatrix)
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::
+copy_system_to_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                   Scalar* b,
+                   std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
 {
     Timer t;
 
-    cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
-    cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
+    cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int),
+                    cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int),
+                    cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_b, b, N * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
+    cudaMemsetAsync(d_x, 0, N * sizeof(Scalar), stream);
 
 #if COPY_ROW_BY_ROW
     int sum = 0;
     for (int i = 0; i < Nb; ++i) {
         int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
-        memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
+        memcpy(vals_contiguous + sum, matrix->nnzValues + sum,
+               size_row * sizeof(Scalar) * block_size * block_size);
         sum += size_row * block_size * block_size;
     }
-    cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bVals, vals_contiguous,
+                    nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
 #else
-    cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bVals, matrix->nnzValues,
+                    nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
     if (useJacMatrix) {
 #if HAVE_OPENMP
 	if(omp_get_max_threads() > 1)
 	   copyThread->join();
 #endif
-        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
+        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
+                        nnzbs_prec * block_size * block_size * sizeof(Scalar),
+                        cudaMemcpyHostToDevice, stream);
     } else {
-        cudaMemcpyAsync(d_mVals, d_bVals, nnz  * sizeof(double), cudaMemcpyDeviceToDevice, stream);
+        cudaMemcpyAsync(d_mVals, d_bVals,
+                        nnz  * sizeof(Scalar),
+                        cudaMemcpyDeviceToDevice, stream);
     }
 #endif
 
     if (useJacMatrix) {
-        cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int), cudaMemcpyHostToDevice, stream);
-        cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
+        cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int),
+                        cudaMemcpyHostToDevice, stream);
+        cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int),
+                        cudaMemcpyHostToDevice, stream);
     }
 
     if (verbosity >= 3) {
@@ -364,37 +374,43 @@ copy_system_to_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
     }
 } // end copy_system_to_gpu()
 
-
 // don't copy rowpointers and colindices, they stay the same
-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::
-update_system_on_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
-                     double *b,
-                     std::shared_ptr<BlockedMatrix<double>> jacMatrix)
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::
+update_system_on_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                     Scalar* b,
+                     std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
 {
     Timer t;
 
-    cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
-    cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
+    cudaMemcpyAsync(d_b, b, N * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
+    cudaMemsetAsync(d_x, 0, sizeof(Scalar) * N, stream);
     
 #if COPY_ROW_BY_ROW
     int sum = 0;
     for (int i = 0; i < Nb; ++i) {
         int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
-        memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
+        memcpy(vals_contiguous + sum, matrix->nnzValues + sum,
+               size_row * sizeof(Scalar) * block_size * block_size);
         sum += size_row * block_size * block_size;
     }
-    cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bVals, vals_contiguous,
+                    nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
 #else
-    cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bVals, matrix->nnzValues,
+                    nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
     if (useJacMatrix) {
 #if HAVE_OPENMP
-	if(omp_get_max_threads() > 1)
-	   copyThread->join();
+        if (omp_get_max_threads() > 1) {
+           copyThread->join();
+        }
 #endif
-        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
+        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, 
+                        nnzbs_prec * block_size * block_size * sizeof(Scalar),
+                        cudaMemcpyHostToDevice, stream);
     } else {
-        cudaMemcpyAsync(d_mVals, d_bVals, nnz  * sizeof(double), cudaMemcpyDeviceToDevice, stream);
+        cudaMemcpyAsync(d_mVals, d_bVals, nnz  * sizeof(Scalar),
+                        cudaMemcpyDeviceToDevice, stream);
     }
 #endif
 
@@ -409,10 +425,9 @@ update_system_on_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
     }
 } // end update_system_on_gpu()
 
-
-template <unsigned int block_size>
-bool cusparseSolverBackend<block_size>::analyse_matrix() {
-
+template<class Scalar, unsigned int block_size>
+bool cusparseSolverBackend<Scalar,block_size>::analyse_matrix()
+{
     int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
     Timer t;
 
@@ -487,8 +502,9 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
     return true;
 } // end analyse_matrix()
 
-template <unsigned int block_size>
-bool cusparseSolverBackend<block_size>::create_preconditioner() {
+template<class Scalar, unsigned int block_size>
+bool cusparseSolverBackend<Scalar,block_size>::create_preconditioner()
+{
     Timer t;
 
     cusparseDbsrilu02(cusparseHandle, order, \
@@ -512,10 +528,9 @@ bool cusparseSolverBackend<block_size>::create_preconditioner() {
     return true;
 } // end create_preconditioner()
 
-
-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::
-solve_system(WellContributions<double>& wellContribs, BdaResult& res)
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::
+solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res)
 {
     // actually solve
     gpu_pbicgstab(wellContribs, res);
@@ -523,14 +538,14 @@ solve_system(WellContributions<double>& wellContribs, BdaResult& res)
     cudaCheckLastError("Something went wrong during the GPU solve");
 } // end solve_system()
 
-
 // copy result to host memory
 // caller must be sure that x is a valid array
-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::get_result(double *x) {
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::get_result(Scalar* x)
+{
     Timer t;
 
-    cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream);
+    cudaMemcpyAsync(x, d_x, N * sizeof(Scalar), cudaMemcpyDeviceToHost, stream);
     cudaStreamSynchronize(stream);
 
     if (verbosity > 2) {
@@ -540,12 +555,12 @@ void cusparseSolverBackend<block_size>::get_result(double *x) {
     }
 } // end get_result()
 
-template <unsigned int block_size>
-SolverStatus cusparseSolverBackend<block_size>::
-solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
-             double *b,
-             std::shared_ptr<BlockedMatrix<double>> jacMatrix,
-             WellContributions<double>& wellContribs,
+template<class Scalar, unsigned int block_size>
+SolverStatus cusparseSolverBackend<Scalar,block_size>::
+solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+             Scalar* b,
+             std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
+             WellContributions<Scalar>& wellContribs,
              BdaResult& res)
 {
     if (initialized == false) {
@@ -567,18 +582,14 @@ solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
     return SolverStatus::BDA_SOLVER_SUCCESS;
 }
 
+#define INSTANTIATE_TYPE(T)                    \
+    template class cusparseSolverBackend<T,1>; \
+    template class cusparseSolverBackend<T,2>; \
+    template class cusparseSolverBackend<T,3>; \
+    template class cusparseSolverBackend<T,4>; \
+    template class cusparseSolverBackend<T,5>; \
+    template class cusparseSolverBackend<T,6>;
 
-#define INSTANTIATE_BDA_FUNCTIONS(n)                                                       \
-template cusparseSolverBackend<n>::cusparseSolverBackend(int, int, double, unsigned int);  \
+INSTANTIATE_TYPE(double)
 
-INSTANTIATE_BDA_FUNCTIONS(1);
-INSTANTIATE_BDA_FUNCTIONS(2);
-INSTANTIATE_BDA_FUNCTIONS(3);
-INSTANTIATE_BDA_FUNCTIONS(4);
-INSTANTIATE_BDA_FUNCTIONS(5);
-INSTANTIATE_BDA_FUNCTIONS(6);
-
-#undef INSTANTIATE_BDA_FUNCTIONS
-
-} // namespace Accelerator
-} // namespace Opm
+} // namespace Opm::Accelerator
diff --git a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp
index 748955192..48c9c9343 100644
--- a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp
@@ -28,16 +28,13 @@
 #include <opm/simulators/linalg/bda/BdaSolver.hpp>
 #include <opm/simulators/linalg/bda/WellContributions.hpp>
 
-namespace Opm
-{
-namespace Accelerator
-{
+namespace Opm::Accelerator {
 
 /// This class implements a cusparse-based ilu0-bicgstab solver on GPU
-template <unsigned int block_size>
-class cusparseSolverBackend : public BdaSolver<double,block_size> {
-
-    using Base = BdaSolver<double,block_size>;
+template<class Scalar, unsigned int block_size>
+class cusparseSolverBackend : public BdaSolver<Scalar,block_size>
+{
+    using Base = BdaSolver<Scalar,block_size>;
 
     using Base::N;
     using Base::Nb;
@@ -57,13 +54,13 @@ private:
     bsrilu02Info_t info_M;
     bsrsv2Info_t info_L, info_U;
     // b: bsr matrix, m: preconditioner
-    double *d_bVals, *d_mVals;
+    Scalar *d_bVals, *d_mVals;
     int *d_bCols, *d_mCols;
     int *d_bRows, *d_mRows;
-    double *d_x, *d_b, *d_r, *d_rw, *d_p;     // vectors, used during linear solve
-    double *d_pw, *d_s, *d_t, *d_v;
+    Scalar *d_x, *d_b, *d_r, *d_rw, *d_p;     // vectors, used during linear solve
+    Scalar *d_pw, *d_s, *d_t, *d_v;
     void *d_buffer;
-    double *vals_contiguous;                  // only used if COPY_ROW_BY_ROW is true in cusparseSolverBackend.cpp
+    Scalar *vals_contiguous;                  // only used if COPY_ROW_BY_ROW is true in cusparseSolverBackend.cpp
 
     bool analysis_done = false;
 
@@ -76,13 +73,13 @@ private:
     /// Solve linear system using ilu0-bicgstab
     /// \param[in] wellContribs   contains all WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res         summary of solver result
-    void gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res);
+    void gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res);
 
     /// Initialize GPU and allocate memory
     /// \param[in] matrix         matrix for spmv
     /// \param[in] jacMatrix      matrix for preconditioner
-    void initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
-                    std::shared_ptr<BlockedMatrix<double>> jacMatrix);
+    void initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                    std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);
 
     /// Clean memory
     void finalize();
@@ -92,18 +89,18 @@ private:
     /// \param[in] matrix         matrix for spmv
     /// \param[in] b              input vector, contains N values
     /// \param[in] jacMatrix      matrix for preconditioner
-    void copy_system_to_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
-                            double *b,
-                            std::shared_ptr<BlockedMatrix<double>> jacMatrix);
+    void copy_system_to_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                            Scalar* b,
+                            std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);
 
     /// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
     /// also copy matrix for preconditioner if needed
     /// \param[in] matrix         matrix for spmv
     /// \param[in] b              input vector, contains N values
     /// \param[in] jacMatrix      matrix for preconditioner
-    void update_system_on_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
-                              double *b,
-                              std::shared_ptr<BlockedMatrix<double>> jacMatrix);
+    void update_system_on_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                              Scalar* b,
+                              std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);
 
     /// Analyse sparsity pattern to extract parallelism
     /// \return true iff analysis was successful
@@ -116,17 +113,16 @@ private:
     /// Solve linear system
     /// \param[in] wellContribs   contains all WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res         summary of solver result
-    void solve_system(WellContributions<double>& wellContribs, BdaResult &res);
+    void solve_system(WellContributions<Scalar>& wellContribs, BdaResult &res);
 
 public:
-
-
     /// Construct a cusparseSolver
     /// \param[in] linear_solver_verbosity    verbosity of cusparseSolver
     /// \param[in] maxit                      maximum number of iterations for cusparseSolver
     /// \param[in] tolerance                  required relative tolerance for cusparseSolver
     /// \param[in] deviceID                   the device to be used
-    cusparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int deviceID);
+    cusparseSolverBackend(int linear_solver_verbosity, int maxit,
+                          Scalar tolerance, unsigned int deviceID);
 
     /// Destroy a cusparseSolver, and free memory
     ~cusparseSolverBackend();
@@ -138,20 +134,19 @@ public:
     /// \param[in] wellContribs   contains all WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res         summary of solver result
     /// \return                   status code
-    SolverStatus solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
-                              double *b,
-                              std::shared_ptr<BlockedMatrix<double>> jacMatrix,
-                              WellContributions<double>& wellContribs,
+    SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                              Scalar* b,
+                              std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
+                              WellContributions<Scalar>& wellContribs,
                               BdaResult& res) override;
     
     /// Get resulting vector x after linear solve, also includes post processing if necessary
     /// \param[inout] x        resulting x vector, caller must guarantee that x points to a valid array
-    void get_result(double *x) override;
+    void get_result(Scalar* x) override;
 
 }; // end class cusparseSolverBackend
 
-} // namespace Accelerator
-} // namespace Opm
+} // namespace Opm::Accelerator
 
 #endif