From 3eed028978b2962c98245697ecaee6dea2728086 Mon Sep 17 00:00:00 2001
From: Arne Morten Kvarving <arne.morten.kvarving@sintef.no>
Date: Tue, 16 Apr 2024 19:11:04 +0200
Subject: [PATCH] rocsparseSolverBackend: template Scalar type

---
 opm/simulators/linalg/bda/BdaBridge.cpp       |   4 +-
 .../linalg/bda/rocsparseSolverBackend.cpp     | 242 ++++++++++--------
 .../linalg/bda/rocsparseSolverBackend.hpp     |  63 +++--
 3 files changed, 165 insertions(+), 144 deletions(-)
diff --git a/opm/simulators/linalg/bda/BdaBridge.cpp b/opm/simulators/linalg/bda/BdaBridge.cpp
index 5492e504e..ae24fc807 100644
--- a/opm/simulators/linalg/bda/BdaBridge.cpp
+++ b/opm/simulators/linalg/bda/BdaBridge.cpp
@@ -112,7 +112,9 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
     } else if (accelerator_mode.compare("rocsparse") == 0) {
 #if HAVE_ROCSPARSE
         use_gpu = true; // should be replaced by a 'use_bridge' boolean
-        backend.reset(new Opm::Accelerator::rocsparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID));
+        using ROCS = Accelerator::rocsparseSolverBackend<double,block_size>;
+        backend = std::make_unique<ROCS>(linear_solver_verbosity, maxit,
+                                         tolerance, platformID, deviceID);
 #else
         OPM_THROW(std::logic_error, "Error rocsparseSolver was chosen, but rocsparse/rocblas was not found by CMake");
 #endif
diff --git a/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp b/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp
index 3c688a68b..9860f5728 100644
--- a/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp
@@ -93,24 +93,20 @@
 extern std::shared_ptr<std::thread> copyThread;
 #endif //HAVE_OPENMP
 
-namespace Opm
-{
-namespace Accelerator
-{
+namespace Opm::Accelerator {
 
-using Opm::OpmLog;
 using Dune::Timer;
 
-template <unsigned int block_size>
-rocsparseSolverBackend<block_size>::
-rocsparseSolverBackend(int verbosity_, int maxit_, double tolerance_,
+template<class Scalar, unsigned int block_size>
+rocsparseSolverBackend<Scalar,block_size>::
+rocsparseSolverBackend(int verbosity_, int maxit_, Scalar tolerance_,
                        unsigned int platformID_, unsigned int deviceID_)
     : Base(verbosity_, maxit_, tolerance_, platformID_, deviceID_)
 {
     int numDevices = 0;
     HIP_CHECK(hipGetDeviceCount(&numDevices));
     if (static_cast<int>(deviceID) >= numDevices) {
-        OPM_THROW(std::runtime_error, "Error chosen too high HIP device ID");
+        OPM_THROW(std::runtime_error, "Invalid HIP device ID");
     }
     HIP_CHECK(hipSetDevice(deviceID));
 
@@ -130,46 +126,45 @@ rocsparseSolverBackend(int verbosity_, int maxit_, double tolerance_,
     ROCBLAS_CHECK(rocblas_set_stream(blas_handle, stream));
 }
 
-
-template <unsigned int block_size>
-rocsparseSolverBackend<block_size>::~rocsparseSolverBackend() {
+template<class Scalar, unsigned int block_size>
+rocsparseSolverBackend<Scalar,block_size>::~rocsparseSolverBackend()
+{
     hipError_t hipstatus = hipStreamSynchronize(stream);
-    if(hipstatus != hipSuccess){
+    if (hipstatus != hipSuccess) {
         OpmLog::error("Could not synchronize with hipStream");
     }
     hipstatus = hipStreamDestroy(stream);
-    if(hipstatus != hipSuccess){
+    if (hipstatus != hipSuccess) {
         OpmLog::error("Could not destroy hipStream");
     }
     rocsparse_status status1 = rocsparse_destroy_handle(handle);
-    if(status1 != rocsparse_status_success){
+    if (status1 != rocsparse_status_success) {
         OpmLog::error("Could not destroy rocsparse handle");
     }
     rocblas_status status2 = rocblas_destroy_handle(blas_handle);
-    if(status2 != rocblas_status_success){
+    if (status2 != rocblas_status_success) {
         OpmLog::error("Could not destroy rocblas handle");
     }
 }
 
-
-template <unsigned int block_size>
-void rocsparseSolverBackend<block_size>::
-gpu_pbicgstab([[maybe_unused]] WellContributions<double>& wellContribs,
+template<class Scalar, unsigned int block_size>
+void rocsparseSolverBackend<Scalar,block_size>::
+gpu_pbicgstab([[maybe_unused]] WellContributions<Scalar>& wellContribs,
               BdaResult& res)
 {
     float it = 0.5;
-    double rho, rhop, beta, alpha, nalpha, omega, nomega, tmp1, tmp2;
-    double norm, norm_0;
-    double zero = 0.0;
-    double one  = 1.0;
-    double mone = -1.0;
+    Scalar rho, rhop, beta, alpha, nalpha, omega, nomega, tmp1, tmp2;
+    Scalar norm, norm_0;
+    Scalar zero = 0.0;
+    Scalar one  = 1.0;
+    Scalar mone = -1.0;
 
     Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
 
     // set stream here, the WellContributions object is destroyed every linear solve
     // the number of wells can change every linear solve
     if (wellContribs.getNumWells() > 0) {
-        static_cast<WellContributionsRocsparse<double>&>(wellContribs).setStream(stream);
+        static_cast<WellContributionsRocsparse<Scalar>&>(wellContribs).setStream(stream);
     }
 
 // HIP_VERSION is defined as (HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH)
@@ -259,7 +254,7 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<double>& wellContribs,
 
         // apply wellContributions
         if (wellContribs.getNumWells() > 0) {
-            static_cast<WellContributionsRocsparse<double>&>(wellContribs).apply(d_pw, d_v);
+            static_cast<WellContributionsRocsparse<Scalar>&>(wellContribs).apply(d_pw, d_v);
         }
         if (verbosity >= 3) {
             HIP_CHECK(hipStreamSynchronize(stream));
@@ -325,7 +320,7 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<double>& wellContribs,
 
         // apply wellContributions
         if (wellContribs.getNumWells() > 0) {
-            static_cast<WellContributionsRocsparse<double>&>(wellContribs).apply(d_s, d_t);
+            static_cast<WellContributionsRocsparse<Scalar>&>(wellContribs).apply(d_s, d_t);
         }
         if (verbosity >= 3) {
             HIP_CHECK(hipStreamSynchronize(stream));
@@ -365,8 +360,11 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<double>& wellContribs,
 
     if (verbosity >= 1) {
         std::ostringstream out;
-        out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
-            ", time per iteration: " << res.elapsed / it << ", iterations: " << it;
+        out << "=== converged: " << res.converged
+            << ", conv_rate: " << res.conv_rate
+            << ", time: " << res.elapsed << \
+            ", time per iteration: " << res.elapsed / it
+            << ", iterations: " << it;
         OpmLog::info(out.str());
     }
     if (verbosity >= 3) {
@@ -380,10 +378,10 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<double>& wellContribs,
     }
 }
 
-template <unsigned int block_size>
-void rocsparseSolverBackend<block_size>::
-initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
-           std::shared_ptr<BlockedMatrix<double>> jacMatrix)
+template<class Scalar, unsigned int block_size>
+void rocsparseSolverBackend<Scalar,block_size>::
+initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+           std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
 {
     this->Nb = matrix->Nb;
     this->N = Nb * block_size;
@@ -397,12 +395,14 @@ initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
     }
 
     std::ostringstream out;
-    out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnzb: " << nnzb << "\n";
+    out << "Initializing GPU, matrix size: "
+        << Nb << " blockrows, nnzb: " << nnzb << "\n";
     if (useJacMatrix) {
         out << "Blocks in ILU matrix: " << jacMatrix->nnzbs << "\n";
     }
-    out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
-    out << "PlatformID: " << platformID << ", deviceID: " << deviceID << "\n";
+    out << "Maxit: " << maxit
+        << std::scientific << ", tolerance: " << tolerance << "\n"
+        << "PlatformID: " << platformID << ", deviceID: " << deviceID << "\n";
     OpmLog::info(out.str());
     out.str("");
     out.clear();
@@ -410,26 +410,26 @@ initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
     mat = matrix;
     jacMat = jacMatrix;
 
-    HIP_CHECK(hipMalloc((void**)&d_r, sizeof(double) * N));
-    HIP_CHECK(hipMalloc((void**)&d_rw, sizeof(double) * N));
-    HIP_CHECK(hipMalloc((void**)&d_p, sizeof(double) * N));
-    HIP_CHECK(hipMalloc((void**)&d_pw, sizeof(double) * N));
-    HIP_CHECK(hipMalloc((void**)&d_s, sizeof(double) * N));
-    HIP_CHECK(hipMalloc((void**)&d_t, sizeof(double) * N));
-    HIP_CHECK(hipMalloc((void**)&d_v, sizeof(double) * N));
+    HIP_CHECK(hipMalloc((void**)&d_r, sizeof(Scalar) * N));
+    HIP_CHECK(hipMalloc((void**)&d_rw, sizeof(Scalar) * N));
+    HIP_CHECK(hipMalloc((void**)&d_p, sizeof(Scalar) * N));
+    HIP_CHECK(hipMalloc((void**)&d_pw, sizeof(Scalar) * N));
+    HIP_CHECK(hipMalloc((void**)&d_s, sizeof(Scalar) * N));
+    HIP_CHECK(hipMalloc((void**)&d_t, sizeof(Scalar) * N));
+    HIP_CHECK(hipMalloc((void**)&d_v, sizeof(Scalar) * N));
 
     HIP_CHECK(hipMalloc((void**)&d_Arows, sizeof(rocsparse_int) * (Nb + 1)));
     HIP_CHECK(hipMalloc((void**)&d_Acols, sizeof(rocsparse_int) * nnzb));
-    HIP_CHECK(hipMalloc((void**)&d_Avals, sizeof(double) * nnz));
-    HIP_CHECK(hipMalloc((void**)&d_x, sizeof(double) * N));
-    HIP_CHECK(hipMalloc((void**)&d_b, sizeof(double) * N));
+    HIP_CHECK(hipMalloc((void**)&d_Avals, sizeof(Scalar) * nnz));
+    HIP_CHECK(hipMalloc((void**)&d_x, sizeof(Scalar) * N));
+    HIP_CHECK(hipMalloc((void**)&d_b, sizeof(Scalar) * N));
 
     if (useJacMatrix) {
         HIP_CHECK(hipMalloc((void**)&d_Mrows, sizeof(rocsparse_int) * (Nb + 1)));
         HIP_CHECK(hipMalloc((void**)&d_Mcols, sizeof(rocsparse_int) * nnzbs_prec));
-        HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(double) * nnzbs_prec * block_size * block_size));
+        HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(Scalar) * nnzbs_prec * block_size * block_size));
     } else { // preconditioner matrix is same
-        HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(double) * nnzbs_prec * block_size * block_size));
+        HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(Scalar) * nnzbs_prec * block_size * block_size));
         d_Mcols = d_Acols;
         d_Mrows = d_Arows;
     }
@@ -437,26 +437,43 @@ initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
     initialized = true;
 } // end initialize()
 
-template <unsigned int block_size>
-void rocsparseSolverBackend<block_size>::copy_system_to_gpu(double *b) {
+template<class Scalar, unsigned int block_size>
+void rocsparseSolverBackend<Scalar,block_size>::
+copy_system_to_gpu(Scalar *b)
+{
     Timer t;
 
-    HIP_CHECK(hipMemcpyAsync(d_Arows, mat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, stream));
-    HIP_CHECK(hipMemcpyAsync(d_Acols, mat->colIndices, sizeof(rocsparse_int) * nnzb, hipMemcpyHostToDevice, stream));
-    HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues, sizeof(double) * nnz, hipMemcpyHostToDevice, stream));
-    HIP_CHECK(hipMemsetAsync(d_x, 0, sizeof(double) * N, stream));
-    HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream));
+    HIP_CHECK(hipMemcpyAsync(d_Arows, mat->rowPointers,
+                             sizeof(rocsparse_int) * (Nb + 1), 
+                             hipMemcpyHostToDevice, stream));
+    HIP_CHECK(hipMemcpyAsync(d_Acols, mat->colIndices, 
+                             sizeof(rocsparse_int) * nnzb,
+                             hipMemcpyHostToDevice, stream));
+    HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues,
+                             sizeof(Scalar) * nnz,
+                             hipMemcpyHostToDevice, stream));
+    HIP_CHECK(hipMemsetAsync(d_x, 0, N * sizeof(Scalar), stream));
+    HIP_CHECK(hipMemcpyAsync(d_b, b, N * sizeof(Scalar) * N,
+                             hipMemcpyHostToDevice, stream));
     
     if (useJacMatrix) {
 #if HAVE_OPENMP
-	if(omp_get_max_threads() > 1)
-	   copyThread->join();
+        if (omp_get_max_threads() > 1) {
+           copyThread->join();
+        }
 #endif
-        HIP_CHECK(hipMemcpyAsync(d_Mrows, jacMat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, stream));
-        HIP_CHECK(hipMemcpyAsync(d_Mcols, jacMat->colIndices, sizeof(rocsparse_int) * nnzbs_prec, hipMemcpyHostToDevice, stream));
-        HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues, sizeof(double) * nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, stream));
+        HIP_CHECK(hipMemcpyAsync(d_Mrows, jacMat->rowPointers,
+                                 sizeof(rocsparse_int) * (Nb + 1),
+                                 hipMemcpyHostToDevice, stream));
+        HIP_CHECK(hipMemcpyAsync(d_Mcols, jacMat->colIndices,
+                                 sizeof(rocsparse_int) * nnzbs_prec,
+                                 hipMemcpyHostToDevice, stream));
+        HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues,
+                                 sizeof(Scalar) * nnzbs_prec * block_size * block_size,
+                                 hipMemcpyHostToDevice, stream));
     } else {
-        HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(double) * nnz, hipMemcpyDeviceToDevice, stream));
+        HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals,
+                                 sizeof(Scalar) * nnz, hipMemcpyDeviceToDevice, stream));
     }
 
     if (verbosity >= 3) {
@@ -466,29 +483,36 @@ void rocsparseSolverBackend<block_size>::copy_system_to_gpu(double *b) {
         std::ostringstream out;
         out << "-----rocsparseSolver::copy_system_to_gpu(): " << t.elapsed() << " s\n";
         out << "---rocsparseSolver::cum copy: " << c_copy << " s";
-	OpmLog::info(out.str());
+        OpmLog::info(out.str());
     }
 } // end copy_system_to_gpu()
 
 // don't copy rowpointers and colindices, they stay the same
-template <unsigned int block_size>
-void rocsparseSolverBackend<block_size>::update_system_on_gpu(double *b) {
+template<class Scalar, unsigned int block_size>
+void rocsparseSolverBackend<Scalar,block_size>::
+update_system_on_gpu(Scalar* b)
+{
     Timer t;
 
-    HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues, sizeof(double) * nnz, hipMemcpyHostToDevice, stream));
-    HIP_CHECK(hipMemsetAsync(d_x, 0, sizeof(double) * N, stream));
-    HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream));
+    HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues, sizeof(Scalar) * nnz,
+                             hipMemcpyHostToDevice, stream));
+    HIP_CHECK(hipMemsetAsync(d_x, 0, N * sizeof(Scalar), stream));
+    HIP_CHECK(hipMemcpyAsync(d_b, b, N* sizeof(Scalar),
+                             hipMemcpyHostToDevice, stream));
     
     if (useJacMatrix) {
 #if HAVE_OPENMP
-	if (omp_get_max_threads() > 1)
-	    copyThread->join();
+        if (omp_get_max_threads() > 1) {
+            copyThread->join();
+        }
 #endif
-        HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues, sizeof(double) * nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, stream));
+        HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues,
+                                 sizeof(Scalar) * nnzbs_prec * block_size * block_size,
+                                 hipMemcpyHostToDevice, stream));
     } else {
-        HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(double) * nnz, hipMemcpyDeviceToDevice, stream));
+        HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals,
+                                 sizeof(Scalar) * nnz, hipMemcpyDeviceToDevice, stream));
     }
-    
     if (verbosity >= 3) {
         HIP_CHECK(hipStreamSynchronize(stream));
 
@@ -500,8 +524,10 @@ void rocsparseSolverBackend<block_size>::update_system_on_gpu(double *b) {
     }
 } // end update_system_on_gpu()
 
-template <unsigned int block_size>
-bool rocsparseSolverBackend<block_size>::analyze_matrix() {
+template<class Scalar, unsigned int block_size>
+bool rocsparseSolverBackend<Scalar,block_size>::
+analyze_matrix()
+{
     std::size_t d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
     Timer t;
 
@@ -530,7 +556,8 @@ bool rocsparseSolverBackend<block_size>::analyze_matrix() {
     ROCSPARSE_CHECK(rocsparse_dbsrsv_buffer_size(handle, dir, operation, Nb, nnzbs_prec,
                                descr_U, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_U));
 
-    d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
+    d_bufferSize = std::max(d_bufferSize_M,
+                            std::max(d_bufferSize_L, d_bufferSize_U));
 
     HIP_CHECK(hipMalloc((void**)&d_buffer, d_bufferSize));
 
@@ -578,9 +605,10 @@ bool rocsparseSolverBackend<block_size>::analyze_matrix() {
     return true;
 } // end analyze_matrix()
 
-
-template <unsigned int block_size>
-bool rocsparseSolverBackend<block_size>::create_preconditioner() {
+template<class Scalar, unsigned int block_size>
+bool rocsparseSolverBackend<Scalar,block_size>::
+create_preconditioner()
+{
     Timer t;
 
     bool result = true;
@@ -605,10 +633,9 @@ bool rocsparseSolverBackend<block_size>::create_preconditioner() {
     return result;
 } // end create_preconditioner()
 
-
-template <unsigned int block_size>
-void rocsparseSolverBackend<block_size>::
-solve_system(WellContributions<double>& wellContribs, BdaResult& res)
+template<class Scalar, unsigned int block_size>
+void rocsparseSolverBackend<Scalar,block_size>::
+solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res)
 {
     Timer t;
 
@@ -621,17 +648,18 @@ solve_system(WellContributions<double>& wellContribs, BdaResult& res)
         out << "rocsparseSolver::solve_system(): " << t.stop() << " s";
         OpmLog::info(out.str());
     }
-
 } // end solve_system()
 
-
 // copy result to host memory
 // caller must be sure that x is a valid array
-template <unsigned int block_size>
-void rocsparseSolverBackend<block_size>::get_result(double *x) {
+template<class Scalar, unsigned int block_size>
+void rocsparseSolverBackend<Scalar,block_size>::
+get_result(Scalar* x)
+{
     Timer t;
 
-    HIP_CHECK(hipMemcpyAsync(x, d_x, sizeof(double) * N, hipMemcpyDeviceToHost, stream));
+    HIP_CHECK(hipMemcpyAsync(x, d_x, sizeof(Scalar) * N,
+                             hipMemcpyDeviceToHost, stream));
     HIP_CHECK(hipStreamSynchronize(stream)); // always wait, caller might want to use x immediately
 
     if (verbosity >= 3) {
@@ -641,14 +669,13 @@ void rocsparseSolverBackend<block_size>::get_result(double *x) {
     }
 } // end get_result()
 
-
-template <unsigned int block_size>
-SolverStatus rocsparseSolverBackend<block_size>::
-    solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
-                 double *b,
-                 std::shared_ptr<BlockedMatrix<double>> jacMatrix,
-                 WellContributions<double>& wellContribs,
-                 BdaResult& res)
+template<class Scalar, unsigned int block_size>
+SolverStatus rocsparseSolverBackend<Scalar,block_size>::
+solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+             Scalar* b,
+             std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
+             WellContributions<Scalar>& wellContribs,
+             BdaResult& res)
 {
     if (initialized == false) {
         initialize(matrix, jacMatrix);
@@ -672,19 +699,14 @@ SolverStatus rocsparseSolverBackend<block_size>::
     return SolverStatus::BDA_SOLVER_SUCCESS;
 }
 
+#define INSTANTIATE_TYPE(T)                     \
+    template class rocsparseSolverBackend<T,1>; \
+    template class rocsparseSolverBackend<T,2>; \
+    template class rocsparseSolverBackend<T,3>; \
+    template class rocsparseSolverBackend<T,4>; \
+    template class rocsparseSolverBackend<T,5>; \
+    template class rocsparseSolverBackend<T,6>;
 
-#define INSTANTIATE_BDA_FUNCTIONS(n)                                        \
-template rocsparseSolverBackend<n>::rocsparseSolverBackend(                 \
-    int, int, double, unsigned int, unsigned int);
+INSTANTIATE_TYPE(double)
 
-INSTANTIATE_BDA_FUNCTIONS(1);
-INSTANTIATE_BDA_FUNCTIONS(2);
-INSTANTIATE_BDA_FUNCTIONS(3);
-INSTANTIATE_BDA_FUNCTIONS(4);
-INSTANTIATE_BDA_FUNCTIONS(5);
-INSTANTIATE_BDA_FUNCTIONS(6);
-
-#undef INSTANTIATE_BDA_FUNCTIONS
-
-} // namespace Accelerator
-} // namespace Opm
+} // namespace Opm::Accelerator
diff --git a/opm/simulators/linalg/bda/rocsparseSolverBackend.hpp b/opm/simulators/linalg/bda/rocsparseSolverBackend.hpp
index b3e41ed44..e2d179b9e 100644
--- a/opm/simulators/linalg/bda/rocsparseSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/rocsparseSolverBackend.hpp
@@ -31,16 +31,13 @@
 
 #include <hip/hip_version.h>
 
-namespace Opm
-{
-namespace Accelerator
-{
+namespace Opm::Accelerator {
 
 /// This class implements a rocsparse-based ilu0-bicgstab solver on GPU
-template <unsigned int block_size>
-class rocsparseSolverBackend : public BdaSolver<double,block_size>
+template<class Scalar, unsigned int block_size>
+class rocsparseSolverBackend : public BdaSolver<Scalar,block_size>
 {
-    using Base = BdaSolver<double,block_size>;
+    using Base = BdaSolver<Scalar,block_size>;
 
     using Base::N;
     using Base::Nb;
@@ -59,8 +56,8 @@ private:
     bool useJacMatrix = false;
 
     bool analysis_done = false;
-    std::shared_ptr<BlockedMatrix<double>> mat{};                 // original matrix
-    std::shared_ptr<BlockedMatrix<double>> jacMat{};              // matrix for preconditioner
+    std::shared_ptr<BlockedMatrix<Scalar>> mat{};                 // original matrix
+    std::shared_ptr<BlockedMatrix<Scalar>> jacMat{};              // matrix for preconditioner
     int nnzbs_prec = 0;    // number of nnz blocks in preconditioner matrix M
 
     rocsparse_direction dir = rocsparse_direction_row;
@@ -76,32 +73,31 @@ private:
 
     rocsparse_int *d_Arows, *d_Mrows;
     rocsparse_int *d_Acols, *d_Mcols;
-    double *d_Avals, *d_Mvals;
-    double *d_x, *d_b, *d_r, *d_rw, *d_p;     // vectors, used during linear solve
-    double *d_pw, *d_s, *d_t, *d_v;
+    Scalar *d_Avals, *d_Mvals;
+    Scalar *d_x, *d_b, *d_r, *d_rw, *d_p;     // vectors, used during linear solve
+    Scalar *d_pw, *d_s, *d_t, *d_v;
     void *d_buffer; // buffer space, used by rocsparse ilu0 analysis
     int  ver;
     char rev[64];
 
-
     /// Solve linear system using ilu0-bicgstab
     /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res         summary of solver result
-    void gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res);
+    void gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res);
 
     /// Initialize GPU and allocate memory
     /// \param[in] matrix     matrix A
     /// \param[in] jacMatrix  matrix for preconditioner
-    void initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
-                    std::shared_ptr<BlockedMatrix<double>> jacMatrix);
+    void initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                    std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);
 
     /// Copy linear system to GPU
     /// \param[in] b              input vector, contains N values
-    void copy_system_to_gpu(double *b);
+    void copy_system_to_gpu(Scalar* b);
 
     /// Update linear system to GPU
     /// \param[in] b              input vector, contains N values
-    void update_system_on_gpu(double *b);
+    void update_system_on_gpu(Scalar* b);
 
     /// Analyze sparsity pattern to extract parallelism
     /// \return true iff analysis was successful
@@ -114,16 +110,20 @@ private:
     /// Solve linear system
     /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res         summary of solver result
-    void solve_system(WellContributions<double>& wellContribs, BdaResult& res);
+    void solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res);
 
 public:
-    /// Construct a openclSolver
-    /// \param[in] linear_solver_verbosity    verbosity of openclSolver
-    /// \param[in] maxit                      maximum number of iterations for openclSolver
-    /// \param[in] tolerance                  required relative tolerance for openclSolver
+    /// Construct a rocsparseSolver
+    /// \param[in] linear_solver_verbosity    verbosity of rocsparseSolver
+    /// \param[in] maxit                      maximum number of iterations for rocsparseSolver
+    /// \param[in] tolerance                  required relative tolerance for rocsparseSolver
     /// \param[in] platformID                 the OpenCL platform to be used
     /// \param[in] deviceID                   the device to be used
-    rocsparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID);
+    rocsparseSolverBackend(int linear_solver_verbosity,
+                           int maxit,
+                           Scalar tolerance,
+                           unsigned int platformID,
+                           unsigned int deviceID);
 
     /// For the CPR coarse solver
     // rocsparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder);
@@ -138,10 +138,10 @@ public:
     /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res         summary of solver result
     /// \return                   status code
-    SolverStatus solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
-                              double* b,
-                              std::shared_ptr<BlockedMatrix<double>> jacMatrix,
-                              WellContributions<double>& wellContribs,
+    SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                              Scalar* b,
+                              std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
+                              WellContributions<Scalar>& wellContribs,
                               BdaResult& res) override;
 
     /// Solve scalar linear system, for example a coarse system of an AMG preconditioner
@@ -150,13 +150,10 @@ public:
 
     /// Get result after linear solve, and peform postprocessing if necessary
     /// \param[inout] x          resulting x vector, caller must guarantee that x points to a valid array
-    void get_result(double *x) override;
+    void get_result(Scalar* x) override;
 
 }; // end class rocsparseSolverBackend
 
-} // namespace Accelerator
-} // namespace Opm
+} // namespace Opm::Accelerator
 
 #endif
-
-