cusparseSolverBackend: template Scalar type

Arne Morten Kvarving 2024-04-16 10:29:33 +02:00
parent 18f42b51b2
commit 23250b87e3
3 changed files with 154 additions and 147 deletions
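The change is mechanical but wide: every hard-coded double in the backend becomes a Scalar template parameter, and the .cpp gains explicit instantiations for double. A minimal, self-contained sketch of the pattern (hypothetical SolverSketch name; the real class carries the CUDA handles and device buffers shown in the diff below):

#include <cstddef>
#include <iostream>

// Sketch only: the scalar type is a template parameter next to the block
// size, so a future float build needs new instantiations, not code edits.
template<class Scalar, unsigned int block_size>
class SolverSketch {
    Scalar tolerance_;
public:
    explicit SolverSketch(Scalar tolerance) : tolerance_(tolerance) {}

    // Buffers are sized with sizeof(Scalar) instead of sizeof(double).
    std::size_t vector_bytes(std::size_t Nb) const {
        return Nb * block_size * sizeof(Scalar);
    }
};

// Explicit instantiation in the .cpp keeps template bodies out of the header;
// the commit does this for double with block sizes 1 through 6.
template class SolverSketch<double, 3>;

int main() {
    SolverSketch<double, 3> s(1e-2);
    std::cout << s.vector_bytes(10) << " bytes\n"; // 10 block rows -> 240 bytes
}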

View File: BdaBridge.cpp

@@ -73,7 +73,8 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
     if (accelerator_mode.compare("cusparse") == 0) {
 #if HAVE_CUDA
         use_gpu = true;
-        backend.reset(new Opm::Accelerator::cusparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, deviceID));
+        using CU = Accelerator::cusparseSolverBackend<double,block_size>;
+        backend = std::make_unique<CU>(linear_solver_verbosity, maxit, tolerance, deviceID);
 #else
         OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake");
 #endif
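On the bridge side the construction is unchanged in effect: Scalar is still pinned to double here, and the commit only routes it through a type alias while swapping reset(new ...) for std::make_unique. A compilable illustration of the same idiom, with hypothetical stand-in types:

#include <memory>

// Hypothetical stand-ins for the bridge's backend hierarchy.
struct BackendBase { virtual ~BackendBase() = default; };

template<class Scalar, unsigned int block_size>
struct Backend : BackendBase {
    Backend(int /*verbosity*/, int /*maxit*/, Scalar /*tol*/, unsigned int /*device*/) {}
};

int main() {
    std::unique_ptr<BackendBase> backend;
    // The alias keeps the long template-id readable; make_unique avoids a
    // bare new and reads the same as the reset(new ...) it replaces.
    using CU = Backend<double, 3>;
    backend = std::make_unique<CU>(0, 100, 1e-2, 0);
}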

View File: cusparseSolverBackend.cpp

@@ -44,23 +44,18 @@
 extern std::shared_ptr<std::thread> copyThread;
 #endif // HAVE_OPENMP

-namespace Opm
-{
-namespace Accelerator
-{
+namespace Opm::Accelerator {

-using Opm::OpmLog;
-
 using Dune::Timer;

 const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
 const cusparseOperation_t operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
 const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;

-template <unsigned int block_size>
-cusparseSolverBackend<block_size>::
-cusparseSolverBackend(int verbosity_, int maxit_,
-                      double tolerance_, unsigned int deviceID_)
+template<class Scalar, unsigned int block_size>
+cusparseSolverBackend<Scalar, block_size>::
+cusparseSolverBackend(int verbosity_, int maxit_,
+                      Scalar tolerance_, unsigned int deviceID_)
     : Base(verbosity_, maxit_, tolerance_, deviceID_)
 {
     // initialize CUDA device, stream and libraries
@@ -70,7 +65,8 @@ cusparseSolverBackend(int verbosity_, int maxit_,
     cudaGetDeviceProperties(&props, deviceID);
     cudaCheckLastError("Could not get device properties");
     std::ostringstream out;
-    out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor;
+    out << "Name GPU: " << props.name << ", Compute Capability: "
+        << props.major << "." << props.minor;
     OpmLog::info(out.str());

     cudaStreamCreate(&stream);
@@ -87,28 +83,29 @@ cusparseSolverBackend(int verbosity_, int maxit_,
     cudaCheckLastError("Could not set stream to cusparse");
 }

-template <unsigned int block_size>
-cusparseSolverBackend<block_size>::~cusparseSolverBackend() {
+template<class Scalar, unsigned int block_size>
+cusparseSolverBackend<Scalar,block_size>::~cusparseSolverBackend()
+{
     finalize();
 }

-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::
-gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::
+gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res)
 {
     Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
     int n = N;
-    double rho = 1.0, rhop;
-    double alpha, nalpha, beta;
-    double omega, nomega, tmp1, tmp2;
-    double norm, norm_0;
-    double zero = 0.0;
-    double one = 1.0;
-    double mone = -1.0;
+    Scalar rho = 1.0, rhop;
+    Scalar alpha, nalpha, beta;
+    Scalar omega, nomega, tmp1, tmp2;
+    Scalar norm, norm_0;
+    Scalar zero = 0.0;
+    Scalar one = 1.0;
+    Scalar mone = -1.0;
     float it;

     if (wellContribs.getNumWells() > 0) {
-        static_cast<WellContributionsCuda<double>&>(wellContribs).setCudaStream(stream);
+        static_cast<WellContributionsCuda<Scalar>&>(wellContribs).setCudaStream(stream);
     }

     cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
@@ -152,7 +149,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
         // apply wellContributions
         if (wellContribs.getNumWells() > 0) {
-            static_cast<WellContributionsCuda<double>&>(wellContribs).apply(d_pw, d_v);
+            static_cast<WellContributionsCuda<Scalar>&>(wellContribs).apply(d_pw, d_v);
         }

         cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
@@ -183,7 +180,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
         // apply wellContributions
         if (wellContribs.getNumWells() > 0) {
-            static_cast<WellContributionsCuda<double>&>(wellContribs).apply(d_s, d_t);
+            static_cast<WellContributionsCuda<Scalar>&>(wellContribs).apply(d_s, d_t);
         }

         cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
@@ -195,7 +192,6 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
         cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
-
         if (norm < tolerance * norm_0) {
             break;
         }
@@ -215,16 +211,17 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
     if (verbosity > 0) {
         std::ostringstream out;
-        out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
-            ", time per iteration: " << res.elapsed / it << ", iterations: " << it;
+        out << "=== converged: " << res.converged << ", conv_rate: "
+            << res.conv_rate << ", time: " << res.elapsed
+            << ", time per iteration: " << res.elapsed / it << ", iterations: " << it;
         OpmLog::info(out.str());
     }
 }

-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::
-initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
-           std::shared_ptr<BlockedMatrix<double>> jacMatrix)
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::
+initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+           std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
 {
     this->Nb = matrix->Nb;
     this->N = Nb * block_size;
@@ -239,46 +236,49 @@ initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
     }

     std::ostringstream out;
-    out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnz: " << nnzb << " blocks\n";
+    out << "Initializing GPU, matrix size: " << Nb
+        << " blockrows, nnz: " << nnzb << " blocks\n";
     if (useJacMatrix) {
         out << "Blocks in ILU matrix: " << nnzbs_prec << "\n";
     }
-    out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
+    out << "Maxit: " << maxit << std::scientific
+        << ", tolerance: " << tolerance << "\n";
     OpmLog::info(out.str());

-    cudaMalloc((void**)&d_x, sizeof(double) * N);
-    cudaMalloc((void**)&d_b, sizeof(double) * N);
-    cudaMalloc((void**)&d_r, sizeof(double) * N);
-    cudaMalloc((void**)&d_rw, sizeof(double) * N);
-    cudaMalloc((void**)&d_p, sizeof(double) * N);
-    cudaMalloc((void**)&d_pw, sizeof(double) * N);
-    cudaMalloc((void**)&d_s, sizeof(double) * N);
-    cudaMalloc((void**)&d_t, sizeof(double) * N);
-    cudaMalloc((void**)&d_v, sizeof(double) * N);
-    cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
+    cudaMalloc((void**)&d_x, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_b, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_r, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_rw, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_p, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_pw, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_s, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_t, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_v, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_bVals, sizeof(Scalar) * nnz);
     cudaMalloc((void**)&d_bCols, sizeof(int) * nnzb);
     cudaMalloc((void**)&d_bRows, sizeof(int) * (Nb + 1));
     if (useJacMatrix) {
-        cudaMalloc((void**)&d_mVals, sizeof(double) * nnzbs_prec * block_size * block_size);
+        cudaMalloc((void**)&d_mVals, sizeof(Scalar) * nnzbs_prec * block_size * block_size);
         cudaMalloc((void**)&d_mCols, sizeof(int) * nnzbs_prec);
         cudaMalloc((void**)&d_mRows, sizeof(int) * (Nb + 1));
     } else {
-        cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
+        cudaMalloc((void**)&d_mVals, sizeof(Scalar) * nnz);
         d_mCols = d_bCols;
        d_mRows = d_bRows;
     }
     cudaCheckLastError("Could not allocate enough memory on GPU");

 #if COPY_ROW_BY_ROW
-    cudaMallocHost((void**)&vals_contiguous, sizeof(double) * nnz);
+    cudaMallocHost((void**)&vals_contiguous, sizeof(Scalar) * nnz);
     cudaCheckLastError("Could not allocate pinned memory");
 #endif

     initialized = true;
 } // end initialize()

-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::finalize() {
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::finalize()
+{
     if (initialized) {
         cudaFree(d_x);
         cudaFree(d_b);
@@ -314,44 +314,54 @@ void cusparseSolverBackend<block_size>::finalize() {
     }
 } // end finalize()

-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::
-copy_system_to_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
-                   double *b,
-                   std::shared_ptr<BlockedMatrix<double>> jacMatrix)
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::
+copy_system_to_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                   Scalar* b,
+                   std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
 {
     Timer t;

-    cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
-    cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
+    cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int),
+                    cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int),
+                    cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_b, b, N * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
+    cudaMemsetAsync(d_x, 0, N * sizeof(Scalar), stream);

 #if COPY_ROW_BY_ROW
     int sum = 0;
     for (int i = 0; i < Nb; ++i) {
         int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
-        memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
+        memcpy(vals_contiguous + sum, matrix->nnzValues + sum,
+               size_row * sizeof(Scalar) * block_size * block_size);
         sum += size_row * block_size * block_size;
     }

-    cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bVals, vals_contiguous,
+                    nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
 #else
-    cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bVals, matrix->nnzValues,
+                    nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);

     if (useJacMatrix) {
 #if HAVE_OPENMP
         if(omp_get_max_threads() > 1)
             copyThread->join();
 #endif
-        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
+        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
+                        nnzbs_prec * block_size * block_size * sizeof(Scalar),
+                        cudaMemcpyHostToDevice, stream);
     } else {
-        cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
+        cudaMemcpyAsync(d_mVals, d_bVals,
+                        nnz * sizeof(Scalar),
+                        cudaMemcpyDeviceToDevice, stream);
     }
 #endif

     if (useJacMatrix) {
-        cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int), cudaMemcpyHostToDevice, stream);
-        cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
+        cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int),
+                        cudaMemcpyHostToDevice, stream);
+        cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int),
+                        cudaMemcpyHostToDevice, stream);
     }

     if (verbosity >= 3) {
@@ -364,37 +374,43 @@ copy_system_to_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
     }
 } // end copy_system_to_gpu()

 // don't copy rowpointers and colindices, they stay the same
-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::
-update_system_on_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
-                     double *b,
-                     std::shared_ptr<BlockedMatrix<double>> jacMatrix)
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::
+update_system_on_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                     Scalar* b,
+                     std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
 {
     Timer t;

-    cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
-    cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
+    cudaMemcpyAsync(d_b, b, N * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
+    cudaMemsetAsync(d_x, 0, sizeof(Scalar) * N, stream);

 #if COPY_ROW_BY_ROW
     int sum = 0;
     for (int i = 0; i < Nb; ++i) {
         int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
-        memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
+        memcpy(vals_contiguous + sum, matrix->nnzValues + sum,
+               size_row * sizeof(Scalar) * block_size * block_size);
         sum += size_row * block_size * block_size;
     }

-    cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bVals, vals_contiguous,
+                    nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
 #else
-    cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bVals, matrix->nnzValues,
+                    nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);

     if (useJacMatrix) {
 #if HAVE_OPENMP
-        if(omp_get_max_threads() > 1)
+        if (omp_get_max_threads() > 1) {
             copyThread->join();
+        }
 #endif
-        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
+        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
+                        nnzbs_prec * block_size * block_size * sizeof(Scalar),
+                        cudaMemcpyHostToDevice, stream);
     } else {
-        cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
+        cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(Scalar),
+                        cudaMemcpyDeviceToDevice, stream);
     }
 #endif
@@ -409,10 +425,9 @@ update_system_on_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
     }
 } // end update_system_on_gpu()

-
-template <unsigned int block_size>
-bool cusparseSolverBackend<block_size>::analyse_matrix() {
+template<class Scalar, unsigned int block_size>
+bool cusparseSolverBackend<Scalar,block_size>::analyse_matrix()
+{
     int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
     Timer t;
@@ -487,8 +502,9 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
     return true;
 } // end analyse_matrix()

-template <unsigned int block_size>
-bool cusparseSolverBackend<block_size>::create_preconditioner() {
+template<class Scalar, unsigned int block_size>
+bool cusparseSolverBackend<Scalar,block_size>::create_preconditioner()
+{
     Timer t;

     cusparseDbsrilu02(cusparseHandle, order, \
@@ -512,10 +528,9 @@ bool cusparseSolverBackend<block_size>::create_preconditioner() {
     return true;
 } // end create_preconditioner()

-
-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::
-solve_system(WellContributions<double>& wellContribs, BdaResult& res)
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::
+solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res)
 {
     // actually solve
     gpu_pbicgstab(wellContribs, res);
@@ -523,14 +538,14 @@ solve_system(WellContributions<double>& wellContribs, BdaResult& res)
     cudaCheckLastError("Something went wrong during the GPU solve");
 } // end solve_system()

 // copy result to host memory
 // caller must be sure that x is a valid array
-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::get_result(double *x) {
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::get_result(Scalar* x)
+{
     Timer t;

-    cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream);
+    cudaMemcpyAsync(x, d_x, N * sizeof(Scalar), cudaMemcpyDeviceToHost, stream);
     cudaStreamSynchronize(stream);

     if (verbosity > 2) {
@@ -540,12 +555,12 @@ void cusparseSolverBackend<block_size>::get_result(double *x) {
     }
 } // end get_result()

-template <unsigned int block_size>
-SolverStatus cusparseSolverBackend<block_size>::
-solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
-             double *b,
-             std::shared_ptr<BlockedMatrix<double>> jacMatrix,
-             WellContributions<double>& wellContribs,
+template<class Scalar, unsigned int block_size>
+SolverStatus cusparseSolverBackend<Scalar,block_size>::
+solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+             Scalar* b,
+             std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
+             WellContributions<Scalar>& wellContribs,
              BdaResult& res)
 {
     if (initialized == false) {
@@ -567,18 +582,14 @@ solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
     return SolverStatus::BDA_SOLVER_SUCCESS;
 }

-#define INSTANTIATE_BDA_FUNCTIONS(n) \
-template cusparseSolverBackend<n>::cusparseSolverBackend(int, int, double, unsigned int); \
-
-INSTANTIATE_BDA_FUNCTIONS(1);
-INSTANTIATE_BDA_FUNCTIONS(2);
-INSTANTIATE_BDA_FUNCTIONS(3);
-INSTANTIATE_BDA_FUNCTIONS(4);
-INSTANTIATE_BDA_FUNCTIONS(5);
-INSTANTIATE_BDA_FUNCTIONS(6);
-
-#undef INSTANTIATE_BDA_FUNCTIONS
-
-} // namespace Accelerator
-} // namespace Opm
+#define INSTANTIATE_TYPE(T)                    \
+    template class cusparseSolverBackend<T,1>; \
+    template class cusparseSolverBackend<T,2>; \
+    template class cusparseSolverBackend<T,3>; \
+    template class cusparseSolverBackend<T,4>; \
+    template class cusparseSolverBackend<T,5>; \
+    template class cusparseSolverBackend<T,6>;
+
+INSTANTIATE_TYPE(double)
+
+} // namespace Opm::Accelerator
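Note that the kernel calls inside the now-templated bodies are still the double-precision entry points (cusparseDbsrmv, cublasDdot, cublasDnrm2, cusparseDbsrilu02). That is consistent because only INSTANTIATE_TYPE(double) is emitted; adding INSTANTIATE_TYPE(float) later would also require per-scalar dispatch to the S-suffixed variants. A sketch of one way to do that, not part of this commit (hypothetical scalarDot helper):

#include <cublas_v2.h>
#include <type_traits>

// Hypothetical helper: pick the cuBLAS dot product matching Scalar
// (double or float), so templated solver code compiles for both.
template<class Scalar>
cublasStatus_t scalarDot(cublasHandle_t handle, int n,
                         const Scalar* x, const Scalar* y, Scalar* result)
{
    if constexpr (std::is_same_v<Scalar, double>) {
        return cublasDdot(handle, n, x, 1, y, 1, result);
    } else {
        return cublasSdot(handle, n, x, 1, y, 1, result);
    }
}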

View File: cusparseSolverBackend.hpp

@@ -28,16 +28,13 @@
 #include <opm/simulators/linalg/bda/BdaSolver.hpp>
 #include <opm/simulators/linalg/bda/WellContributions.hpp>

-namespace Opm
-{
-namespace Accelerator
-{
+namespace Opm::Accelerator {

 /// This class implements a cusparse-based ilu0-bicgstab solver on GPU
-template <unsigned int block_size>
-class cusparseSolverBackend : public BdaSolver<double,block_size> {
-    using Base = BdaSolver<double,block_size>;
+template<class Scalar, unsigned int block_size>
+class cusparseSolverBackend : public BdaSolver<Scalar,block_size>
+{
+    using Base = BdaSolver<Scalar,block_size>;

     using Base::N;
     using Base::Nb;
@@ -57,13 +54,13 @@ private:
     bsrilu02Info_t info_M;
     bsrsv2Info_t info_L, info_U;
     // b: bsr matrix, m: preconditioner
-    double *d_bVals, *d_mVals;
+    Scalar *d_bVals, *d_mVals;
     int *d_bCols, *d_mCols;
     int *d_bRows, *d_mRows;
-    double *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
-    double *d_pw, *d_s, *d_t, *d_v;
+    Scalar *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
+    Scalar *d_pw, *d_s, *d_t, *d_v;
     void *d_buffer;
-    double *vals_contiguous; // only used if COPY_ROW_BY_ROW is true in cusparseSolverBackend.cpp
+    Scalar *vals_contiguous; // only used if COPY_ROW_BY_ROW is true in cusparseSolverBackend.cpp

     bool analysis_done = false;
@@ -76,13 +73,13 @@ private:
     /// Solve linear system using ilu0-bicgstab
     /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res summary of solver result
-    void gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res);
+    void gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res);

     /// Initialize GPU and allocate memory
     /// \param[in] matrix matrix for spmv
     /// \param[in] jacMatrix matrix for preconditioner
-    void initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
-                    std::shared_ptr<BlockedMatrix<double>> jacMatrix);
+    void initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                    std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);

     /// Clean memory
     void finalize();
@@ -92,18 +89,18 @@ private:
     /// \param[in] matrix matrix for spmv
     /// \param[in] b input vector, contains N values
     /// \param[in] jacMatrix matrix for preconditioner
-    void copy_system_to_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
-                            double *b,
-                            std::shared_ptr<BlockedMatrix<double>> jacMatrix);
+    void copy_system_to_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                            Scalar* b,
+                            std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);

     /// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
     /// also copy matrix for preconditioner if needed
     /// \param[in] matrix matrix for spmv
     /// \param[in] b input vector, contains N values
     /// \param[in] jacMatrix matrix for preconditioner
-    void update_system_on_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
-                              double *b,
-                              std::shared_ptr<BlockedMatrix<double>> jacMatrix);
+    void update_system_on_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                              Scalar* b,
+                              std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);

     /// Analyse sparsity pattern to extract parallelism
     /// \return true iff analysis was successful
@@ -116,17 +113,16 @@ private:
     /// Solve linear system
     /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res summary of solver result
-    void solve_system(WellContributions<double>& wellContribs, BdaResult &res);
+    void solve_system(WellContributions<Scalar>& wellContribs, BdaResult &res);

 public:
     /// Construct a cusparseSolver
     /// \param[in] linear_solver_verbosity verbosity of cusparseSolver
     /// \param[in] maxit maximum number of iterations for cusparseSolver
     /// \param[in] tolerance required relative tolerance for cusparseSolver
     /// \param[in] deviceID the device to be used
-    cusparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int deviceID);
+    cusparseSolverBackend(int linear_solver_verbosity, int maxit,
+                          Scalar tolerance, unsigned int deviceID);

     /// Destroy a cusparseSolver, and free memory
     ~cusparseSolverBackend();
@@ -138,20 +134,19 @@ public:
     /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res summary of solver result
     /// \return status code
-    SolverStatus solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
-                              double *b,
-                              std::shared_ptr<BlockedMatrix<double>> jacMatrix,
-                              WellContributions<double>& wellContribs,
+    SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                              Scalar* b,
+                              std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
+                              WellContributions<Scalar>& wellContribs,
                               BdaResult& res) override;

     /// Get resulting vector x after linear solve, also includes post processing if necessary
     /// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
-    void get_result(double *x) override;
+    void get_result(Scalar* x) override;

 }; // end class cusparseSolverBackend

-} // namespace Accelerator
-} // namespace Opm
+} // namespace Opm::Accelerator

 #endif