cusparseSolverBackend: template Scalar type
parent 18f42b51b2
commit 23250b87e3
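This commit makes the scalar type of cusparseSolverBackend a template parameter: the class goes from being templated on block_size alone to being templated on (Scalar, block_size), with double replaced by Scalar in signatures, device buffers, and size computations. double remains the only instantiated type, via the new INSTANTIATE_TYPE(double). In outline (a before/after sketch of the declaration, not part of the diff itself):

    // before: scalar type hard-coded to double
    template <unsigned int block_size>
    class cusparseSolverBackend : public BdaSolver<double,block_size> { /* ... */ };

    // after: scalar type is a template parameter
    template<class Scalar, unsigned int block_size>
    class cusparseSolverBackend : public BdaSolver<Scalar,block_size> { /* ... */ };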
@@ -73,7 +73,8 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
     if (accelerator_mode.compare("cusparse") == 0) {
 #if HAVE_CUDA
         use_gpu = true;
-        backend.reset(new Opm::Accelerator::cusparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, deviceID));
+        using CU = Accelerator::cusparseSolverBackend<double,block_size>;
+        backend = std::make_unique<CU>(linear_solver_verbosity, maxit, tolerance, deviceID);
 #else
         OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake");
 #endif
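Besides threading through the new Scalar parameter, the bridge swaps backend.reset(new ...) for std::make_unique, dropping the naked new; the CU alias keeps the long constructor call readable. A generic before/after with a hypothetical type:

    #include <memory>

    struct Widget { Widget(int, double) {} };

    void demo()
    {
        std::unique_ptr<Widget> w;
        w.reset(new Widget(1, 2.0));          // old style: naked new handed to reset()
        w = std::make_unique<Widget>(1, 2.0); // preferred: construction and ownership in one step
    }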
@@ -44,23 +44,18 @@
 extern std::shared_ptr<std::thread> copyThread;
 #endif // HAVE_OPENMP

-namespace Opm
-{
-namespace Accelerator
-{
+namespace Opm::Accelerator {

 using Opm::OpmLog;
 using Dune::Timer;

 const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
 const cusparseOperation_t operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
 const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;


-template <unsigned int block_size>
-cusparseSolverBackend<block_size>::
+template<class Scalar, unsigned int block_size>
+cusparseSolverBackend<Scalar, block_size>::
 cusparseSolverBackend(int verbosity_, int maxit_,
-                      double tolerance_, unsigned int deviceID_)
+                      Scalar tolerance_, unsigned int deviceID_)
     : Base(verbosity_, maxit_, tolerance_, deviceID_)
 {
     // initialize CUDA device, stream and libraries
@@ -70,7 +65,8 @@ cusparseSolverBackend(int verbosity_, int maxit_,
     cudaGetDeviceProperties(&props, deviceID);
     cudaCheckLastError("Could not get device properties");
     std::ostringstream out;
-    out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor;
+    out << "Name GPU: " << props.name << ", Compute Capability: "
+        << props.major << "." << props.minor;
     OpmLog::info(out.str());

     cudaStreamCreate(&stream);
@@ -87,28 +83,29 @@ cusparseSolverBackend(int verbosity_, int maxit_,
     cudaCheckLastError("Could not set stream to cusparse");
 }

-template <unsigned int block_size>
-cusparseSolverBackend<block_size>::~cusparseSolverBackend() {
+template<class Scalar, unsigned int block_size>
+cusparseSolverBackend<Scalar,block_size>::~cusparseSolverBackend()
+{
     finalize();
 }

-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::
-gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::
+gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res)
 {
     Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
     int n = N;
-    double rho = 1.0, rhop;
-    double alpha, nalpha, beta;
-    double omega, nomega, tmp1, tmp2;
-    double norm, norm_0;
-    double zero = 0.0;
-    double one = 1.0;
-    double mone = -1.0;
+    Scalar rho = 1.0, rhop;
+    Scalar alpha, nalpha, beta;
+    Scalar omega, nomega, tmp1, tmp2;
+    Scalar norm, norm_0;
+    Scalar zero = 0.0;
+    Scalar one = 1.0;
+    Scalar mone = -1.0;
     float it;

     if (wellContribs.getNumWells() > 0) {
-        static_cast<WellContributionsCuda<double>&>(wellContribs).setCudaStream(stream);
+        static_cast<WellContributionsCuda<Scalar>&>(wellContribs).setCudaStream(stream);
     }

     cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
@@ -152,7 +149,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)

         // apply wellContributions
         if (wellContribs.getNumWells() > 0) {
-            static_cast<WellContributionsCuda<double>&>(wellContribs).apply(d_pw, d_v);
+            static_cast<WellContributionsCuda<Scalar>&>(wellContribs).apply(d_pw, d_v);
         }

         cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
@@ -183,7 +180,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)

         // apply wellContributions
         if (wellContribs.getNumWells() > 0) {
-            static_cast<WellContributionsCuda<double>&>(wellContribs).apply(d_s, d_t);
+            static_cast<WellContributionsCuda<Scalar>&>(wellContribs).apply(d_s, d_t);
         }

         cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
@@ -195,7 +192,6 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)

         cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
-

         if (norm < tolerance * norm_0) {
             break;
         }
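Note that the Scalar-typed paths above still call the double-precision entry points cublasDdot, cublasDnrm2, and cusparseDbsrmv (the 'D' in the name is the value type), so only Scalar = double can actually run until those calls are dispatched on the scalar type. A minimal sketch of such a dispatch layer; the wrapper name dot() is hypothetical, while the cuBLAS functions and their signatures are real:

    #include <cublas_v2.h>

    // Overloads pick the right cuBLAS entry point from the pointer type,
    // so templated code can simply call dot(handle, n, x, y, &result).
    inline cublasStatus_t dot(cublasHandle_t h, int n,
                              const double* x, const double* y, double* result)
    {
        return cublasDdot(h, n, x, 1, y, 1, result);
    }

    inline cublasStatus_t dot(cublasHandle_t h, int n,
                              const float* x, const float* y, float* result)
    {
        return cublasSdot(h, n, x, 1, y, 1, result);
    }

With wrappers like these, the call sites in gpu_pbicgstab would stay identical for float and double.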
@@ -215,16 +211,17 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)

     if (verbosity > 0) {
         std::ostringstream out;
-        out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
-            ", time per iteration: " << res.elapsed / it << ", iterations: " << it;
+        out << "=== converged: " << res.converged << ", conv_rate: "
+            << res.conv_rate << ", time: " << res.elapsed
+            << ", time per iteration: " << res.elapsed / it << ", iterations: " << it;
         OpmLog::info(out.str());
     }
 }

-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::
-initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
-           std::shared_ptr<BlockedMatrix<double>> jacMatrix)
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::
+initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+           std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
 {
     this->Nb = matrix->Nb;
     this->N = Nb * block_size;
@@ -239,46 +236,49 @@ initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
     }

     std::ostringstream out;
-    out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnz: " << nnzb << " blocks\n";
+    out << "Initializing GPU, matrix size: " << Nb
+        << " blockrows, nnz: " << nnzb << " blocks\n";
     if (useJacMatrix) {
         out << "Blocks in ILU matrix: " << nnzbs_prec << "\n";
     }
-    out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
+    out << "Maxit: " << maxit << std::scientific
+        << ", tolerance: " << tolerance << "\n";
     OpmLog::info(out.str());

-    cudaMalloc((void**)&d_x, sizeof(double) * N);
-    cudaMalloc((void**)&d_b, sizeof(double) * N);
-    cudaMalloc((void**)&d_r, sizeof(double) * N);
-    cudaMalloc((void**)&d_rw, sizeof(double) * N);
-    cudaMalloc((void**)&d_p, sizeof(double) * N);
-    cudaMalloc((void**)&d_pw, sizeof(double) * N);
-    cudaMalloc((void**)&d_s, sizeof(double) * N);
-    cudaMalloc((void**)&d_t, sizeof(double) * N);
-    cudaMalloc((void**)&d_v, sizeof(double) * N);
-    cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
+    cudaMalloc((void**)&d_x, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_b, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_r, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_rw, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_p, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_pw, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_s, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_t, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_v, sizeof(Scalar) * N);
+    cudaMalloc((void**)&d_bVals, sizeof(Scalar) * nnz);
     cudaMalloc((void**)&d_bCols, sizeof(int) * nnzb);
     cudaMalloc((void**)&d_bRows, sizeof(int) * (Nb + 1));
     if (useJacMatrix) {
-        cudaMalloc((void**)&d_mVals, sizeof(double) * nnzbs_prec * block_size * block_size);
+        cudaMalloc((void**)&d_mVals, sizeof(Scalar) * nnzbs_prec * block_size * block_size);
         cudaMalloc((void**)&d_mCols, sizeof(int) * nnzbs_prec);
         cudaMalloc((void**)&d_mRows, sizeof(int) * (Nb + 1));
     } else {
-        cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
+        cudaMalloc((void**)&d_mVals, sizeof(Scalar) * nnz);
         d_mCols = d_bCols;
         d_mRows = d_bRows;
     }
     cudaCheckLastError("Could not allocate enough memory on GPU");

 #if COPY_ROW_BY_ROW
-    cudaMallocHost((void**)&vals_contiguous, sizeof(double) * nnz);
+    cudaMallocHost((void**)&vals_contiguous, sizeof(Scalar) * nnz);
     cudaCheckLastError("Could not allocate pinned memory");
 #endif

     initialized = true;
 } // end initialize()

-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::finalize() {
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::finalize()
+{
     if (initialized) {
         cudaFree(d_x);
         cudaFree(d_b);
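The sizes in these allocations all derive from the block layout set up at the top of initialize(): N = Nb * block_size scalar rows per vector, and nnz = nnzb * block_size * block_size scalar values in the block-sparse matrix. A worked example with invented numbers:

    // Illustrative numbers only: a block_size = 3 system with 1000 block rows
    // and 4800 nonzero blocks.
    constexpr unsigned int block_size = 3;
    const int Nb   = 1000;                            // block rows
    const int nnzb = 4800;                            // nonzero blocks
    const int N    = Nb * block_size;                 // 3000 scalar rows, e.g. for d_x
    const int nnz  = nnzb * block_size * block_size;  // 43200 scalar values, e.g. for d_bVals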
@@ -314,44 +314,54 @@ void cusparseSolverBackend<block_size>::finalize() {
     }
 } // end finalize()


-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::
-copy_system_to_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
-                   double *b,
-                   std::shared_ptr<BlockedMatrix<double>> jacMatrix)
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::
+copy_system_to_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                   Scalar* b,
+                   std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
 {
     Timer t;

-    cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
-    cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
+    cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int),
+                    cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int),
+                    cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_b, b, N * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
+    cudaMemsetAsync(d_x, 0, N * sizeof(Scalar), stream);

 #if COPY_ROW_BY_ROW
     int sum = 0;
     for (int i = 0; i < Nb; ++i) {
         int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
-        memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
+        memcpy(vals_contiguous + sum, matrix->nnzValues + sum,
+               size_row * sizeof(Scalar) * block_size * block_size);
         sum += size_row * block_size * block_size;
     }
-    cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bVals, vals_contiguous,
+                    nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
 #else
-    cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bVals, matrix->nnzValues,
+                    nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
     if (useJacMatrix) {
 #if HAVE_OPENMP
         if(omp_get_max_threads() > 1)
             copyThread->join();
 #endif
-        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
+        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
+                        nnzbs_prec * block_size * block_size * sizeof(Scalar),
+                        cudaMemcpyHostToDevice, stream);
     } else {
-        cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
+        cudaMemcpyAsync(d_mVals, d_bVals,
+                        nnz * sizeof(Scalar),
+                        cudaMemcpyDeviceToDevice, stream);
     }
 #endif

     if (useJacMatrix) {
-        cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int), cudaMemcpyHostToDevice, stream);
-        cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
+        cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int),
+                        cudaMemcpyHostToDevice, stream);
+        cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int),
+                        cudaMemcpyHostToDevice, stream);
     }

     if (verbosity >= 3) {
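Both copy paths rely on cudaMemcpyAsync over the solver's stream; the COPY_ROW_BY_ROW variant first compacts the rows into vals_contiguous, which initialize() allocated with cudaMallocHost, since the copy can only overlap with other work when the host buffer is page-locked. A self-contained sketch of that pattern, with hypothetical function and variable names:

    #include <algorithm>
    #include <cuda_runtime.h>

    // Pinned (page-locked) host staging buffer plus async copy on a stream.
    // With ordinary pageable memory, cudaMemcpyAsync degrades to a synchronous copy.
    void stage_and_upload(const double* src, double* d_dst, size_t n, cudaStream_t s)
    {
        double* pinned = nullptr;
        cudaMallocHost((void**)&pinned, n * sizeof(double)); // page-locked allocation
        std::copy(src, src + n, pinned);                     // CPU-side staging
        cudaMemcpyAsync(d_dst, pinned, n * sizeof(double),
                        cudaMemcpyHostToDevice, s);          // truly asynchronous
        cudaStreamSynchronize(s);                            // wait before reusing the buffer
        cudaFreeHost(pinned);
    }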
@@ -364,37 +374,43 @@ copy_system_to_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
     }
 } // end copy_system_to_gpu()


 // don't copy rowpointers and colindices, they stay the same
-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::
-update_system_on_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
-                     double *b,
-                     std::shared_ptr<BlockedMatrix<double>> jacMatrix)
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::
+update_system_on_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                     Scalar* b,
+                     std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
 {
     Timer t;

-    cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
-    cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
+    cudaMemcpyAsync(d_b, b, N * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
+    cudaMemsetAsync(d_x, 0, sizeof(Scalar) * N, stream);

 #if COPY_ROW_BY_ROW
     int sum = 0;
     for (int i = 0; i < Nb; ++i) {
         int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
-        memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
+        memcpy(vals_contiguous + sum, matrix->nnzValues + sum,
+               size_row * sizeof(Scalar) * block_size * block_size);
         sum += size_row * block_size * block_size;
     }
-    cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bVals, vals_contiguous,
+                    nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
 #else
-    cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bVals, matrix->nnzValues,
+                    nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
     if (useJacMatrix) {
 #if HAVE_OPENMP
-        if(omp_get_max_threads() > 1)
-            copyThread->join();
+        if (omp_get_max_threads() > 1) {
+            copyThread->join();
+        }
 #endif
-        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
+        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
+                        nnzbs_prec * block_size * block_size * sizeof(Scalar),
+                        cudaMemcpyHostToDevice, stream);
     } else {
-        cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
+        cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(Scalar),
+                        cudaMemcpyDeviceToDevice, stream);
     }
 #endif

@@ -409,10 +425,9 @@ update_system_on_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
     }
 } // end update_system_on_gpu()


-template <unsigned int block_size>
-bool cusparseSolverBackend<block_size>::analyse_matrix() {
-
+template<class Scalar, unsigned int block_size>
+bool cusparseSolverBackend<Scalar,block_size>::analyse_matrix()
+{
     int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
     Timer t;

@@ -487,8 +502,9 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
     return true;
 } // end analyse_matrix()

-template <unsigned int block_size>
-bool cusparseSolverBackend<block_size>::create_preconditioner() {
+template<class Scalar, unsigned int block_size>
+bool cusparseSolverBackend<Scalar,block_size>::create_preconditioner()
+{
     Timer t;

     cusparseDbsrilu02(cusparseHandle, order, \
@@ -512,10 +528,9 @@ bool cusparseSolverBackend<block_size>::create_preconditioner() {
     return true;
 } // end create_preconditioner()


-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::
-solve_system(WellContributions<double>& wellContribs, BdaResult& res)
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::
+solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res)
 {
     // actually solve
     gpu_pbicgstab(wellContribs, res);
@@ -523,14 +538,14 @@ solve_system(WellContributions<double>& wellContribs, BdaResult& res)
     cudaCheckLastError("Something went wrong during the GPU solve");
 } // end solve_system()


 // copy result to host memory
 // caller must be sure that x is a valid array
-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::get_result(double *x) {
+template<class Scalar, unsigned int block_size>
+void cusparseSolverBackend<Scalar,block_size>::get_result(Scalar* x)
+{
     Timer t;

-    cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream);
+    cudaMemcpyAsync(x, d_x, N * sizeof(Scalar), cudaMemcpyDeviceToHost, stream);
     cudaStreamSynchronize(stream);

     if (verbosity > 2) {
@@ -540,12 +555,12 @@ void cusparseSolverBackend<block_size>::get_result(double *x) {
     }
 } // end get_result()

-template <unsigned int block_size>
-SolverStatus cusparseSolverBackend<block_size>::
-solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
-             double *b,
-             std::shared_ptr<BlockedMatrix<double>> jacMatrix,
-             WellContributions<double>& wellContribs,
+template<class Scalar, unsigned int block_size>
+SolverStatus cusparseSolverBackend<Scalar,block_size>::
+solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+             Scalar* b,
+             std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
+             WellContributions<Scalar>& wellContribs,
              BdaResult& res)
 {
     if (initialized == false) {
@@ -567,18 +582,14 @@ solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
     return SolverStatus::BDA_SOLVER_SUCCESS;
 }

+#define INSTANTIATE_TYPE(T)                    \
+    template class cusparseSolverBackend<T,1>; \
+    template class cusparseSolverBackend<T,2>; \
+    template class cusparseSolverBackend<T,3>; \
+    template class cusparseSolverBackend<T,4>; \
+    template class cusparseSolverBackend<T,5>; \
+    template class cusparseSolverBackend<T,6>;

-#define INSTANTIATE_BDA_FUNCTIONS(n) \
-template cusparseSolverBackend<n>::cusparseSolverBackend(int, int, double, unsigned int); \
+INSTANTIATE_TYPE(double)

-INSTANTIATE_BDA_FUNCTIONS(1);
-INSTANTIATE_BDA_FUNCTIONS(2);
-INSTANTIATE_BDA_FUNCTIONS(3);
-INSTANTIATE_BDA_FUNCTIONS(4);
-INSTANTIATE_BDA_FUNCTIONS(5);
-INSTANTIATE_BDA_FUNCTIONS(6);

-#undef INSTANTIATE_BDA_FUNCTIONS

-} // namespace Accelerator
-} // namespace Opm
+} // namespace Opm::Accelerator
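Since the template definitions live in this translation unit, the explicit instantiations are what make the class available to the linker; the new INSTANTIATE_TYPE macro stamps out all supported block sizes for one scalar type. What INSTANTIATE_TYPE(double) expands to, written out (illustrative, it only compiles next to the template definitions):

    // One explicit instantiation of the whole class template per supported block size.
    template class cusparseSolverBackend<double,1>;
    template class cusparseSolverBackend<double,2>;
    template class cusparseSolverBackend<double,3>;
    template class cusparseSolverBackend<double,4>;
    template class cusparseSolverBackend<double,5>;
    template class cusparseSolverBackend<double,6>;
    // Adding float would just be INSTANTIATE_TYPE(float), though the 'D'-suffixed
    // cuSPARSE/cuBLAS calls above would first need float dispatch.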
@@ -28,16 +28,13 @@
 #include <opm/simulators/linalg/bda/BdaSolver.hpp>
 #include <opm/simulators/linalg/bda/WellContributions.hpp>

-namespace Opm
-{
-namespace Accelerator
-{
+namespace Opm::Accelerator {

 /// This class implements a cusparse-based ilu0-bicgstab solver on GPU
-template <unsigned int block_size>
-class cusparseSolverBackend : public BdaSolver<double,block_size> {
-
-    using Base = BdaSolver<double,block_size>;
+template<class Scalar, unsigned int block_size>
+class cusparseSolverBackend : public BdaSolver<Scalar,block_size>
+{
+    using Base = BdaSolver<Scalar,block_size>;

     using Base::N;
     using Base::Nb;
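The using Base::N; lines that follow the alias are needed because the base class now depends on template parameters: members inherited from BdaSolver<Scalar,block_size> are dependent names and are not found by unqualified lookup inside the derived template. A stripped-down illustration with hypothetical class names:

    // Why the derived template needs 'using Base::...' (or 'this->'):
    template<class Scalar> struct SolverBase { int N = 0; Scalar tolerance{}; };

    template<class Scalar>
    struct Backend : SolverBase<Scalar>
    {
        using Base = SolverBase<Scalar>;
        using Base::N;              // without this, plain 'N' would not compile here
        Scalar scaled() const { return this->tolerance * N; } // 'this->' also works
    };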
@@ -57,13 +54,13 @@ private:
     bsrilu02Info_t info_M;
     bsrsv2Info_t info_L, info_U;
     // b: bsr matrix, m: preconditioner
-    double *d_bVals, *d_mVals;
+    Scalar *d_bVals, *d_mVals;
     int *d_bCols, *d_mCols;
     int *d_bRows, *d_mRows;
-    double *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
-    double *d_pw, *d_s, *d_t, *d_v;
+    Scalar *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
+    Scalar *d_pw, *d_s, *d_t, *d_v;
     void *d_buffer;
-    double *vals_contiguous; // only used if COPY_ROW_BY_ROW is true in cusparseSolverBackend.cpp
+    Scalar *vals_contiguous; // only used if COPY_ROW_BY_ROW is true in cusparseSolverBackend.cpp

     bool analysis_done = false;

@@ -76,13 +73,13 @@ private:
     /// Solve linear system using ilu0-bicgstab
     /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res summary of solver result
-    void gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res);
+    void gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res);

     /// Initialize GPU and allocate memory
     /// \param[in] matrix matrix for spmv
     /// \param[in] jacMatrix matrix for preconditioner
-    void initialize(std::shared_ptr<BlockedMatrix<double>> matrix,
-                    std::shared_ptr<BlockedMatrix<double>> jacMatrix);
+    void initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                    std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);

     /// Clean memory
     void finalize();
@@ -92,18 +89,18 @@ private:
     /// \param[in] matrix matrix for spmv
     /// \param[in] b input vector, contains N values
     /// \param[in] jacMatrix matrix for preconditioner
-    void copy_system_to_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
-                            double *b,
-                            std::shared_ptr<BlockedMatrix<double>> jacMatrix);
+    void copy_system_to_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                            Scalar* b,
+                            std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);

     /// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
     /// also copy matrix for preconditioner if needed
     /// \param[in] matrix matrix for spmv
     /// \param[in] b input vector, contains N values
     /// \param[in] jacMatrix matrix for preconditioner
-    void update_system_on_gpu(std::shared_ptr<BlockedMatrix<double>> matrix,
-                              double *b,
-                              std::shared_ptr<BlockedMatrix<double>> jacMatrix);
+    void update_system_on_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                              Scalar* b,
+                              std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);

     /// Analyse sparsity pattern to extract parallelism
     /// \return true iff analysis was successful
@@ -116,17 +113,16 @@ private:
     /// Solve linear system
     /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res summary of solver result
-    void solve_system(WellContributions<double>& wellContribs, BdaResult &res);
+    void solve_system(WellContributions<Scalar>& wellContribs, BdaResult &res);

 public:


     /// Construct a cusparseSolver
     /// \param[in] linear_solver_verbosity verbosity of cusparseSolver
     /// \param[in] maxit maximum number of iterations for cusparseSolver
     /// \param[in] tolerance required relative tolerance for cusparseSolver
     /// \param[in] deviceID the device to be used
-    cusparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int deviceID);
+    cusparseSolverBackend(int linear_solver_verbosity, int maxit,
+                          Scalar tolerance, unsigned int deviceID);

     /// Destroy a cusparseSolver, and free memory
     ~cusparseSolverBackend();
@@ -138,20 +134,19 @@ public:
     /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res summary of solver result
     /// \return status code
-    SolverStatus solve_system(std::shared_ptr<BlockedMatrix<double>> matrix,
-                              double *b,
-                              std::shared_ptr<BlockedMatrix<double>> jacMatrix,
-                              WellContributions<double>& wellContribs,
+    SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
+                              Scalar* b,
+                              std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
+                              WellContributions<Scalar>& wellContribs,
                               BdaResult& res) override;

     /// Get resulting vector x after linear solve, also includes post processing if necessary
     /// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
-    void get_result(double *x) override;
+    void get_result(Scalar* x) override;

 }; // end class cusparseSolverBackend

-} // namespace Accelerator
-} // namespace Opm
+} // namespace Opm::Accelerator

 #endif
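For orientation, a hypothetical caller-side sketch of the templated interface after this commit; the numbers are invented, and in OPM the construction actually happens inside BdaBridge (see the first hunk above):

    #include <memory>

    void example_solve()
    {
        using CU = Opm::Accelerator::cusparseSolverBackend<double, 3>; // block_size = 3
        auto backend = std::make_unique<CU>(/*verbosity=*/1, /*maxit=*/200,
                                            /*tolerance=*/1e-2, /*deviceID=*/0);
        // per Newton iteration:
        //   backend->solve_system(matrix, b, jacMatrix, wellContribs, res);
        //   backend->get_result(x);
    }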