cuWellContributions: template Scalar type

Arne Morten Kvarving 2024-04-15 16:57:54 +02:00
parent d2637e53ea
commit 05a89d1e96
4 changed files with 106 additions and 75 deletions

View File

@@ -46,11 +46,11 @@ template<class Scalar>
std::unique_ptr<WellContributions<Scalar>>
WellContributions<Scalar>::create(const std::string& accelerator_mode, bool useWellConn)
{
if(accelerator_mode.compare("cusparse") == 0){
if (accelerator_mode.compare("cusparse") == 0) {
#if HAVE_CUDA
return std::make_unique<WellContributionsCuda>();
return std::make_unique<WellContributionsCuda<Scalar>>();
#else
OPM_THROW(std::runtime_error, "Cannot initialize well contributions: CUDA is not enabled");
OPM_THROW(std::runtime_error, "Cannot initialize well contributions: CUDA is not enabled");
#endif
}
else if (accelerator_mode.compare("opencl") == 0) {
@@ -69,7 +69,6 @@ WellContributions<Scalar>::create(const std::string& accelerator_mode, bool useW
#endif
}
return std::make_unique<WellContributions>();
}
else if(accelerator_mode.compare("amgcl") == 0){
if (!useWellConn) {

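For context, the factory in this first file now forwards the Scalar template parameter to the CUDA backend, so with CUDA enabled the object behind the returned pointer is a WellContributionsCuda<Scalar>. A minimal usage sketch, not part of the diff and assuming the corresponding OPM headers are included:

    // Hedged sketch: request the cusparse backend in double precision.
    // With HAVE_CUDA defined, the pointer owns a WellContributionsCuda<double>.
    auto wellContribs =
        Opm::WellContributions<double>::create("cusparse", /*useWellConn=*/false);
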
View File

@@ -33,18 +33,17 @@ namespace Opm
{
// apply WellContributions using y -= C^T * (D^-1 * (B * x))
__global__ void apply_well_contributions(
const double * __restrict__ Cnnzs,
const double * __restrict__ Dnnzs,
const double * __restrict__ Bnnzs,
const int * __restrict__ Ccols,
const int * __restrict__ Bcols,
const double * __restrict__ x,
double * __restrict__ y,
const int dim,
const int dim_wells,
const unsigned int * __restrict__ val_pointers
)
template<class Scalar>
__global__ void apply_well_contributions(const Scalar* __restrict__ Cnnzs,
const Scalar* __restrict__ Dnnzs,
const Scalar* __restrict__ Bnnzs,
const int* __restrict__ Ccols,
const int* __restrict__ Bcols,
const Scalar* __restrict__ x,
Scalar* __restrict__ y,
const int dim,
const int dim_wells,
const unsigned int * __restrict__ val_pointers)
{
const int idx_b = blockIdx.x;
const int idx_t = threadIdx.x;
@@ -57,9 +56,9 @@ __global__ void apply_well_contributions(
const int c = lane % dim; // col in block
const int r = (lane / dim) % dim_wells; // row in block
extern __shared__ double smem[];
double * __restrict__ z1 = smem;
double * __restrict__ z2 = z1 + dim_wells;
extern __shared__ unsigned char smem[];
Scalar* __restrict__ z1 = reinterpret_cast<Scalar*>(smem);
Scalar* __restrict__ z2 = z1 + dim_wells;
if (idx_t < dim_wells) {
z1[idx_t] = 0.0;
@@ -70,7 +69,7 @@ __global__ void apply_well_contributions(
// z1 = B * x
if (idx_t < num_active_threads) {
// multiply all blocks with x
double temp = 0.0;
Scalar temp = 0.0;
int b = idx_t / vals_per_block + val_pointers[idx_b]; // block id, val_size indicates number of blocks
while (b < val_size + val_pointers[idx_b]) {
int colIdx = Bcols[b];
@@ -106,7 +105,7 @@ __global__ void apply_well_contributions(
// z2 = D^-1 * B * x = D^-1 * z1
if (idx_t < dim_wells) {
double temp = 0.0;
Scalar temp = 0.0;
for (int c = 0; c < dim_wells; ++c) {
temp += Dnnzs[idx_b * dim_wells * dim_wells + idx_t * dim_wells + c] * z1[c];
}
@@ -118,7 +117,7 @@ __global__ void apply_well_contributions(
// y -= C^T * D^-1 * B * x
// use dim * val_size threads, each block is assigned 'dim' threads
if (idx_t < dim * val_size) {
double temp = 0.0;
Scalar temp = 0.0;
int b = idx_t / dim + val_pointers[idx_b];
int cc = idx_t % dim;
int colIdx = Ccols[b];
@@ -127,13 +126,13 @@ __global__ void apply_well_contributions(
}
y[colIdx * dim + cc] -= temp;
}
}
WellContributionsCuda::~WellContributionsCuda()
template<class Scalar>
WellContributionsCuda<Scalar>::~WellContributionsCuda()
{
// delete data for StandardWell
if (num_std_wells > 0) {
if (this->num_std_wells > 0) {
cudaFree(d_Cnnzs);
cudaFree(d_Dnnzs);
cudaFree(d_Bnnzs);
@@ -142,80 +141,108 @@ WellContributionsCuda::~WellContributionsCuda()
cudaFree(d_val_pointers);
}
if (num_ms_wells > 0 && h_x) {
if (this->num_ms_wells > 0 && h_x) {
cudaFreeHost(h_x);
cudaFreeHost(h_y);
h_x = h_y = nullptr; // Mark as free for constructor
}
}
void WellContributionsCuda::APIalloc()
template<class Scalar>
void WellContributionsCuda<Scalar>::APIalloc()
{
cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells);
cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
cudaMalloc((void**)&d_val_pointers, sizeof(unsigned int) * (num_std_wells + 1));
cudaMalloc((void**)&d_Cnnzs,
sizeof(Scalar) * this->num_blocks * this->dim * this->dim_wells);
cudaMalloc((void**)&d_Dnnzs,
sizeof(Scalar) * this->num_std_wells * this->dim_wells * this->dim_wells);
cudaMalloc((void**)&d_Bnnzs,
sizeof(Scalar) * this->num_blocks * this->dim * this->dim_wells);
cudaMalloc((void**)&d_Ccols, sizeof(int) * this->num_blocks);
cudaMalloc((void**)&d_Bcols, sizeof(int) * this->num_blocks);
cudaMalloc((void**)&this->d_val_pointers, sizeof(unsigned int) * (this->num_std_wells + 1));
cudaCheckLastError("apply_gpu malloc failed");
}
// Apply the WellContributions, similar to StandardWell::apply()
// y -= (C^T *(D^-1*( B*x)))
void WellContributionsCuda::apply(double *d_x, double *d_y)
template<class Scalar>
void WellContributionsCuda<Scalar>::apply(Scalar* d_x, Scalar* d_y)
{
// apply MultisegmentWells
// make sure the stream is empty if timing measurements are done
cudaStreamSynchronize(stream);
if (num_ms_wells > 0) {
if (this->num_ms_wells > 0) {
// allocate pinned memory on host if not yet done
if (h_x == nullptr) {
cudaMallocHost(&h_x, sizeof(double) * N);
cudaMallocHost(&h_y, sizeof(double) * N);
cudaMallocHost(&h_x, sizeof(Scalar) * this->N);
cudaMallocHost(&h_y, sizeof(Scalar) * this->N);
}
// copy vectors x and y from GPU to CPU
cudaMemcpyAsync(h_x, d_x, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
cudaMemcpyAsync(h_y, d_y, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
cudaMemcpyAsync(h_x, d_x, sizeof(Scalar) * this->N,
cudaMemcpyDeviceToHost, stream);
cudaMemcpyAsync(h_y, d_y, sizeof(Scalar) * this->N,
cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
// actually apply MultisegmentWells
for (auto& well : multisegments) {
for (auto& well : this->multisegments) {
well->apply(h_x, h_y);
}
// copy vector y from CPU to GPU
cudaMemcpyAsync(d_y, h_y, sizeof(double) * N, cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_y, h_y, sizeof(Scalar) * this->N,
cudaMemcpyHostToDevice, stream);
cudaStreamSynchronize(stream);
}
// apply StandardWells
if (num_std_wells > 0) {
int smem_size = 2 * sizeof(double) * dim_wells;
apply_well_contributions <<< num_std_wells, 32, smem_size, stream>>>(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_x, d_y, dim, dim_wells, d_val_pointers);
if (this->num_std_wells > 0) {
int smem_size = 2 * sizeof(Scalar) * this->dim_wells;
apply_well_contributions <<< this->num_std_wells, 32, smem_size, stream>>>(d_Cnnzs,
d_Dnnzs,
d_Bnnzs,
d_Ccols,
d_Bcols,
d_x,
d_y,
this->dim,
this->dim_wells,
this->d_val_pointers);
}
}
void WellContributionsCuda::APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size)
template<class Scalar>
void WellContributionsCuda<Scalar>::APIaddMatrix(MatrixType type, int* colIndices,
Scalar* values, unsigned int val_size)
{
switch (type) {
case MatrixType::C:
cudaMemcpy(d_Cnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
cudaMemcpy(d_Ccols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_Cnnzs + this->num_blocks_so_far * this->dim * this->dim_wells,
values, sizeof(Scalar) * val_size * this->dim * this->dim_wells,
cudaMemcpyHostToDevice);
cudaMemcpy(d_Ccols + this->num_blocks_so_far, colIndices,
sizeof(int) * val_size, cudaMemcpyHostToDevice);
break;
case MatrixType::D:
cudaMemcpy(d_Dnnzs + num_std_wells_so_far * dim_wells * dim_wells, values, sizeof(double) * dim_wells * dim_wells, cudaMemcpyHostToDevice);
cudaMemcpy(d_Dnnzs + this->num_std_wells_so_far * this->dim_wells * this->dim_wells,
values, sizeof(Scalar) * this->dim_wells * this->dim_wells,
cudaMemcpyHostToDevice);
break;
case MatrixType::B:
cudaMemcpy(d_Bnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
cudaMemcpy(d_Bcols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
val_pointers[num_std_wells_so_far] = num_blocks_so_far;
if (num_std_wells_so_far == num_std_wells - 1) {
val_pointers[num_std_wells] = num_blocks;
cudaMemcpy(d_val_pointers, val_pointers.data(), sizeof(unsigned int) * (num_std_wells + 1), cudaMemcpyHostToDevice);
cudaMemcpy(d_Bnnzs + this->num_blocks_so_far * this->dim * this->dim_wells,
values, sizeof(Scalar) * val_size * this->dim * this->dim_wells,
cudaMemcpyHostToDevice);
cudaMemcpy(d_Bcols + this->num_blocks_so_far, colIndices,
sizeof(int) * val_size, cudaMemcpyHostToDevice);
this->val_pointers[this->num_std_wells_so_far] = this->num_blocks_so_far;
if (this->num_std_wells_so_far == this->num_std_wells - 1) {
this->val_pointers[this->num_std_wells] = this->num_blocks;
cudaMemcpy(d_val_pointers, this->val_pointers.data(),
sizeof(unsigned int) * (this->num_std_wells + 1),
cudaMemcpyHostToDevice);
}
break;
default:
@@ -224,13 +251,16 @@ void WellContributionsCuda::APIaddMatrix(MatrixType type, int *colIndices, doubl
cudaCheckLastError("WellContributions::addMatrix() failed");
}
void WellContributionsCuda::setCudaStream(cudaStream_t stream_)
template<class Scalar>
void WellContributionsCuda<Scalar>::setCudaStream(cudaStream_t stream_)
{
this->stream = stream_;
for (auto& well : multisegments) {
for (auto& well : this->multisegments) {
well->setCudaStream(stream_);
}
}
template class WellContributionsCuda<double>;
} //namespace Opm

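The most delicate part of templating the kernel above is the shared-memory declaration: dynamic shared memory ("extern __shared__") must have a single fixed type across all template instantiations, so the kernel now declares an unsigned char buffer and reinterpret_casts it to Scalar*. A standalone illustration of that pattern, not from the commit (kernel, names and sizes here are invented for the demo):

    #include <cstdio>
    #include <cuda_runtime.h>

    // Dynamic shared memory is declared as an untyped byte array and then
    // reinterpreted, exactly as apply_well_contributions does above.
    template<class Scalar>
    __global__ void scale_by_two(const Scalar* in, Scalar* out, int n)
    {
        extern __shared__ unsigned char smem[];         // untyped dynamic buffer
        Scalar* tmp = reinterpret_cast<Scalar*>(smem);  // view it as Scalar

        const int i = threadIdx.x;
        if (i < n) {
            tmp[i] = in[i] * Scalar(2);
            out[i] = tmp[i];
        }
    }

    int main()
    {
        const int n = 4;
        double h_in[n] = {1.0, 2.0, 3.0, 4.0};
        double h_out[n] = {};
        double* d_in = nullptr;
        double* d_out = nullptr;
        cudaMalloc((void**)&d_in, sizeof(double) * n);
        cudaMalloc((void**)&d_out, sizeof(double) * n);
        cudaMemcpy(d_in, h_in, sizeof(double) * n, cudaMemcpyHostToDevice);

        // Third launch parameter is the dynamic shared-memory size in bytes,
        // analogous to smem_size in WellContributionsCuda::apply().
        scale_by_two<double><<<1, n, sizeof(double) * n>>>(d_in, d_out, n);

        cudaMemcpy(h_out, d_out, sizeof(double) * n, cudaMemcpyDeviceToHost);
        for (int i = 0; i < n; ++i) {
            printf("%g\n", h_out[i]);
        }
        cudaFree(d_in);
        cudaFree(d_out);
        return 0;
    }

The explicit "template class WellContributionsCuda<double>;" at the end of the file keeps the kernel and member definitions inside the .cu translation unit; single-precision support would additionally require an instantiation for float and float-capable callers.
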
View File

@@ -25,10 +25,10 @@
#include <cuda_runtime.h>
namespace Opm
{
namespace Opm {
class WellContributionsCuda : public WellContributions<double>
template<class Scalar>
class WellContributionsCuda : public WellContributions<Scalar>
{
public:
~WellContributionsCuda() override;
@@ -41,33 +41,35 @@ public:
/// performs y -= (C^T * (D^-1 * (B*x))) for all Wells
/// \param[in] d_x vector x, must be on GPU
/// \param[inout] d_y vector y, must be on GPU
void apply(double *d_x, double *d_y);
void apply(Scalar* d_x, Scalar* d_y);
protected:
/// Allocate memory for the StandardWells
void APIalloc() override;
using MatrixType = typename WellContributions<Scalar>::MatrixType;
/// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
/// \param[in] type indicate if C, D or B is sent
/// \param[in] colIndices columnindices of blocks in C or B, ignored for D
/// \param[in] values array of nonzeroes
/// \param[in] val_size number of blocks in C or B, ignored for D
void APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size) override;
void APIaddMatrix(MatrixType type, int* colIndices,
Scalar* values, unsigned int val_size) override;
cudaStream_t stream;
// data for StandardWells, could remain nullptrs if not used
double *d_Cnnzs = nullptr;
double *d_Dnnzs = nullptr;
double *d_Bnnzs = nullptr;
int *d_Ccols = nullptr;
int *d_Bcols = nullptr;
double *d_z1 = nullptr;
double *d_z2 = nullptr;
Scalar* d_Cnnzs = nullptr;
Scalar* d_Dnnzs = nullptr;
Scalar* d_Bnnzs = nullptr;
int* d_Ccols = nullptr;
int* d_Bcols = nullptr;
Scalar* d_z1 = nullptr;
Scalar* d_z2 = nullptr;
unsigned int *d_val_pointers = nullptr;
double* h_x = nullptr;
double* h_y = nullptr;
Scalar* h_x = nullptr;
Scalar* h_y = nullptr;
};
} //namespace Opm

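Two recurring details in this header and in the .cu file above follow directly from the base class now being a dependent type: the MatrixType alias has to be re-imported with "using MatrixType = typename WellContributions<Scalar>::MatrixType;", and member accesses gain a "this->" qualifier, because unqualified names are not looked up in a dependent base. A minimal standalone sketch with invented names, not from the commit:

    // Members of a dependent base are invisible to unqualified lookup; the
    // derived template must qualify them (this->) or re-declare them (using).
    template<class Scalar>
    struct Base
    {
        using MatrixType = int;      // stand-in for the real nested type
        unsigned num_std_wells = 0;
    };

    template<class Scalar>
    struct Derived : Base<Scalar>
    {
        using MatrixType = typename Base<Scalar>::MatrixType;  // re-import the alias

        unsigned count() const
        {
            // return num_std_wells;     // error: not found at template definition time
            return this->num_std_wells;  // OK: lookup deferred to instantiation
        }
    };

    int main()
    {
        return static_cast<int>(Derived<double>{}.count());  // returns 0
    }
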
View File

@@ -105,7 +105,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
float it;
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsCuda&>(wellContribs).setCudaStream(stream);
static_cast<WellContributionsCuda<double>&>(wellContribs).setCudaStream(stream);
}
cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
@@ -149,7 +149,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
// apply wellContributions
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsCuda&>(wellContribs).apply(d_pw, d_v);
static_cast<WellContributionsCuda<double>&>(wellContribs).apply(d_pw, d_v);
}
cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
@@ -180,7 +180,7 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
// apply wellContributions
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsCuda&>(wellContribs).apply(d_s, d_t);
static_cast<WellContributionsCuda<double>&>(wellContribs).apply(d_s, d_t);
}
cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
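
Finally, the solver backend in this last file stays double-only: it calls cusparseDbsrmv and cublasDdot, the double-precision entry points, so the casts merely spell out the template argument that used to be implicit. A hedged sketch of that downcast pattern, assuming wellContribs and stream exist as in the surrounding solver code:

    // The base reference is downcast to the concrete CUDA type so the
    // backend-specific setCudaStream()/apply() members become available.
    if (wellContribs.getNumWells() > 0) {
        auto& cudaWells = static_cast<Opm::WellContributionsCuda<double>&>(wellContribs);
        cudaWells.setCudaStream(stream);
    }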