rocsparseWellContributions: template Scalar type

Arne Morten Kvarving 2024-04-15 16:57:54 +02:00
parent ddcfcf681d
commit d2637e53ea
5 changed files with 125 additions and 85 deletions

View File

@@ -60,10 +60,10 @@ WellContributions<Scalar>::create(const std::string& accelerator_mode, bool useW
OPM_THROW(std::runtime_error, "Cannot initialize well contributions: OpenCL is not enabled");
#endif
}
else if(accelerator_mode.compare("rocsparse") == 0){
else if (accelerator_mode.compare("rocsparse") == 0) {
if (!useWellConn) {
#if HAVE_ROCSPARSE
return std::make_unique<WellContributionsRocsparse>();
return std::make_unique<WellContributionsRocsparse<Scalar>>();
#else
OPM_THROW(std::runtime_error, "Cannot initialize well contributions: rocsparse is not enabled");
#endif

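For orientation: the hunk above is the factory in WellContributions<Scalar>::create() that now constructs the Scalar-templated rocsparse backend. A minimal sketch of the same dispatch idea, with illustrative class and function names rather than the real OPM API, could look like this:

#include <memory>
#include <stdexcept>
#include <string>

template<class Scalar>
struct WellContribsBase {
    virtual ~WellContribsBase() = default;
};

template<class Scalar>
struct WellContribsRocsparse : WellContribsBase<Scalar> {};

// Pick a backend by name and hand it back through the Scalar-templated base type.
template<class Scalar>
std::unique_ptr<WellContribsBase<Scalar>>
createWellContribs(const std::string& accelerator_mode)
{
    if (accelerator_mode == "rocsparse") {
        return std::make_unique<WellContribsRocsparse<Scalar>>();
    }
    throw std::runtime_error("unknown accelerator mode: " + accelerator_mode);
}

// usage: auto wells = createWellContribs<double>("rocsparse");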
View File

@@ -308,10 +308,10 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
}
// apply wellContributions
if (wellContribs.getNumWells() > 0){
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsOCL<double>&>(wellContribs).apply(d_pw, d_v);
}
if(verbosity >= 3) {
if (verbosity >= 3) {
queue->finish();
t_well.stop();
t_rest.start();

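The OpenCL solver above holds only the type-erased WellContributions<double>& and downcasts to the concrete backend before calling its apply(). A small, self-contained illustration of that base-reference/static_cast pattern (stand-in types, not the real OPM classes):

#include <iostream>

template<class Scalar>
struct Base {
    virtual ~Base() = default;
    unsigned getNumWells() const { return numWells; }
    unsigned numWells = 0;
};

template<class Scalar>
struct Backend : Base<Scalar> {
    // Backend-specific apply(); here just y[i] += x[i] as a placeholder.
    void apply(const Scalar* x, Scalar* y, int n) const {
        for (int i = 0; i < n; ++i) { y[i] += x[i]; }
    }
};

int main()
{
    Backend<double> impl;
    impl.numWells = 1;
    Base<double>& wells = impl;   // the solver only sees the base reference

    double x[3] = {1.0, 2.0, 3.0};
    double y[3] = {0.0, 0.0, 0.0};
    if (wells.getNumWells() > 0) {
        // Safe: the chosen accelerator mode guarantees the dynamic type.
        static_cast<Backend<double>&>(wells).apply(x, y, 3);
    }
    std::cout << y[0] << ' ' << y[1] << ' ' << y[2] << '\n';   // prints 1 2 3
}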
View File

@@ -164,8 +164,8 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<double>& wellContribs,
// set stream here, the WellContributions object is destroyed every linear solve
// the number of wells can change every linear solve
if(wellContribs.getNumWells() > 0){
static_cast<WellContributionsRocsparse&>(wellContribs).setStream(stream);
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsRocsparse<double>&>(wellContribs).setStream(stream);
}
// HIP_VERSION is defined as (HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH)
@@ -254,8 +254,8 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<double>& wellContribs,
}
// apply wellContributions
if(wellContribs.getNumWells() > 0){
static_cast<WellContributionsRocsparse&>(wellContribs).apply(d_pw, d_v);
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsRocsparse<double>&>(wellContribs).apply(d_pw, d_v);
}
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
@@ -313,15 +313,15 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<double>& wellContribs,
d_Avals, d_Arows, d_Acols, block_size,
d_s, &zero, d_t));
#endif
if(verbosity >= 3){
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
t_spmv.stop();
t_well.start();
}
// apply wellContributions
if(wellContribs.getNumWells() > 0){
static_cast<WellContributionsRocsparse&>(wellContribs).apply(d_s, d_t);
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsRocsparse<double>&>(wellContribs).apply(d_s, d_t);
}
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));

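Taken together, the hunks above follow one pattern per linear solve: the freshly created well-contributions object is handed the solver's HIP stream, and after each sparse matrix-vector product the well terms are applied to the result on that same stream, synchronizing only when per-phase timing is requested (verbosity >= 3). A condensed sketch of that flow against the WellContributionsRocsparse interface shown below, with the rocSPARSE calls and timers omitted:

template<class Scalar>
void solveIteration(WellContributionsRocsparse<Scalar>& wellContribs,
                    hipStream_t stream,
                    Scalar* d_pw, Scalar* d_v, Scalar* d_s, Scalar* d_t)
{
    // The WellContributions object is destroyed after every linear solve,
    // so the stream has to be handed over again each time.
    if (wellContribs.getNumWells() > 0) {
        wellContribs.setStream(stream);
    }

    // ... rocsparse SpMV: d_v = A * d_pw ...
    if (wellContribs.getNumWells() > 0) {
        wellContribs.apply(d_pw, d_v);   // fold the well terms into the SpMV result
    }

    // ... rocsparse SpMV: d_t = A * d_s ...
    if (wellContribs.getNumWells() > 0) {
        wellContribs.apply(d_s, d_t);
    }
}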
View File

@@ -56,17 +56,17 @@ namespace Opm
#ifdef __HIP__
/// HIP kernel to apply the standard wellcontributions
__global__ void stdwell_apply(
const double *Cnnzs,
const double *Dnnzs,
const double *Bnnzs,
const unsigned *Ccols,
const unsigned *Bcols,
const double *x,
double *y,
const unsigned dim,
const unsigned dim_wells,
const unsigned *val_pointers)
template<class Scalar>
__global__ void stdwell_apply(const Scalar* Cnnzs,
const Scalar* Dnnzs,
const Scalar* Bnnzs,
const unsigned* Ccols,
const unsigned* Bcols,
const Scalar* x,
Scalar* y,
const unsigned dim,
const unsigned dim_wells,
const unsigned *val_pointers)
{
unsigned wgId = blockIdx.x;
unsigned wiId = threadIdx.x;
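The hunk above changes the kernel itself from hard-coded double parameters to a Scalar template parameter. As a self-contained toy showing what a Scalar-templated HIP kernel and its launch look like in isolation (an independent example, not the stdwell_apply kernel):

#include <hip/hip_runtime.h>

// Toy Scalar-templated kernel: y[i] += a * x[i].
template<class Scalar>
__global__ void axpy(Scalar a, const Scalar* x, Scalar* y, unsigned n)
{
    unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        y[i] += a * x[i];
    }
}

int main()
{
    const unsigned n = 256;
    double *d_x, *d_y;
    hipMalloc((void**)&d_x, n * sizeof(double));
    hipMalloc((void**)&d_y, n * sizeof(double));
    hipMemset(d_x, 0, n * sizeof(double));
    hipMemset(d_y, 0, n * sizeof(double));

    hipStream_t stream;
    hipStreamCreate(&stream);

    // Same launch form as in the diff: <<<grid, block, dynamic shared memory, stream>>>.
    axpy<double><<<dim3((n + 63) / 64), dim3(64), 0, stream>>>(2.0, d_x, d_y, n);
    hipStreamSynchronize(stream);

    hipStreamDestroy(stream);
    hipFree(d_x);
    hipFree(d_y);
    return 0;
}
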
@@ -76,16 +76,16 @@ __global__ void stdwell_apply(
unsigned numBlocksPerWarp = blockDim.x/valsPerBlock;
unsigned c = wiId % dim;
unsigned r = (wiId/dim) % dim_wells;
double temp;
Scalar temp;
extern __shared__ double localSum[];
double *z1 = localSum + gridDim.x;
double *z2 = z1 + dim_wells;
extern __shared__ Scalar localSum[];
Scalar* z1 = localSum + gridDim.x;
Scalar* z2 = z1 + dim_wells;
localSum[wiId] = 0;
if(wiId < numActiveWorkItems){
if (wiId < numActiveWorkItems) {
unsigned b = wiId/valsPerBlock + val_pointers[wgId];
while(b < valSize + val_pointers[wgId]){
while (b < valSize + val_pointers[wgId]) {
int colIdx = Bcols[b];
localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
b += numBlocksPerWarp;
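The dynamic shared-memory scratch space is now declared with the templated element type as well, and the host side (further down in this diff) sizes it as (blockDim + 2 * dim_wells) * sizeof(Scalar); for Scalar = double with blockDim = 64 and, say, dim_wells = 4 that is (64 + 8) * 8 = 576 bytes per block. A side note, since only the double instantiation exists after this commit: an extern __shared__ array whose element type depends on a template parameter is known to trigger redeclaration errors on some CUDA/HIP compilers once the kernel is instantiated for more than one type in a translation unit. A commonly used workaround is to declare the raw bytes once and reinterpret them, roughly:

// Hypothetical helper: type-safe view of the dynamic shared-memory block.
template<class Scalar>
__device__ Scalar* sharedMemAs()
{
    extern __shared__ unsigned char rawShared[];
    return reinterpret_cast<Scalar*>(rawShared);
}

// Inside the kernel one would then write, e.g.:
//   Scalar* localSum = sharedMemAs<Scalar>();
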
@@ -99,14 +99,14 @@ __global__ void stdwell_apply(
// 6 7 8 18 19 20
// 9 10 11 21 22 23
// workitem i will hold the sum of workitems i and i + valsPerBlock
if(wiId < valsPerBlock){
if (wiId < valsPerBlock){
for (unsigned i = 1; i < numBlocksPerWarp; ++i) {
localSum[wiId] += localSum[wiId + i*valsPerBlock];
}
}
if(c == 0 && wiId < valsPerBlock){
for(unsigned i = dim - 1; i > 0; --i){
if (c == 0 && wiId < valsPerBlock){
for (unsigned i = dim - 1; i > 0; --i) {
localSum[wiId] += localSum[wiId + i];
}
z1[r] = localSum[wiId];
@@ -117,7 +117,7 @@ __global__ void stdwell_apply(
if(wiId < dim_wells){
temp = 0.0;
for(unsigned i = 0; i < dim_wells; ++i){
for (unsigned i = 0; i < dim_wells; ++i) {
temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
}
z2[wiId] = temp;
@@ -125,10 +125,10 @@ __global__ void stdwell_apply(
__syncthreads();
if(wiId < dim*valSize){
if (wiId < dim*valSize){
temp = 0.0;
unsigned bb = wiId/dim + val_pointers[wgId];
for (unsigned j = 0; j < dim_wells; ++j){
for (unsigned j = 0; j < dim_wells; ++j) {
temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
}
@@ -138,17 +138,26 @@ __global__ void stdwell_apply(
}
#endif
void WellContributionsRocsparse::apply_stdwells([[maybe_unused]] double *d_x,
[[maybe_unused]] double *d_y){
template<class Scalar>
void WellContributionsRocsparse<Scalar>::
apply_stdwells([[maybe_unused]] Scalar* d_x,
[[maybe_unused]] Scalar* d_y)
{
#ifdef __HIP__
unsigned gridDim = num_std_wells;
unsigned blockDim = 64;
unsigned shared_mem_size = (blockDim + 2 * dim_wells) * sizeof(double); // shared memory for localSum, z1 and z2
unsigned shared_mem_size = (blockDim + 2 * dim_wells) * sizeof(Scalar); // shared memory for localSum, z1 and z2
// dim3(N) will create a vector {N, 1, 1}
stdwell_apply<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(
d_Cnnzs_hip, d_Dnnzs_hip, d_Bnnzs_hip, d_Ccols_hip, d_Bcols_hip,
d_x, d_y, dim, dim_wells, d_val_pointers_hip
stdwell_apply<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(d_Cnnzs_hip,
d_Dnnzs_hip,
d_Bnnzs_hip,
d_Ccols_hip,
d_Bcols_hip,
d_x,
d_y,
dim,
dim_wells,
d_val_pointers_hip
);
HIP_CHECK(hipStreamSynchronize(stream));
#else
@@ -156,67 +165,89 @@ void WellContributionsRocsparse::apply_stdwells([[maybe_unused]] double *d_x,
#endif
}
void WellContributionsRocsparse::apply_mswells(double *d_x, double *d_y){
template<class Scalar>
void WellContributionsRocsparse<Scalar>::
apply_mswells(Scalar* d_x, Scalar* d_y)
{
if (h_x.empty()) {
h_x.resize(N);
h_y.resize(N);
h_x.resize(this->N);
h_y.resize(this->N);
}
HIP_CHECK(hipMemcpyAsync(h_x.data(), d_x, sizeof(double) * N, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipMemcpyAsync(h_y.data(), d_y, sizeof(double) * N, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipMemcpyAsync(h_x.data(), d_x, sizeof(Scalar) * this->N, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipMemcpyAsync(h_y.data(), d_y, sizeof(Scalar) * this->N, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipStreamSynchronize(stream));
// actually apply MultisegmentWells
for (auto& well : multisegments) {
for (auto& well : this->multisegments) {
well->apply(h_x.data(), h_y.data());
}
// copy vector y from CPU to GPU
HIP_CHECK(hipMemcpyAsync(d_y, h_y.data(), sizeof(double) * N, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_y, h_y.data(), sizeof(Scalar) * this->N, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipStreamSynchronize(stream));
}
void WellContributionsRocsparse::apply(double *d_x, double *d_y){
if(num_std_wells > 0){
template<class Scalar>
void WellContributionsRocsparse<Scalar>::
apply(Scalar* d_x, Scalar* d_y)
{
if (this->num_std_wells > 0) {
apply_stdwells(d_x, d_y);
}
if(num_ms_wells > 0){
if (this->num_ms_wells > 0) {
apply_mswells(d_x, d_y);
}
}
void WellContributionsRocsparse::setStream(hipStream_t stream_){
template<class Scalar>
void WellContributionsRocsparse<Scalar>::setStream(hipStream_t stream_)
{
stream = stream_;
}
void WellContributionsRocsparse::APIaddMatrix(MatrixType type,
int* colIndices,
double* values,
unsigned int val_size)
template<class Scalar>
void WellContributionsRocsparse<Scalar>::
APIaddMatrix(MatrixType type,
int* colIndices,
Scalar* values,
unsigned int val_size)
{
if (!allocated) {
if (!this->allocated) {
OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
}
switch (type) {
case MatrixType::C:
HIP_CHECK(hipMemcpyAsync(d_Cnnzs_hip + num_blocks_so_far * dim * dim_wells, values, sizeof(d_Cnnzs_hip) * val_size * dim * dim_wells, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Ccols_hip + num_blocks_so_far, colIndices, sizeof(d_Ccols_hip) * val_size, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Cnnzs_hip + this->num_blocks_so_far * this->dim * this->dim_wells,
values, sizeof(d_Cnnzs_hip) * val_size * this->dim * this->dim_wells,
hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Ccols_hip + this->num_blocks_so_far, colIndices,
sizeof(d_Ccols_hip) * val_size,
hipMemcpyHostToDevice, stream));
break;
case MatrixType::D:
HIP_CHECK(hipMemcpyAsync(d_Dnnzs_hip + num_std_wells_so_far * dim_wells * dim_wells, values, sizeof(d_Dnnzs_hip) * dim_wells * dim_wells, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Dnnzs_hip + this->num_std_wells_so_far * this->dim_wells * this->dim_wells,
values, sizeof(d_Dnnzs_hip) * this->dim_wells * this->dim_wells,
hipMemcpyHostToDevice, stream));
break;
case MatrixType::B:
HIP_CHECK(hipMemcpyAsync(d_Bnnzs_hip + num_blocks_so_far * dim * dim_wells, values, sizeof(d_Bnnzs_hip) * val_size * dim * dim_wells, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Bcols_hip + num_blocks_so_far, colIndices, sizeof(d_Bcols_hip) * val_size, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Bnnzs_hip + this->num_blocks_so_far * this->dim * this->dim_wells,
values, sizeof(d_Bnnzs_hip) * val_size * this->dim * this->dim_wells,
hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Bcols_hip + this->num_blocks_so_far, colIndices,
sizeof(d_Bcols_hip) * val_size,
hipMemcpyHostToDevice, stream));
val_pointers[num_std_wells_so_far] = num_blocks_so_far;
if (num_std_wells_so_far == num_std_wells - 1) {
val_pointers[num_std_wells] = num_blocks;
HIP_CHECK(hipMemcpyAsync(d_val_pointers_hip, val_pointers.data(), sizeof(d_val_pointers_hip) * (num_std_wells + 1), hipMemcpyHostToDevice, stream));
this->val_pointers[this->num_std_wells_so_far] = this->num_blocks_so_far;
if (this->num_std_wells_so_far == this->num_std_wells - 1) {
this->val_pointers[this->num_std_wells] = this->num_blocks;
HIP_CHECK(hipMemcpyAsync(d_val_pointers_hip, this->val_pointers.data(),
sizeof(d_val_pointers_hip) * (this->num_std_wells + 1),
hipMemcpyHostToDevice, stream));
}
break;
@@ -226,14 +257,21 @@ void WellContributionsRocsparse::APIaddMatrix(MatrixType type,
HIP_CHECK(hipStreamSynchronize(stream));
}
void WellContributionsRocsparse::APIalloc()
template<class Scalar>
void WellContributionsRocsparse<Scalar>::APIalloc()
{
HIP_CHECK(hipMalloc((void**)&d_Cnnzs_hip, sizeof(d_Cnnzs_hip) * num_blocks * dim * dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Dnnzs_hip, sizeof(d_Dnnzs_hip) * num_std_wells * dim_wells * dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Bnnzs_hip, sizeof(d_Bnnzs_hip) * num_blocks * dim * dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Ccols_hip, sizeof(d_Ccols_hip) * num_blocks));
HIP_CHECK(hipMalloc((void**)&d_Bcols_hip, sizeof(d_Bcols_hip) * num_blocks));
HIP_CHECK(hipMalloc((void**)&d_val_pointers_hip, sizeof(d_val_pointers_hip) * (num_std_wells + 1)));
HIP_CHECK(hipMalloc((void**)&d_Cnnzs_hip,
sizeof(d_Cnnzs_hip) * this->num_blocks * this->dim * this->dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Dnnzs_hip,
sizeof(d_Dnnzs_hip) * this->num_std_wells * this->dim_wells * this->dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Bnnzs_hip,
sizeof(d_Bnnzs_hip) * this->num_blocks * this->dim * this->dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Ccols_hip, sizeof(d_Ccols_hip) * this->num_blocks));
HIP_CHECK(hipMalloc((void**)&d_Bcols_hip, sizeof(d_Bcols_hip) * this->num_blocks));
HIP_CHECK(hipMalloc((void**)&d_val_pointers_hip,
sizeof(d_val_pointers_hip) * (this->num_std_wells + 1)));
}
} //namespace Opm
template class WellContributionsRocsparse<double>;
} // namespace Opm
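Two consequences of templating the class are visible throughout this file. First, members inherited from the now-dependent base class WellContributions<Scalar> must be reached through this-> (hence N becoming this->N, num_std_wells becoming this->num_std_wells, and so on), because unqualified names are not looked up in a dependent base during the first phase of template name lookup. Second, since the member definitions stay in the .cpp file, it ends with an explicit instantiation for double. A stripped-down illustration of both points:

template<class Scalar>
struct Base {
    unsigned N = 0;   // member of what becomes a dependent base class
};

template<class Scalar>
struct Derived : Base<Scalar> {
    Scalar sum(const Scalar* x) const
    {
        Scalar s = 0;
        // Plain 'N' would not be found here: the base depends on Scalar,
        // so 'this->N' defers the lookup to instantiation time.
        for (unsigned i = 0; i < this->N; ++i) {
            s += x[i];
        }
        return s;
    }
};

// With the definitions in a .cpp file, an explicit instantiation makes the
// double version available to other translation units, as done above.
template struct Derived<double>;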

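apply_mswells, in the large hunk above, also shows the host round-trip pattern used for multisegment wells: x and y are copied device-to-host with hipMemcpyAsync on the object's stream, the wells are applied on the CPU, and only y is copied back, with hipStreamSynchronize guarding each direction. A condensed sketch of that flow (applyOnHost and the staging buffers are illustrative stand-ins, not the OPM code):

#include <hip/hip_runtime.h>
#include <cstddef>
#include <vector>

// Sketch: run a host-only operation in the middle of a device pipeline.
template<class Scalar>
void hostRoundTrip(Scalar* d_x, Scalar* d_y, std::size_t n, hipStream_t stream,
                   void (*applyOnHost)(const Scalar*, Scalar*, std::size_t))
{
    std::vector<Scalar> h_x(n), h_y(n);   // host staging buffers

    // device -> host
    hipMemcpyAsync(h_x.data(), d_x, sizeof(Scalar) * n, hipMemcpyDeviceToHost, stream);
    hipMemcpyAsync(h_y.data(), d_y, sizeof(Scalar) * n, hipMemcpyDeviceToHost, stream);
    hipStreamSynchronize(stream);          // data must have arrived before the CPU touches it

    applyOnHost(h_x.data(), h_y.data(), n);   // CPU-side apply (hypothetical)

    // host -> device: only y was modified
    hipMemcpyAsync(d_y, h_y.data(), sizeof(Scalar) * n, hipMemcpyHostToDevice, stream);
    hipStreamSynchronize(stream);
}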
View File

@@ -26,33 +26,35 @@
#include <vector>
namespace Opm {
namespace Opm
{
class WellContributionsRocsparse : public WellContributions<double>
template<class Scalar>
class WellContributionsRocsparse : public WellContributions<Scalar>
{
private:
hipStream_t stream;
public:
void apply_stdwells(double *d_x, double *d_y);
void apply_mswells(double *d_x, double *d_y);
void apply(double *d_x, double *d_y);
void apply_stdwells(Scalar* d_x, Scalar* d_y);
void apply_mswells(Scalar* d_x, Scalar* d_y);
void apply(Scalar* d_x, Scalar* d_y);
void setStream(hipStream_t stream);
protected:
/// Allocate memory for the StandardWells
void APIalloc() override;
void APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size) override;
using MatrixType = typename WellContributions<Scalar>::MatrixType;
double *d_Cnnzs_hip, *d_Dnnzs_hip, *d_Bnnzs_hip;
void APIaddMatrix(MatrixType type, int* colIndices,
Scalar* values, unsigned int val_size) override;
Scalar *d_Cnnzs_hip, *d_Dnnzs_hip, *d_Bnnzs_hip;
unsigned *d_Ccols_hip, *d_Bcols_hip;
unsigned *d_val_pointers_hip;
std::vector<double> h_x;
std::vector<double> h_y;
std::vector<Scalar> h_x;
std::vector<Scalar> h_y;
};
} //namespace Opm
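In the header, the nested enum MatrixType likewise becomes a dependent name, which is why the class pulls it in with using MatrixType = typename WellContributions<Scalar>::MatrixType before using it in the APIaddMatrix override, and the device pointers and host staging vectors switch from double to Scalar. A minimal illustration of the dependent-type alias (toy types, not the OPM classes):

template<class Scalar>
struct BaseWithEnum {
    enum class MatrixType { C, D, B };
};

template<class Scalar>
struct DerivedUser : BaseWithEnum<Scalar> {
    // The nested name depends on Scalar; 'typename' tells the compiler it is a
    // type, and the alias keeps the member declarations below short.
    using MatrixType = typename BaseWithEnum<Scalar>::MatrixType;

    void addMatrix(MatrixType type, const Scalar* values, unsigned n);
};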