WellContributions now copies vectors for MultisegmentWells, to reduce the number of copies

This commit is contained in:
T.D. (Tongdong) Qiu 2020-05-18 18:48:58 +02:00
parent 0f273ebdf6
commit f6036ffa8e
4 changed files with 44 additions and 35 deletions

View File

@@ -19,16 +19,10 @@
#include <cstdlib>
#include <cstring>
#include <config.h> // CMake
#include <fstream>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#if HAVE_UMFPACK
#include <dune/istl/umfpack.hh>
#include <opm/simulators/linalg/bda/cuda_header.hpp>
#endif // HAVE_UMFPACK
#include <opm/simulators/linalg/bda/MultisegmentWellContribution.hpp>
@@ -63,33 +57,22 @@ namespace Opm
z1.resize(Mb * dim_wells);
z2.resize(Mb * dim_wells);
// allocate pinned memory on host
cudaMallocHost(&h_x, sizeof(double) * N);
cudaMallocHost(&h_y, sizeof(double) * N);
umfpack_di_symbolic(M, M, Dcols.data(), Drows.data(), Dvals.data(), &UMFPACK_Symbolic, nullptr, nullptr);
umfpack_di_numeric(Dcols.data(), Drows.data(), Dvals.data(), UMFPACK_Symbolic, &UMFPACK_Numeric, nullptr, nullptr);
}
// Destructor: releases the resources acquired in the constructor.
MultisegmentWellContribution::~MultisegmentWellContribution()
{
// free the pinned host buffers allocated with cudaMallocHost in the constructor
cudaFreeHost(h_x);
cudaFreeHost(h_y);
// release the UMFPACK symbolic and numeric factorizations of the D matrix
umfpack_di_free_symbolic(&UMFPACK_Symbolic);
umfpack_di_free_numeric(&UMFPACK_Numeric);
}
// Apply the MultisegmentWellContribution, similar to MultisegmentWell::apply()
// h_x and h_y reside on host
// y -= (C^T * (D^-1 * (B * x)))
void MultisegmentWellContribution::apply(double *d_x, double *d_y)
void MultisegmentWellContribution::apply(double *h_x, double *h_y)
{
// copy vectors x and y from GPU to CPU
cudaMemcpyAsync(h_x, d_x, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
cudaMemcpyAsync(h_y, d_y, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
// reset z1 and z2
std::fill(z1.begin(), z1.end(), 0.0);
std::fill(z2.begin(), z2.end(), 0.0);
@@ -128,10 +111,6 @@ namespace Opm
}
}
}
// copy vector y from CPU to GPU
cudaMemcpyAsync(d_y, h_y, sizeof(double) * N, cudaMemcpyHostToDevice, stream);
cudaStreamSynchronize(stream);
}
void MultisegmentWellContribution::setCudaStream(cudaStream_t stream_)

View File

@@ -47,8 +47,7 @@ namespace Opm
unsigned int M; // number of rows, M == dim_wells*Mb
unsigned int Mb; // number of blockrows in C, D and B
cudaStream_t stream;
double *h_x = nullptr, *h_y = nullptr; // CUDA pinned memory for GPU memcpy
cudaStream_t stream; // not actually used yet, will be when MultisegmentWellContribution are applied on GPU
// C and B are stored in BCRS format, D is stored in CSC format (Dune::UMFPack)
// Sparsity pattern for C is not stored, since it is the same as B
@@ -96,11 +95,11 @@ namespace Opm
/// Destroy a MultisegmentWellContribution, and free memory
~MultisegmentWellContribution();
/// Apply the MultisegmentWellContribution on GPU
/// Apply the MultisegmentWellContribution on CPU
/// performs y -= (C^T * (D^-1 * (B*x))) for MultisegmentWell
/// \param[in] d_x vector x, must be on GPU
/// \param[inout] d_y vector y, must be on GPU
void apply(double *d_x, double *d_y);
/// \param[in] h_x vector x, must be on CPU
/// \param[inout] h_y vector y, must be on CPU
void apply(double *h_x, double *h_y);
};

View File

@@ -127,7 +127,6 @@ namespace Opm
}
void WellContributions::alloc()
{
if (num_std_wells > 0) {
@@ -145,6 +144,12 @@ namespace Opm
WellContributions::~WellContributions()
{
// free pinned memory for MultisegmentWellContributions
if (h_x) {
cudaFreeHost(h_x);
cudaFreeHost(h_y);
}
// delete MultisegmentWellContributions
for (auto ms : multisegments) {
delete ms;
@@ -169,8 +174,29 @@ namespace Opm
void WellContributions::apply(double *d_x, double *d_y)
{
// apply MultisegmentWells
for(MultisegmentWellContribution *well : multisegments){
well->apply(d_x, d_y);
if (num_ms_wells > 0) {
// allocate pinned memory on host if not yet done
if (h_x == nullptr) {
cudaMallocHost(&h_x, sizeof(double) * N);
cudaMallocHost(&h_y, sizeof(double) * N);
}
// make sure the stream is empty to start timing
cudaStreamSynchronize(stream);
// copy vectors x and y from GPU to CPU
cudaMemcpyAsync(h_x, d_x, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
cudaMemcpyAsync(h_y, d_y, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
// actually apply MultisegmentWells
for (MultisegmentWellContribution *well : multisegments) {
well->apply(h_x, h_y);
}
// copy vector y from CPU to GPU
cudaMemcpyAsync(d_y, h_y, sizeof(double) * N, cudaMemcpyHostToDevice, stream);
cudaStreamSynchronize(stream);
}
// apply StandardWells
@@ -243,6 +269,7 @@ namespace Opm
unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
double *Cvalues)
{
this->N = Nb * dim;
MultisegmentWellContribution *well = new MultisegmentWellContribution(dim, dim_wells, Nb, Mb, BnumBlocks, Bvalues, BcolIndices, BrowPointers, DnumBlocks, Dvalues, DcolPointers, DrowIndices, Cvalues);
multisegments.emplace_back(well);
++num_ms_wells;

View File

@@ -63,6 +63,7 @@ namespace Opm
unsigned int num_blocks_so_far = 0; // keep track of where next data is written
unsigned int num_std_wells_so_far = 0; // keep track of where next data is written
unsigned int *val_pointers = nullptr; // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
unsigned int N; // number of rows (not blockrows) in vectors x and y
bool allocated = false;
std::vector<MultisegmentWellContribution*> multisegments;
cudaStream_t stream;
@@ -77,6 +78,9 @@ namespace Opm
double *d_z2 = nullptr;
unsigned int *d_val_pointers = nullptr;
double *h_x = nullptr, *h_y = nullptr; // CUDA pinned memory for GPU memcpy
public:
/// StandardWell has C, D and B matrices that need to be copied
@@ -98,9 +102,9 @@ namespace Opm
/// Apply all Wells in this object
/// performs y -= (C^T * (D^-1 * (B*x))) for all Wells
/// \param[in] x vector x
/// \param[inout] y vector y
void apply(double *x, double *y);
/// \param[in] d_x vector x, must be on GPU
/// \param[inout] d_y vector y, must be on GPU
void apply(double *d_x, double *d_y);
/// Allocate memory for the StandardWells
void alloc();