Split WellContributions into .cpp and .cu

2025-02-25 18:55:30 -06:00 · 2020-07-10 11:13:55 +02:00 · 2020-07-10 11:13:55 +02:00 · 5971a7ae9e
commit 5971a7ae9e
parent f90bb85960
5 changed files with 183 additions and 125 deletions
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@ -42,6 +42,7 @@ list (APPEND MAIN_SOURCE_FILES

 if(CUDA_FOUND)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cusparseSolverBackend.cu)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cu)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
@ -53,7 +54,7 @@ if(OPENCL_FOUND)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cu)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
 endif()
 if(MPI_FOUND)
--- a/opm/simulators/linalg/ISTLSolverEbos.hpp
+++ b/opm/simulators/linalg/ISTLSolverEbos.hpp
@ -332,7 +332,7 @@ protected:
 #else
            const std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
            if (gpu_mode.compare("none") != 0) {
-                OPM_THROW(std::logic_error,"Error cannot use GPU solver since CUDA was not found during compilation");
+                OPM_THROW(std::logic_error,"Error cannot use GPU solver since neither CUDA nor OpenCL was not found by cmake");
            }
 #endif
            extractParallelGridInformationToISTL(simulator_.vanguard().grid(), parallelInformation_);
--- a/opm/simulators/linalg/bda/WellContributions.cpp
+++ b/opm/simulators/linalg/bda/WellContributions.cpp
@ -0,0 +1,150 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#include <config.h> // CMake
+#include <cstdlib>
+#include <cstring>
+
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+#include "opm/simulators/linalg/bda/WellContributions.hpp"
+
+namespace Opm
+{
+
+
+void WellContributions::alloc()
+{
+    if (num_std_wells > 0) {
+#if HAVE_CUDA
+        allocStandardWells();
+#else
+        OPM_THROW(std::logic_error, "Error cannot allocate on GPU for StandardWells because CUDA was not found by cmake");
+#endif
+    }
+}
+
+WellContributions::~WellContributions()
+{
+    if (h_x) {
+        delete[] h_x;
+        delete[] h_y;
+    }
+
+    // delete MultisegmentWellContributions
+    for (auto ms : multisegments) {
+        delete ms;
+    }
+    multisegments.clear();
+
+#if HAVE_CUDA
+    freeStandardWells();
+#endif
+}
+
+
+#if HAVE_OPENCL
+void WellContributions::apply(cl::Buffer& d_x, cl::Buffer& d_y) {
+
+    // apply MultisegmentWells
+    if (num_ms_wells > 0) {
+        // allocate pinned memory on host if not yet done
+        if (h_x == nullptr) {
+            h_x = new double[N];
+            h_y = new double[N];
+        }
+
+        // copy vectors x and y from GPU to CPU
+        queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, h_x);
+        queue->enqueueReadBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y);
+
+        // actually apply MultisegmentWells
+        for (MultisegmentWellContribution *well : multisegments) {
+            well->apply(h_x, h_y);
+        }
+
+        // copy vector y from CPU to GPU
+        queue->enqueueWriteBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y);
+    }
+
+
+    // apply StandardWells
+    if (num_std_wells > 0) {
+        OPM_THROW(std::logic_error, "Error StandardWells are not supported by openclSolver");
+    }
+}
+#endif
+
+void WellContributions::addMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size)
+{
+#if HAVE_CUDA
+        addMatrixGpu(type, colIndices, values, val_size);
+#else
+        OPM_THROW(std::logic_error, "Error cannot add StandardWell matrix on GPU because CUDA was not found by cmake");
+#endif
+}
+
+
+void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
+{
+    dim = dim_;
+    dim_wells = dim_wells_;
+}
+
+void WellContributions::addNumBlocks(unsigned int nnz)
+{
+    if (allocated) {
+        OPM_THROW(std::logic_error, "Error cannot add more sizes after allocated in WellContributions");
+    }
+    num_blocks += nnz;
+    num_std_wells++;
+}
+
+void WellContributions::addMultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
+        unsigned int Nb, unsigned int Mb,
+        unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
+        unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
+        std::vector<double> &Cvalues)
+{
+    this->N = Nb * dim;
+    MultisegmentWellContribution *well = new MultisegmentWellContribution(dim, dim_wells, Nb, Mb, BnumBlocks, Bvalues, BcolIndices, BrowPointers, DnumBlocks, Dvalues, DcolPointers, DrowIndices, Cvalues);
+    multisegments.emplace_back(well);
+    ++num_ms_wells;
+}
+
+
+void WellContributions::setReordering(int *toOrder_, bool reorder_)
+{
+    this->toOrder = toOrder_;
+    this->reorder = reorder_;
+    for (auto& ms : multisegments) {
+        ms->setReordering(toOrder_, reorder_);
+    }
+}
+
+#if HAVE_OPENCL
+void WellContributions::setOpenCLQueue(cl::CommandQueue *queue_)
+{
+    this->queue = queue_;
+}
+#endif
+
+} //namespace Opm
+
--- a/opm/simulators/linalg/bda/WellContributions.cu
+++ b/opm/simulators/linalg/bda/WellContributions.cu
@ -22,10 +22,8 @@
 #include <cstdlib>
 #include <cstring>

-#if HAVE_CUDA
 #include "opm/simulators/linalg/bda/cuda_header.hpp"
 #include <cuda_runtime.h>
-#endif

 #include <opm/common/OpmLog/OpmLog.hpp>
 #include <opm/common/ErrorMacros.hpp>
@ -35,7 +33,6 @@ namespace Opm
 {

 // apply WellContributions using y -= C^T * (D^-1 * (B * x))
-#if HAVE_CUDA
 __global__ void apply_well_contributions(
    const double * __restrict__ Cnnzs,
    const double * __restrict__ Dnnzs,
@ -127,12 +124,10 @@ __global__ void apply_well_contributions(
    }

 }
-#endif

-void WellContributions::alloc()
+void WellContributions::allocStandardWells()
 {
    if (num_std_wells > 0) {
-#if HAVE_CUDA
        cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
        cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells);
        cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
@ -142,34 +137,12 @@ void WellContributions::alloc()
        cudaMalloc((void**)&d_val_pointers, sizeof(int) * (num_std_wells + 1));
        cudaCheckLastError("apply_gpu malloc failed");
        allocated = true;
-#endif
    }
 }

-WellContributions::~WellContributions()
-{
-    // free pinned memory for MultisegmentWellContributions
-    if (h_x) {
-        if (host_mem_cuda) {
-#if HAVE_CUDA
-            cudaFreeHost(h_x);
-            cudaFreeHost(h_y);
-#endif
-        } else {
-            delete[] h_x;
-            delete[] h_y;
-        }
-    }
-
-    // delete MultisegmentWellContributions
-    for (auto ms : multisegments) {
-        delete ms;
-    }
-    multisegments.clear();
-
+void WellContributions::freeStandardWells() {
    // delete data for StandardWell
    if (num_std_wells > 0) {
-#if HAVE_CUDA
        cudaFree(d_Cnnzs);
        cudaFree(d_Dnnzs);
        cudaFree(d_Bnnzs);
@ -177,14 +150,12 @@ WellContributions::~WellContributions()
        cudaFree(d_Bcols);
        delete[] val_pointers;
        cudaFree(d_val_pointers);
-#endif
    }
 }


 // Apply the WellContributions, similar to StandardWell::apply()
 // y -= (C^T *(D^-1*(   B*x)))
-#if HAVE_CUDA
 void WellContributions::apply(double *d_x, double *d_y)
 {
    // apply MultisegmentWells
@ -221,48 +192,14 @@ void WellContributions::apply(double *d_x, double *d_y)
        apply_well_contributions <<< num_std_wells, 32, smem_size, stream>>>(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_x, d_y, dim, dim_wells, d_val_pointers);
    }
 }
-#endif
-
-#if HAVE_OPENCL
-void WellContributions::apply(cl::Buffer& d_x, cl::Buffer& d_y) {
-
-    // apply MultisegmentWells
-    if (num_ms_wells > 0) {
-        // allocate pinned memory on host if not yet done
-        if (h_x == nullptr) {
-            h_x = new double[N];
-            h_y = new double[N];
-        }
-
-        // copy vectors x and y from GPU to CPU
-        queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, h_x);
-        queue->enqueueReadBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y);
-
-        // actually apply MultisegmentWells
-        for (MultisegmentWellContribution *well : multisegments) {
-            well->apply(h_x, h_y);
-        }
-
-        // copy vector y from CPU to GPU
-        queue->enqueueWriteBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y);
-    }


-    // apply StandardWells
-    if (num_std_wells > 0) {
-        OPM_THROW(std::logic_error, "Error StandardWells are not supported by openclSolver");
-    }
-}
-#endif
-
-
-void WellContributions::addMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size)
+void WellContributions::addMatrixGpu(MatrixType type, int *colIndices, double *values, unsigned int val_size)
 {
    if (!allocated) {
        OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
    }

-#if HAVE_CUDA
    switch (type) {
    case MatrixType::C:
        cudaMemcpy(d_Cnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
@ -288,12 +225,8 @@ void WellContributions::addMatrix(MatrixType type, int *colIndices, double *valu
        num_blocks_so_far += val_size;
        num_std_wells_so_far++;
    }
-#else
-    OPM_THROW(std::logic_error, "Error cannot use StandardWells since CUDA was not found");
-#endif
 }

-#if HAVE_CUDA
 void WellContributions::setCudaStream(cudaStream_t stream_)
 {
    this->stream = stream_;
@ -301,51 +234,6 @@ void WellContributions::setCudaStream(cudaStream_t stream_)
        well->setCudaStream(stream_);
    }
 }
-#endif
-
-void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
-{
-    dim = dim_;
-    dim_wells = dim_wells_;
-}
-
-void WellContributions::addNumBlocks(unsigned int nnz)
-{
-    if (allocated) {
-        OPM_THROW(std::logic_error, "Error cannot add more sizes after allocated in WellContributions");
-    }
-    num_blocks += nnz;
-    num_std_wells++;
-}
-
-void WellContributions::addMultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
-        unsigned int Nb, unsigned int Mb,
-        unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
-        unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
-        std::vector<double> &Cvalues)
-{
-    this->N = Nb * dim;
-    MultisegmentWellContribution *well = new MultisegmentWellContribution(dim, dim_wells, Nb, Mb, BnumBlocks, Bvalues, BcolIndices, BrowPointers, DnumBlocks, Dvalues, DcolPointers, DrowIndices, Cvalues);
-    multisegments.emplace_back(well);
-    ++num_ms_wells;
-}
-
-
-void WellContributions::setReordering(int *toOrder_, bool reorder_)
-{
-    this->toOrder = toOrder_;
-    this->reorder = reorder_;
-    for (auto& ms : multisegments) {
-        ms->setReordering(toOrder_, reorder_);
-    }
-}
-
-#if HAVE_OPENCL
-void WellContributions::setOpenCLQueue(cl::CommandQueue *queue_)
-{
-    this->queue = queue_;
-}
-#endif

 } //namespace Opm

--- a/opm/simulators/linalg/bda/WellContributions.hpp
+++ b/opm/simulators/linalg/bda/WellContributions.hpp
@ -39,7 +39,8 @@ namespace Opm
 /// This class serves to eliminate the need to include the WellContributions into the matrix (with --matrix-add-well-contributions=true) for the cusparseSolver
 /// If the --matrix-add-well-contributions commandline parameter is true, this class should not be used
 /// So far, StandardWell and MultisegmentWell are supported
-/// A single instance (or pointer) of this class is passed to the cusparseSolver.
+/// StandardWells are only supported for cusparseSolver (CUDA), MultisegmentWells are supported for both cusparseSolver and openclSolver
+/// A single instance (or pointer) of this class is passed to the BdaSolver.
 /// For StandardWell, this class contains all the data and handles the computation. For MultisegmentWell, the vector 'multisegments' contains all the data. For more information, check the MultisegmentWellContribution class.

 /// A StandardWell uses C, D and B and performs y -= (C^T * (D^-1 * (B*x)))
@ -55,6 +56,15 @@ namespace Opm
 class WellContributions
 {

+public:
+
+    /// StandardWell has C, D and B matrices that need to be copied
+    enum class MatrixType {
+        C,
+        D,
+        B
+    };
+
 private:
    unsigned int num_blocks = 0;             // total number of blocks in all wells
    unsigned int dim;                        // number of columns in blocks in B and C, equal to StandardWell::numEq
@ -76,7 +86,9 @@ private:
    cl::CommandQueue *queue = nullptr;
 #endif

+#if HAVE_CUDA
    // data for StandardWells, could remain nullptrs if not used
+    // StandardWells are only supported for cusparseSolver now
    double *d_Cnnzs = nullptr;
    double *d_Dnnzs = nullptr;
    double *d_Bnnzs = nullptr;
@ -85,6 +97,7 @@ private:
    double *d_z1 = nullptr;
    double *d_z2 = nullptr;
    unsigned int *d_val_pointers = nullptr;
+#endif

    double *h_x = nullptr, *h_y = nullptr;  // CUDA pinned memory for GPU memcpy
    bool host_mem_cuda = false;             // true iff h_x and h_y are allocated by cudaMallocHost(), so they need to be freed using cudaFreeHost()
@ -92,14 +105,20 @@ private:
    int *toOrder = nullptr;
    bool reorder = false;

-public:
+    /// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
+    /// \param[in] type        indicate if C, D or B is sent
+    /// \param[in] colIndices  columnindices of blocks in C or B, ignored for D
+    /// \param[in] values      array of nonzeroes
+    /// \param[in] val_size    number of blocks in C or B, ignored for D
+    void addMatrixGpu(MatrixType type, int *colIndices, double *values, unsigned int val_size);

-    /// StandardWell has C, D and B matrices that need to be copied
-    enum class MatrixType {
-        C,
-        D,
-        B
-    };
+    /// Allocate GPU memory for StandardWells
+    void allocStandardWells();
+
+    /// Free GPU memory from StandardWells
+    void freeStandardWells();
+
+public:

    /// Set a cudaStream to be used
    /// \param[in] stream           the cudaStream that is used to launch the kernel in