Enable multithreaded copy only when OpenMP is found

Razvan Nane
2024-04-03 15:29:32 +02:00
parent 411a3978b6
commit b0157def17
3 changed files with 24 additions and 5 deletions
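
In short, the commit wraps the optional helper thread, used to overlap the block-Jacobi matrix copy with the linear solve, in HAVE_OPENMP guards and adds a synchronous fallback for builds without OpenMP. A minimal sketch of the pattern, with illustrative names rather than the actual OPM code; HAVE_OPENMP is assumed to be the configuration macro set when the build system detects OpenMP:

    #include <memory>
    #include <vector>
    #if HAVE_OPENMP
    #include <thread>
    std::shared_ptr<std::thread> copyThread;   // exists only in OpenMP builds
    #endif

    // Start (or simply perform) the copy of src into dst.
    void startCopy(const std::vector<double>& src, std::vector<double>& dst)
    {
    #if HAVE_OPENMP
        // Overlap the copy with whatever the main thread does next.
        copyThread = std::make_shared<std::thread>([&src, &dst]() { dst = src; });
    #else
        dst = src;   // serial build: copy immediately on the calling thread
    #endif
    }

    // Called just before dst is consumed (e.g. uploaded to the GPU).
    void finishCopy()
    {
    #if HAVE_OPENMP
        copyThread->join();   // make sure the copy is complete before dst is used
    #endif
    }

    int main()
    {
        std::vector<double> src(1000, 1.0), dst;
        startCopy(src, dst);
        // ... main thread would do other work here ...
        finishCopy();
        return dst.size() == src.size() ? 0 : 1;
    }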

View File

@@ -43,9 +43,11 @@
#include <opm/grid/polyhedralgrid.hh>
#if HAVE_OPENMP
#include <thread>
std::shared_ptr<std::thread> copyThread;
#endif // HAVE_OPENMP
namespace Opm {
namespace detail {
@@ -112,8 +114,15 @@ apply(Vector& rhs,
#endif
if (numJacobiBlocks_ > 1) {
#if HAVE_OPENMP
//NOTE: copyThread can safely write to jacMat because in solve_system the diagonal entries of both matrix and *blockJacobiForGPUILU0_
//are checked and potentially overwritten in replaceZeroDiagonal() by the main thread. However, regardless of the order in which the
//threads write, the final entries in jacMat are correct.
copyThread = std::make_shared<std::thread>([&](){this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);});
#else
this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);
#endif
// const_cast is needed since the CUDA backend overwrites values to improve the matrix condition.
bridge_->solve_system(&matrix, blockJacobiForGPUILU0_.get(),
numJacobiBlocks_, rhs, *wellContribs, result);
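
The guarded spawn above could equally be factored into a small helper so that apply() carries no preprocessor branches. A sketch of that alternative (runBlockJacobiCopy is a hypothetical name and the surroundings are simplified; this is not the code in the commit):

    #include <functional>
    #include <memory>
    #if HAVE_OPENMP
    #include <thread>
    std::shared_ptr<std::thread> copyThread;   // lives in the solver translation unit in the real code
    #endif

    // Run the block-Jacobi copy asynchronously when OpenMP is available, otherwise inline.
    inline void runBlockJacobiCopy(const std::function<void()>& copy)
    {
    #if HAVE_OPENMP
        // OpenMP build: hand the copy to a helper thread and return immediately.
        copyThread = std::make_shared<std::thread>(copy);
    #else
        // Serial build: perform the copy right here, before solve_system() runs.
        copy();
    #endif
    }

apply() would then call runBlockJacobiCopy([&]{ this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_); }), with the same behaviour as the hunk above, just without preprocessor branches inside the solver itself.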

View File

@@ -30,8 +30,6 @@
#include <opm/simulators/linalg/bda/BdaResult.hpp>
#include <opm/simulators/linalg/bda/cuda/cuda_header.hpp>
#include <thread>
#include "cublas_v2.h"
#include "cusparse_v2.h"
// For more information about cusparse, check https://docs.nvidia.com/cuda/cusparse/index.html
@@ -40,7 +38,10 @@
// otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
#define COPY_ROW_BY_ROW 0
#if HAVE_OPENMP
#include <thread>
extern std::shared_ptr<std::thread> copyThread;
#endif // HAVE_OPENMP
namespace Opm
{
@@ -326,7 +327,9 @@ void cusparseSolverBackend<block_size>::copy_system_to_gpu(std::shared_ptr<Block
#else
cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
if (useJacMatrix) {
#if HAVE_OPENMP
copyThread->join();
#endif
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
} else {
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
@@ -368,7 +371,9 @@ void cusparseSolverBackend<block_size>::update_system_on_gpu(std::shared_ptr<Blo
#else
cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
if (useJacMatrix) {
#if HAVE_OPENMP
copyThread->join();
#endif
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
} else {
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
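
The join on the backend side has to sit behind the same guard: in a build without OpenMP the copyThread global is compiled out entirely (see the first file above), so an unguarded copyThread->join() would not even compile, and there is nothing to wait for anyway because the copy was already done synchronously in apply(). A sketch of the guarded wait as a hypothetical helper (the actual backend calls copyThread->join() inline, exactly as in the hunks above):

    #include <memory>
    #if HAVE_OPENMP
    #include <thread>
    extern std::shared_ptr<std::thread> copyThread;   // defined on the solver side
    #endif

    // Wait for the asynchronous Jacobi-matrix copy, if one was started.
    inline void waitForJacobiCopy()
    {
    #if HAVE_OPENMP
        // The copy runs on copyThread and must finish before the Jacobi
        // values are uploaded to the GPU.
        if (copyThread && copyThread->joinable()) {
            copyThread->join();
        }
    #endif
        // Without OpenMP the copy already happened synchronously in apply(),
        // and copyThread does not exist at all, so there is nothing to wait for.
    }

In the commit itself the guarded join sits immediately before the cudaMemcpyAsync of jacMatrix->nnzValues in both copy_system_to_gpu() and update_system_on_gpu().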

View File

@@ -41,8 +41,6 @@
#include <opm/simulators/linalg/bda/BdaResult.hpp>
#include <thread>
#include <hip/hip_runtime_api.h>
#include <hip/hip_version.h>
@@ -89,7 +87,10 @@
#include <cstddef>
#if HAVE_OPENMP
#include <thread>
extern std::shared_ptr<std::thread> copyThread;
#endif // HAVE_OPENMP
namespace Opm
{
@@ -439,7 +440,9 @@ void rocsparseSolverBackend<block_size>::copy_system_to_gpu(double *b) {
HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream));
if (useJacMatrix) {
#if HAVE_OPENMP
copyThread->join();
#endif
HIP_CHECK(hipMemcpyAsync(d_Mrows, jacMat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Mcols, jacMat->colIndices, sizeof(rocsparse_int) * nnzbs_prec, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues, sizeof(double) * nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, stream));
@@ -468,7 +471,9 @@ void rocsparseSolverBackend<block_size>::update_system_on_gpu(double *b) {
HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream));
if (useJacMatrix) {
#if HAVE_OPENMP
copyThread->join();
#endif
HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues, sizeof(double) * nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, stream));
} else {
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(double) * nnz, hipMemcpyDeviceToDevice, stream));
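
The rocsparse changes mirror the cusparse ones. Taken together, the commit leaves the thread handle wired across translation units roughly as follows (a sketch with illustrative comments; the page does not show the actual file names):

    // Solver translation unit: single definition of the handle, OpenMP builds only.
    #if HAVE_OPENMP
    #include <memory>
    #include <thread>
    std::shared_ptr<std::thread> copyThread;
    #endif

    // Each GPU backend (cusparse, rocsparse): declaration only, under the same
    // guard, so the definition, the spawn and the joins are compiled out together.
    #if HAVE_OPENMP
    #include <memory>
    #include <thread>
    extern std::shared_ptr<std::thread> copyThread;
    #endif

Because every reference to copyThread sits behind the same HAVE_OPENMP check, a serial build contains no trace of the helper thread and simply performs the copyMatToBlockJac() call synchronously before the solve.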