remove dependency of std::thread on openmp

Razvan Nane 2024-06-27 14:06:22 +02:00
parent 1477a4d76e
commit f063f6776e
6 changed files with 70 additions and 38 deletions
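The change repeated across the GPU backends below is to stop hiding the std::thread plumbing behind HAVE_OPENMP and instead decide at run time whether the copy thread is used. A minimal standalone sketch of that pattern, assuming only that omp_get_max_threads() reports 1 when the user forces --threads-per-process=1 (as the added comment in the first hunk states); copy_work() and run_copy() are illustrative names, not part of the diff:

#include <memory>
#include <thread>

#if HAVE_OPENMP
#include <omp.h>              // only the OpenMP header remains conditional
#endif

std::shared_ptr<std::thread> copyThread;   // declared unconditionally now

void copy_work() { /* fill the block-Jacobi copy of the matrix */ }

void run_copy()
{
    // Default to multithreading; only an explicit single-thread OpenMP
    // setting turns it off.
    bool use_multithreading = true;
#if HAVE_OPENMP
    if (omp_get_max_threads() == 1)
        use_multithreading = false;
#endif

    if (use_multithreading) {
        copyThread = std::make_shared<std::thread>(copy_work);
        // ... main thread continues with other setup ...
        copyThread->join();                // wait before the copy is consumed
    } else {
        copy_work();                       // serial fallback on one thread
    }
}

Defaulting the flag to true keeps the copy thread available in builds without OpenMP, which is exactly the dependency this commit removes.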

View File

@@ -43,11 +43,11 @@
 #include <opm/grid/polyhedralgrid.hh>
-#if HAVE_OPENMP
 #include <thread>
-#include <omp.h>
 std::shared_ptr<std::thread> copyThread;
+#if HAVE_OPENMP
+#include <omp.h>
 #endif // HAVE_OPENMP
 namespace Opm::detail {
@@ -113,9 +113,11 @@ apply(Vector& rhs,
     }
 #endif
-    bool use_multithreading = false;
+    bool use_multithreading = true;
 #if HAVE_OPENMP
-    use_multithreading = omp_get_max_threads() > 1;
+    // if user manually sets --threads-per-process=1, do not use multithreading
+    if (omp_get_max_threads() == 1)
+        use_multithreading = false;
 #endif // HAVE_OPENMP
     if (numJacobiBlocks_ > 1) {
@@ -123,9 +125,9 @@ apply(Vector& rhs,
         //NOTE: copyThread can safely write to jacMat because in solve_system both matrix and *blockJacobiForGPUILU0_ diagonal entries
         //are checked and potentially overwritten in replaceZeroDiagonal() by mainThread. However, no matter the thread writing sequence,
         //the final entry in jacMat is correct.
-#if HAVE_OPENMP
+//#if HAVE_OPENMP
         copyThread = std::make_shared<std::thread>([&](){this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);});
-#endif // HAVE_OPENMP
+//#endif // HAVE_OPENMP
     }
     else {
         this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);
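The NOTE in the hunk above covers the benign overlap between the copy thread and the main thread: both may rewrite the same diagonal entries, but the final values in jacMat are correct regardless of write order. Independent of that, the lambda captures matrix and *blockJacobiForGPUILU0_ by reference, so those objects must outlive the thread and copyThread->join() must run before the copied data is read. A simplified sketch of that lifetime contract (BlockedCopy, copyToBlockJac, prepare and consume are stand-ins, not the OPM types):

#include <memory>
#include <thread>
#include <vector>

std::shared_ptr<std::thread> copyThread;

struct BlockedCopy { std::vector<double> nnzValues; };

// Stand-in for copyMatToBlockJac(): copies the nonzero values into the
// block-Jacobi matrix that the GPU preconditioner will later decompose.
void copyToBlockJac(const std::vector<double>& src, BlockedCopy& dst)
{
    dst.nnzValues = src;
}

void prepare(const std::vector<double>& nnz, BlockedCopy& jac)
{
    // Capture by reference: nnz and jac must stay alive until join().
    copyThread = std::make_shared<std::thread>(
        [&]() { copyToBlockJac(nnz, jac); });
}

void consume(BlockedCopy& jac)
{
    copyThread->join();      // the copy is guaranteed complete from here on
    // ... upload jac.nnzValues to the device ...
}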

View File

@@ -1,6 +1,7 @@
 #ifndef OPM_MISC_HPP
 #define OPM_MISC_HPP
+#ifdef HAVE_ROCSPARSE
 #include <hip/hip_runtime_api.h>
 #include <hip/hip_version.h>
 #include <sstream>
@@ -40,6 +41,7 @@
         OPM_THROW(std::logic_error, oss.str()); \
     } \
 } while(0)
+#endif
 namespace Opm::Accelerator {
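The two added lines wrap the HIP-specific part of this header in #ifdef HAVE_ROCSPARSE, so it can be included in builds that do not have the HIP headers. A rough sketch of the resulting shape, assuming the guarded macro is the HIP_CHECK used in the rocsparse hunks further down; the body here is an approximation that throws directly instead of going through OPM_THROW:

#ifndef OPM_MISC_HPP
#define OPM_MISC_HPP

#ifdef HAVE_ROCSPARSE
#include <hip/hip_runtime_api.h>
#include <sstream>
#include <stdexcept>

// Abbreviated stand-in for the guarded error-check macro; the real header
// builds its message in an ostringstream and throws via OPM_THROW.
#define HIP_CHECK(STAT)                              \
    do {                                             \
        const hipError_t stat_ = (STAT);             \
        if (stat_ != hipSuccess) {                   \
            std::ostringstream oss;                  \
            oss << "HIP error code " << static_cast<int>(stat_); \
            throw std::logic_error(oss.str());       \
        }                                            \
    } while (0)
#endif // HAVE_ROCSPARSE

namespace Opm::Accelerator {
// declarations that do not need HIP can stay outside the guard
}

#endif // OPM_MISC_HPP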

View File

@@ -38,10 +38,11 @@
 // otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
 #define COPY_ROW_BY_ROW 0
-#if HAVE_OPENMP
 #include <thread>
-#include <omp.h>
 extern std::shared_ptr<std::thread> copyThread;
+#if HAVE_OPENMP
+#include <omp.h>
 #endif // HAVE_OPENMP
 namespace Opm::Accelerator {
@@ -342,11 +343,17 @@ copy_system_to_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
 #else
     cudaMemcpyAsync(d_bVals, matrix->nnzValues,
                     nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
-    if (useJacMatrix) {
+    bool use_multithreading = true;
 #if HAVE_OPENMP
-        if(omp_get_max_threads() > 1)
-            copyThread->join();
+    if(omp_get_max_threads() == 1)
+        use_multithreading = false;
 #endif
+    if (useJacMatrix) {
+        if(use_multithreading)
+            copyThread->join();
         cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
                         nnzbs_prec * block_size * block_size * sizeof(Scalar),
                         cudaMemcpyHostToDevice, stream);
@@ -399,12 +406,17 @@ update_system_on_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
 #else
     cudaMemcpyAsync(d_bVals, matrix->nnzValues,
                     nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
-    if (useJacMatrix) {
+    bool use_multithreading = true;
 #if HAVE_OPENMP
-        if (omp_get_max_threads() > 1) {
-            copyThread->join();
-        }
+    if (omp_get_max_threads() == 1)
+        use_multithreading = false;
 #endif
+    if (useJacMatrix) {
+        if (use_multithreading)
+            copyThread->join();
         cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
                         nnzbs_prec * block_size * block_size * sizeof(Scalar),
                         cudaMemcpyHostToDevice, stream);
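In both hunks the join has to happen before the second cudaMemcpyAsync is enqueued: that copy reads jacMatrix->nnzValues from host memory, and the buffer is only guaranteed to hold its final contents once copyThread has finished. A small stand-alone sketch of that ordering constraint, with a plain function standing in for the asynchronous device upload (enqueue_upload, update_system and the vector arguments are illustrative, not CUDA or OPM API):

#include <memory>
#include <thread>
#include <vector>

std::shared_ptr<std::thread> copyThread;

// Stand-in for cudaMemcpyAsync(): the real call may only enqueue the
// transfer, so the host buffer must already be fully written when issued.
void enqueue_upload(const std::vector<double>& hostBuffer)
{
    (void)hostBuffer; // pretend this starts an asynchronous host-to-device copy
}

void update_system(const std::vector<double>& mainVals,
                   const std::vector<double>& jacVals,
                   bool useJacMatrix,
                   bool use_multithreading)
{
    enqueue_upload(mainVals);            // main matrix: no dependency on copyThread

    if (useJacMatrix) {
        if (use_multithreading)
            copyThread->join();          // jacVals is complete only after this
        enqueue_upload(jacVals);         // safe: host buffer fully written
    }
}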

View File

@@ -31,10 +31,11 @@
 #include <sstream>
-#if HAVE_OPENMP
 #include <thread>
-#include <omp.h>
 extern std::shared_ptr<std::thread> copyThread;
+#if HAVE_OPENMP
+#include <omp.h>
 #endif //HAVE_OPENMP
 namespace Opm::Accelerator {
@@ -194,13 +195,15 @@ create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
     const unsigned int bs = block_size;
     auto *matToDecompose = jacMat ? jacMat : mat;
+    bool use_multithreading = true;
-    if (jacMat) {
 #if HAVE_OPENMP
-        if (omp_get_max_threads() > 1) {
-            copyThread->join();
-        }
+    if (omp_get_max_threads() == 1)
+        use_multithreading = false;
 #endif
+    if (jacMat && use_multithreading) {
+        copyThread->join();
     }
     // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp

View File

@@ -73,11 +73,11 @@ openclSolverBackend(int verbosity_,
     using PreconditionerType = typename Opm::Accelerator::PreconditionerType;
     if (use_cpr) {
-        prec = openclPreconditioner<Scalar,block_size>::create(PreconditionerType::CPR,verbosity,opencl_ilu_parallel);
+        prec = openclPreconditioner<Scalar, block_size>::create(PreconditionerType::CPR, verbosity, opencl_ilu_parallel);
     } else if (use_isai) {
-        prec = openclPreconditioner<Scalar,block_size>::create(PreconditionerType::BISAI,verbosity,opencl_ilu_parallel);
+        prec = openclPreconditioner<Scalar, block_size>::create(PreconditionerType::BISAI, verbosity, opencl_ilu_parallel);
     } else {
-        prec = openclPreconditioner<Scalar,block_size>::create(PreconditionerType::BILU0,verbosity,opencl_ilu_parallel);
+        prec = openclPreconditioner<Scalar, block_size>::create(PreconditionerType::BILU0, verbosity, opencl_ilu_parallel);
     }
     std::ostringstream out;
@@ -225,8 +225,10 @@ openclSolverBackend(int verbosity_,
 template<class Scalar, unsigned int block_size>
 openclSolverBackend<Scalar,block_size>::
-openclSolverBackend(int verbosity_, int maxit_,
-                    Scalar tolerance_, bool opencl_ilu_parallel_)
+openclSolverBackend(int verbosity_,
+                    int maxit_,
+                    Scalar tolerance_,
+                    bool opencl_ilu_parallel_)
     : Base(verbosity_, maxit_, tolerance_)
     , opencl_ilu_parallel(opencl_ilu_parallel_)
 {
@@ -245,7 +247,8 @@ setOpencl(std::shared_ptr<cl::Context>& context_,
 template<class Scalar, unsigned int block_size>
 void openclSolverBackend<Scalar,block_size>::
-gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res)
+gpu_pbicgstab(WellContributions<Scalar>& wellContribs,
+              BdaResult& res)
 {
     float it;
     Scalar rho, rhop, beta, alpha, omega, tmp1, tmp2;

View File

@@ -30,11 +30,11 @@
 #include <sstream>
+#include <thread>
+extern std::shared_ptr<std::thread> copyThread;
 #if HAVE_OPENMP
-#include <thread>
 #include <omp.h>
-extern std::shared_ptr<std::thread> copyThread;
 #endif //HAVE_OPENMP
 namespace Opm::Accelerator {
@@ -193,13 +193,18 @@ template <class Scalar, unsigned int block_size>
 void rocsparseBILU0<Scalar, block_size>::
 copy_system_to_gpu(Scalar *d_Avals) {
     Timer t;
-    if (this->useJacMatrix) {
+    bool use_multithreading = true;
 #if HAVE_OPENMP
-        if (omp_get_max_threads() > 1) {
-            copyThread->join();
-        }
+    if (omp_get_max_threads() == 1)
+        use_multithreading = false;
 #endif
+    if (this->useJacMatrix) {
+        if (use_multithreading) {
+            copyThread->join();
+        }
         HIP_CHECK(hipMemcpyAsync(d_Mrows, this->jacMat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, this->stream));
         HIP_CHECK(hipMemcpyAsync(d_Mcols, this->jacMat->colIndices, sizeof(rocsparse_int) * this->nnzbs_prec, hipMemcpyHostToDevice, this->stream));
         HIP_CHECK(hipMemcpyAsync(d_Mvals, this->jacMat->nnzValues, sizeof(Scalar) * this->nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, this->stream));
@@ -220,13 +225,18 @@ template <class Scalar, unsigned int block_size>
 void rocsparseBILU0<Scalar, block_size>::
 update_system_on_gpu(Scalar *d_Avals) {
     Timer t;
+    bool use_multithreading = true;
+#if HAVE_OPENMP
+    if (omp_get_max_threads() == 1)
+        use_multithreading = false;
+#endif
     if (this->useJacMatrix) {
-#if HAVE_OPENMP
-        if (omp_get_max_threads() > 1) {
+        if (use_multithreading) {
             copyThread->join();
         }
-#endif
         HIP_CHECK(hipMemcpyAsync(d_Mvals, this->jacMat->nnzValues, sizeof(Scalar) * this->nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, this->stream));
     } else {
         HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(Scalar) * nnz, hipMemcpyDeviceToDevice, this->stream));