mirror of https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00

add support for single thread copy

parent b0157def17
commit cc1dfca9e0
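
Before this change, the copy of the matrix into the block-Jacobi structure was selected at compile time: with OpenMP the copy always ran on a separate std::thread, without OpenMP it ran inline. The copy is now dispatched at runtime: when omp_get_max_threads() > 1 the copy is handed to copyThread as before, otherwise copyMatToBlockJac() is called directly on the calling thread. The cusparse and rocsparse backends join copyThread before the asynchronous upload of the Jacobi matrix values only when more than one OpenMP thread is in use.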

@@ -45,6 +45,7 @@
 
 #if HAVE_OPENMP
 #include <thread>
+#include <omp.h>
 
 std::shared_ptr<std::thread> copyThread;
 #endif // HAVE_OPENMP

@@ -113,15 +114,21 @@ apply(Vector& rhs,
         }
 #endif
 
-        if (numJacobiBlocks_ > 1) {
+        bool use_multithreading = false;
 #if HAVE_OPENMP
+        use_multithreading = omp_get_max_threads() > 1;
+#endif // HAVE_OPENMP
+
+        if (numJacobiBlocks_ > 1) {
+            if(use_multithreading) {
             //NOTE: copyThread can safely write to jacMat because in solve_system both matrix and *blockJacobiForGPUILU0_ diagonal entries
             //are checked and potentially overwritten in replaceZeroDiagonal() by mainThread. However, no matter the thread writing sequence,
             //the final entry in jacMat is correct.
             copyThread = std::make_shared<std::thread>([&](){this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);});
-#else
+            }
+            else {
             this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);
-#endif
+            }
 
             // Const_cast needed since the CUDA stuff overwrites values for better matrix condition..
             bridge_->solve_system(&matrix, blockJacobiForGPUILU0_.get(),
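
The dispatch above can be shown in isolation. The following is a minimal sketch, not the OPM code: run_copy() and start_block_jacobi_copy() are placeholder names standing in for copyMatToBlockJac() and the surrounding solver, and only the runtime thread-count check and the two copy paths from the hunk are reproduced.

// Minimal sketch of the runtime dispatch introduced above; placeholder names,
// not the real solver interface.
#include <memory>
#include <thread>
#include <vector>

#if HAVE_OPENMP
#include <omp.h>
#endif

std::shared_ptr<std::thread> copyThread;   // joined later, before the GPU upload

// Stand-in for copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_).
void run_copy(const std::vector<double>& matrix, std::vector<double>& jacMat)
{
    jacMat = matrix;
}

// The caller must keep matrix and jacMat alive until copyThread is joined,
// just as the solver keeps its members alive across apply() and solve_system().
void start_block_jacobi_copy(const std::vector<double>& matrix, std::vector<double>& jacMat)
{
    bool use_multithreading = false;
#if HAVE_OPENMP
    // Only spawn the helper thread when OpenMP actually provides more than one thread.
    use_multithreading = omp_get_max_threads() > 1;
#endif

    if (use_multithreading) {
        copyThread = std::make_shared<std::thread>([&matrix, &jacMat]() { run_copy(matrix, jacMat); });
    } else {
        // Single-thread path: copy inline, so there is no thread to join later.
        run_copy(matrix, jacMat);
    }
}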

@@ -40,6 +40,7 @@
 
 #if HAVE_OPENMP
 #include <thread>
+#include <omp.h>
 extern std::shared_ptr<std::thread> copyThread;
 #endif // HAVE_OPENMP
 

@@ -328,6 +329,7 @@ void cusparseSolverBackend<block_size>::copy_system_to_gpu(std::shared_ptr<Block
     cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
     if (useJacMatrix) {
 #if HAVE_OPENMP
+        if(omp_get_max_threads() > 1)
             copyThread->join();
 #endif
         cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);

@@ -372,6 +374,7 @@ void cusparseSolverBackend<block_size>::update_system_on_gpu(std::shared_ptr<Blo
     cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
     if (useJacMatrix) {
 #if HAVE_OPENMP
+        if(omp_get_max_threads() > 1)
             copyThread->join();
 #endif
         cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
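
The two cusparse hunks are the consumer side of the same runtime decision. Below is a minimal sketch under the same placeholder convention: upload_async() stands in for cudaMemcpyAsync()/hipMemcpyAsync(), and the join is guarded by the same omp_get_max_threads() > 1 condition under which copyThread was started, because in the single-thread case the copy already happened inline.

// Sketch of the guarded join before the host-to-device upload of the Jacobi
// matrix values; upload_async() is a placeholder, not a CUDA/HIP API.
#include <cstddef>
#include <memory>
#include <thread>

#if HAVE_OPENMP
#include <omp.h>
#endif

extern std::shared_ptr<std::thread> copyThread;

// Placeholder for cudaMemcpyAsync()/hipMemcpyAsync() in the real backends.
void upload_async(const double* hostVals, std::size_t count);

void upload_jacobi_values(const double* hostVals, std::size_t count, bool useJacMatrix)
{
    if (useJacMatrix) {
#if HAVE_OPENMP
        // With more than one OpenMP thread the copy ran on copyThread, so wait
        // for it before reading hostVals; with a single thread the copy was
        // already done synchronously and there is no thread to join.
        if (omp_get_max_threads() > 1)
            copyThread->join();
#endif
        upload_async(hostVals, count);
    }
}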

@@ -89,6 +89,7 @@
 
 #if HAVE_OPENMP
 #include <thread>
+#include <omp.h>
 extern std::shared_ptr<std::thread> copyThread;
 #endif //HAVE_OPENMP
 

@@ -441,6 +442,7 @@ void rocsparseSolverBackend<block_size>::copy_system_to_gpu(double *b) {
 
     if (useJacMatrix) {
 #if HAVE_OPENMP
+        if(omp_get_max_threads() > 1)
             copyThread->join();
 #endif
         HIP_CHECK(hipMemcpyAsync(d_Mrows, jacMat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, stream));

@@ -472,6 +474,7 @@ void rocsparseSolverBackend<block_size>::update_system_on_gpu(double *b) {
 
     if (useJacMatrix) {
 #if HAVE_OPENMP
+        if (omp_get_max_threads() > 1)
             copyThread->join();
 #endif
         HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues, sizeof(double) * nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, stream));
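
The rocsparse hunks mirror the cusparse change: copyThread is joined before the hipMemcpyAsync uploads of the Jacobi matrix data only when omp_get_max_threads() > 1; with a single OpenMP thread the copy was performed synchronously in apply() and there is no background copy to wait for.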