From cc1dfca9e0e199c2f8a097e079b9ca1f846cde80 Mon Sep 17 00:00:00 2001
From: Razvan Nane
Date: Fri, 12 Apr 2024 20:17:38 +0200
Subject: [PATCH] add support for single thread copy

---
 opm/simulators/linalg/ISTLSolverBda.cpp           | 25 ++++++++++++-------
 .../linalg/bda/cuda/cusparseSolverBackend.cu      |  7 ++++--
 .../linalg/bda/rocsparseSolverBackend.cpp         |  7 ++++--
 3 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/opm/simulators/linalg/ISTLSolverBda.cpp b/opm/simulators/linalg/ISTLSolverBda.cpp
index 85124bf38..422905990 100644
--- a/opm/simulators/linalg/ISTLSolverBda.cpp
+++ b/opm/simulators/linalg/ISTLSolverBda.cpp
@@ -45,6 +45,7 @@
 
 #if HAVE_OPENMP
 #include <thread>
+#include <omp.h>
 std::shared_ptr<std::thread> copyThread;
 #endif // HAVE_OPENMP
 
@@ -113,22 +114,28 @@ apply(Vector& rhs,
     }
 #endif
 
-    if (numJacobiBlocks_ > 1) {
+    bool use_multithreading = false;
 #if HAVE_OPENMP
-        //NOTE: copyThread can safely write to jacMat because in solve_system both matrix and *blockJacobiForGPUILU0_ diagonal entries
-        //are checked and potentially overwritten in replaceZeroDiagonal() by mainThread. However, no matter the thread writing sequence,
-        //the final entry in jacMat is correct.
-        copyThread = std::make_shared<std::thread>([&](){this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);});
-#else
-        this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);
-#endif
+    use_multithreading = omp_get_max_threads() > 1;
+#endif // HAVE_OPENMP
+
+    if (numJacobiBlocks_ > 1) {
+        if(use_multithreading) {
+            //NOTE: copyThread can safely write to jacMat because in solve_system both matrix and *blockJacobiForGPUILU0_ diagonal entries
+            //are checked and potentially overwritten in replaceZeroDiagonal() by mainThread. However, no matter the thread writing sequence,
+            //the final entry in jacMat is correct.
+            copyThread = std::make_shared<std::thread>([&](){this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);});
+        }
+        else {
+            this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);
+        }
         // Const_cast needed since the CUDA stuff overwrites values for better matrix condition..
         bridge_->solve_system(&matrix, blockJacobiForGPUILU0_.get(),
                               numJacobiBlocks_, rhs, *wellContribs, result);
     }
     else
-        bridge_->solve_system(&matrix, &matrix,
+        bridge_->solve_system(&matrix, &matrix,
                               numJacobiBlocks_, rhs, *wellContribs, result);
 
     if (result.converged) {
         // get result vector x from non-Dune backend, iff solve was successful
diff --git a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu
index 92d0a65d2..f38df4082 100644
--- a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu
+++ b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu
@@ -40,6 +40,7 @@
 
 #if HAVE_OPENMP
 #include <thread>
+#include <omp.h>
 extern std::shared_ptr<std::thread> copyThread;
 #endif // HAVE_OPENMP
 
@@ -328,7 +329,8 @@ void cusparseSolverBackend<block_size>::copy_system_to_gpu(std::shared_ptr<BlockedMatrix> matrix,
     cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
     if (useJacMatrix) {
 #if HAVE_OPENMP
-        copyThread->join();
+        if(omp_get_max_threads() > 1)
+            copyThread->join();
 #endif
         cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
     } else {
@@ -372,7 +374,8 @@ void cusparseSolverBackend<block_size>::update_system_on_gpu(std::shared_ptr<BlockedMatrix> matrix,
     cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
     if (useJacMatrix) {
 #if HAVE_OPENMP
-        copyThread->join();
+        if(omp_get_max_threads() > 1)
+            copyThread->join();
 #endif
         cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
     } else {
diff --git a/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp b/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp
index ae211a948..2e1f6aa6a 100644
--- a/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp
@@ -89,6 +89,7 @@
 
 #if HAVE_OPENMP
 #include <thread>
+#include <omp.h>
 extern std::shared_ptr<std::thread> copyThread;
 #endif //HAVE_OPENMP
 
@@ -441,7 +442,8 @@ void rocsparseSolverBackend<block_size>::copy_system_to_gpu(double *b) {
 
     if (useJacMatrix) {
 #if HAVE_OPENMP
-        copyThread->join();
+        if(omp_get_max_threads() > 1)
+            copyThread->join();
 #endif
         HIP_CHECK(hipMemcpyAsync(d_Mrows, jacMat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, stream));
         HIP_CHECK(hipMemcpyAsync(d_Mcols, jacMat->colIndices, sizeof(rocsparse_int) * nnzbs_prec, hipMemcpyHostToDevice, stream));
@@ -472,7 +474,8 @@ void rocsparseSolverBackend<block_size>::update_system_on_gpu(double *b) {
 
     if (useJacMatrix) {
 #if HAVE_OPENMP
-        copyThread->join();
+        if (omp_get_max_threads() > 1)
+            copyThread->join();
 #endif
         HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues, sizeof(double) * nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, stream));
     } else {