From cc1dfca9e0e199c2f8a097e079b9ca1f846cde80 Mon Sep 17 00:00:00 2001
From: Razvan Nane
Date: Fri, 12 Apr 2024 20:17:38 +0200
Subject: [PATCH] add support for single thread copy

---
 opm/simulators/linalg/ISTLSolverBda.cpp           | 25 ++++++++++++-------
 .../linalg/bda/cuda/cusparseSolverBackend.cu      |  7 ++++--
 .../linalg/bda/rocsparseSolverBackend.cpp         |  7 ++++--
 3 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/opm/simulators/linalg/ISTLSolverBda.cpp b/opm/simulators/linalg/ISTLSolverBda.cpp
index 85124bf38..422905990 100644
--- a/opm/simulators/linalg/ISTLSolverBda.cpp
+++ b/opm/simulators/linalg/ISTLSolverBda.cpp
@@ -45,6 +45,7 @@
 
 #if HAVE_OPENMP
 #include <thread>
+#include <omp.h>
 std::shared_ptr<std::thread> copyThread;
 #endif // HAVE_OPENMP
 
@@ -113,22 +114,28 @@ apply(Vector& rhs,
     }
 #endif
 
-    if (numJacobiBlocks_ > 1) {
+    bool use_multithreading = false;
 #if HAVE_OPENMP
-        //NOTE: copyThread can safely write to jacMat because in solve_system both matrix and *blockJacobiForGPUILU0_ diagonal entries
-        //are checked and potentially overwritten in replaceZeroDiagonal() by mainThread. However, no matter the thread writing sequence,
-        //the final entry in jacMat is correct.
-        copyThread = std::make_shared<std::thread>([&](){this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);});
-#else
-        this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);
-#endif
+    use_multithreading = omp_get_max_threads() > 1;
+#endif // HAVE_OPENMP
+
+    if (numJacobiBlocks_ > 1) {
+        if(use_multithreading) {
+            //NOTE: copyThread can safely write to jacMat because in solve_system both matrix and *blockJacobiForGPUILU0_ diagonal entries
+            //are checked and potentially overwritten in replaceZeroDiagonal() by mainThread. However, no matter the thread writing sequence,
+            //the final entry in jacMat is correct.
+            copyThread = std::make_shared<std::thread>([&](){this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);});
+        }
+        else {
+            this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);
+        }
         // Const_cast needed since the CUDA stuff overwrites values for better matrix condition..
         bridge_->solve_system(&matrix, blockJacobiForGPUILU0_.get(),
                               numJacobiBlocks_, rhs, *wellContribs, result);
     }
     else
-        bridge_->solve_system(&matrix, &matrix,
+        bridge_->solve_system(&matrix, &matrix,
                               numJacobiBlocks_, rhs, *wellContribs, result);
 
     if (result.converged) {
         // get result vector x from non-Dune backend, iff solve was successful
diff --git a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu
index 92d0a65d2..f38df4082 100644
--- a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu
+++ b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu
@@ -40,6 +40,7 @@
 
 #if HAVE_OPENMP
 #include <thread>
+#include <omp.h>
 extern std::shared_ptr<std::thread> copyThread;
 #endif // HAVE_OPENMP
 
@@ -328,7 +329,8 @@ void cusparseSolverBackend<block_size>::copy_system_to_gpu(std::shared_ptr<BlockedMatrix> matrix,
     cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
     if (useJacMatrix) {
 #if HAVE_OPENMP
-        copyThread->join();
+        if(omp_get_max_threads() > 1)
+            copyThread->join();
 #endif
         cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
     } else {
@@ -372,7 +374,8 @@ void cusparseSolverBackend<block_size>::update_system_on_gpu(std::shared_ptr<BlockedMatrix> matrix,
     cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
     if (useJacMatrix) {
 #if HAVE_OPENMP
-        copyThread->join();
+        if(omp_get_max_threads() > 1)
+            copyThread->join();
 #endif
         cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
     } else {
diff --git a/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp b/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp
index ae211a948..2e1f6aa6a 100644
--- a/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp
@@ -89,6 +89,7 @@
 
 #if HAVE_OPENMP
 #include <thread>
+#include <omp.h>
 extern std::shared_ptr<std::thread> copyThread;
 #endif //HAVE_OPENMP
 
@@ -441,7 +442,8 @@ void rocsparseSolverBackend<block_size>::copy_system_to_gpu(double *b) {
 
     if (useJacMatrix) {
 #if HAVE_OPENMP
-        copyThread->join();
+        if(omp_get_max_threads() > 1)
+            copyThread->join();
 #endif
         HIP_CHECK(hipMemcpyAsync(d_Mrows, jacMat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, stream));
         HIP_CHECK(hipMemcpyAsync(d_Mcols, jacMat->colIndices, sizeof(rocsparse_int) * nnzbs_prec, hipMemcpyHostToDevice, stream));
@@ -472,7 +474,8 @@ void rocsparseSolverBackend<block_size>::update_system_on_gpu(double *b) {
 
     if (useJacMatrix) {
 #if HAVE_OPENMP
-        copyThread->join();
+        if (omp_get_max_threads() > 1)
+            copyThread->join();
 #endif
         HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues, sizeof(double) * nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, stream));
     } else {