remove dependency of std::thread on openmp

Razvan Nane 2024-06-27 14:06:22 +02:00
parent 1477a4d76e
commit f063f6776e
6 changed files with 70 additions and 38 deletions
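The change repeated across the GPU backends below is to stop hiding the std::thread plumbing behind HAVE_OPENMP and instead decide at run time whether the copy thread is used. A minimal standalone sketch of that pattern, assuming only that omp_get_max_threads() reports 1 when the user forces --threads-per-process=1 (as the added comment in the first hunk states); copy_work() and run_copy() are illustrative names, not part of the diff:

#include <memory>
#include <thread>

#if HAVE_OPENMP
#include <omp.h>              // only the OpenMP header remains conditional
#endif

std::shared_ptr<std::thread> copyThread;   // declared unconditionally now

void copy_work() { /* fill the block-Jacobi copy of the matrix */ }

void run_copy()
{
    // Default to multithreading; only an explicit single-thread OpenMP
    // setting turns it off.
    bool use_multithreading = true;
#if HAVE_OPENMP
    if (omp_get_max_threads() == 1)
        use_multithreading = false;
#endif

    if (use_multithreading) {
        copyThread = std::make_shared<std::thread>(copy_work);
        // ... main thread continues with other setup ...
        copyThread->join();                // wait before the copy is consumed
    } else {
        copy_work();                       // serial fallback on one thread
    }
}

Defaulting the flag to true keeps the copy thread available in builds without OpenMP, which is exactly the dependency this commit removes.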

View File

@@ -43,11 +43,11 @@
 #include <opm/grid/polyhedralgrid.hh>
-#if HAVE_OPENMP
 #include <thread>
-#include <omp.h>
 std::shared_ptr<std::thread> copyThread;
+#if HAVE_OPENMP
+#include <omp.h>
 #endif // HAVE_OPENMP
 namespace Opm::detail {
@@ -113,9 +113,11 @@ apply(Vector& rhs,
     }
 #endif
-    bool use_multithreading = false;
+    bool use_multithreading = true;
 #if HAVE_OPENMP
-    use_multithreading = omp_get_max_threads() > 1;
+    // if user manually sets --threads-per-process=1, do not use multithreading
+    if (omp_get_max_threads() == 1)
+        use_multithreading = false;
 #endif // HAVE_OPENMP
     if (numJacobiBlocks_ > 1) {
@@ -123,9 +125,9 @@ apply(Vector& rhs,
         //NOTE: copyThread can safely write to jacMat because in solve_system both matrix and *blockJacobiForGPUILU0_ diagonal entries
         //are checked and potentially overwritten in replaceZeroDiagonal() by mainThread. However, no matter the thread writing sequence,
         //the final entry in jacMat is correct.
-#if HAVE_OPENMP
+//#if HAVE_OPENMP
         copyThread = std::make_shared<std::thread>([&](){this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);});
-#endif // HAVE_OPENMP
+//#endif // HAVE_OPENMP
     }
     else {
         this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);
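The NOTE in the hunk above covers the benign overlap between the copy thread and the main thread: both may rewrite the same diagonal entries, but the final values in jacMat are correct regardless of write order. Independent of that, the lambda captures matrix and *blockJacobiForGPUILU0_ by reference, so those objects must outlive the thread and copyThread->join() must run before the copied data is read. A simplified sketch of that lifetime contract (BlockedCopy, copyToBlockJac, prepare and consume are stand-ins, not the OPM types):

#include <memory>
#include <thread>
#include <vector>

std::shared_ptr<std::thread> copyThread;

struct BlockedCopy { std::vector<double> nnzValues; };

// Stand-in for copyMatToBlockJac(): copies the nonzero values into the
// block-Jacobi matrix that the GPU preconditioner will later decompose.
void copyToBlockJac(const std::vector<double>& src, BlockedCopy& dst)
{
    dst.nnzValues = src;
}

void prepare(const std::vector<double>& nnz, BlockedCopy& jac)
{
    // Capture by reference: nnz and jac must stay alive until join().
    copyThread = std::make_shared<std::thread>(
        [&]() { copyToBlockJac(nnz, jac); });
}

void consume(BlockedCopy& jac)
{
    copyThread->join();      // the copy is guaranteed complete from here on
    // ... upload jac.nnzValues to the device ...
}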

View File

@@ -1,6 +1,7 @@
 #ifndef OPM_MISC_HPP
 #define OPM_MISC_HPP
+#ifdef HAVE_ROCSPARSE
 #include <hip/hip_runtime_api.h>
 #include <hip/hip_version.h>
 #include <sstream>
@@ -40,6 +41,7 @@
         OPM_THROW(std::logic_error, oss.str()); \
     } \
 } while(0)
+#endif
 namespace Opm::Accelerator {
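The two added lines wrap the HIP-specific part of this header in #ifdef HAVE_ROCSPARSE, so it can be included in builds that do not have the HIP headers. A rough sketch of the resulting shape, assuming the guarded macro is the HIP_CHECK used in the rocsparse hunks further down; the body here is an approximation that throws directly instead of going through OPM_THROW:

#ifndef OPM_MISC_HPP
#define OPM_MISC_HPP

#ifdef HAVE_ROCSPARSE
#include <hip/hip_runtime_api.h>
#include <sstream>
#include <stdexcept>

// Abbreviated stand-in for the guarded error-check macro; the real header
// builds its message in an ostringstream and throws via OPM_THROW.
#define HIP_CHECK(STAT)                              \
    do {                                             \
        const hipError_t stat_ = (STAT);             \
        if (stat_ != hipSuccess) {                   \
            std::ostringstream oss;                  \
            oss << "HIP error code " << static_cast<int>(stat_); \
            throw std::logic_error(oss.str());       \
        }                                            \
    } while (0)
#endif // HAVE_ROCSPARSE

namespace Opm::Accelerator {
// declarations that do not need HIP can stay outside the guard
}

#endif // OPM_MISC_HPP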

View File

@@ -38,10 +38,11 @@
 // otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
 #define COPY_ROW_BY_ROW 0
-#if HAVE_OPENMP
 #include <thread>
-#include <omp.h>
 extern std::shared_ptr<std::thread> copyThread;
+#if HAVE_OPENMP
+#include <omp.h>
 #endif // HAVE_OPENMP
 namespace Opm::Accelerator {
@@ -342,11 +343,17 @@ copy_system_to_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
 #else
     cudaMemcpyAsync(d_bVals, matrix->nnzValues,
                     nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
-    if (useJacMatrix) {
+    bool use_multithreading = true;
 #if HAVE_OPENMP
-        if(omp_get_max_threads() > 1)
-            copyThread->join();
+    if(omp_get_max_threads() == 1)
+        use_multithreading = false;
 #endif
+    if (useJacMatrix) {
+        if(use_multithreading)
+            copyThread->join();
         cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
                         nnzbs_prec * block_size * block_size * sizeof(Scalar),
                         cudaMemcpyHostToDevice, stream);
@@ -399,12 +406,17 @@ update_system_on_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
 #else
     cudaMemcpyAsync(d_bVals, matrix->nnzValues,
                     nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
-    if (useJacMatrix) {
+    bool use_multithreading = true;
 #if HAVE_OPENMP
-        if (omp_get_max_threads() > 1) {
-            copyThread->join();
-        }
+    if (omp_get_max_threads() == 1)
+        use_multithreading = false;
 #endif
+    if (useJacMatrix) {
+        if (use_multithreading)
+            copyThread->join();
         cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
                         nnzbs_prec * block_size * block_size * sizeof(Scalar),
                         cudaMemcpyHostToDevice, stream);
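In both hunks the join has to happen before the second cudaMemcpyAsync is enqueued: that copy reads jacMatrix->nnzValues from host memory, and the buffer is only guaranteed to hold its final contents once copyThread has finished. A small stand-alone sketch of that ordering constraint, with a plain function standing in for the asynchronous device upload (enqueue_upload, update_system and the vector arguments are illustrative, not CUDA or OPM API):

#include <memory>
#include <thread>
#include <vector>

std::shared_ptr<std::thread> copyThread;

// Stand-in for cudaMemcpyAsync(): the real call may only enqueue the
// transfer, so the host buffer must already be fully written when issued.
void enqueue_upload(const std::vector<double>& hostBuffer)
{
    (void)hostBuffer; // pretend this starts an asynchronous host-to-device copy
}

void update_system(const std::vector<double>& mainVals,
                   const std::vector<double>& jacVals,
                   bool useJacMatrix,
                   bool use_multithreading)
{
    enqueue_upload(mainVals);            // main matrix: no dependency on copyThread

    if (useJacMatrix) {
        if (use_multithreading)
            copyThread->join();          // jacVals is complete only after this
        enqueue_upload(jacVals);         // safe: host buffer fully written
    }
}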

View File

@@ -31,10 +31,11 @@
 #include <sstream>
-#if HAVE_OPENMP
 #include <thread>
-#include <omp.h>
 extern std::shared_ptr<std::thread> copyThread;
+#if HAVE_OPENMP
+#include <omp.h>
 #endif //HAVE_OPENMP
 namespace Opm::Accelerator {
@@ -194,13 +195,15 @@ create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
     const unsigned int bs = block_size;
     auto *matToDecompose = jacMat ? jacMat : mat;
+    bool use_multithreading = true;
-    if (jacMat) {
 #if HAVE_OPENMP
-        if (omp_get_max_threads() > 1) {
-            copyThread->join();
-        }
+    if (omp_get_max_threads() == 1)
+        use_multithreading = false;
 #endif
+    if (jacMat && use_multithreading) {
+        copyThread->join();
     }
     // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp

View File

@@ -73,11 +73,11 @@ openclSolverBackend(int verbosity_,
     using PreconditionerType = typename Opm::Accelerator::PreconditionerType;
     if (use_cpr) {
-        prec = openclPreconditioner<Scalar,block_size>::create(PreconditionerType::CPR,verbosity,opencl_ilu_parallel);
+        prec = openclPreconditioner<Scalar, block_size>::create(PreconditionerType::CPR, verbosity, opencl_ilu_parallel);
     } else if (use_isai) {
-        prec = openclPreconditioner<Scalar,block_size>::create(PreconditionerType::BISAI,verbosity,opencl_ilu_parallel);
+        prec = openclPreconditioner<Scalar, block_size>::create(PreconditionerType::BISAI, verbosity, opencl_ilu_parallel);
     } else {
-        prec = openclPreconditioner<Scalar,block_size>::create(PreconditionerType::BILU0,verbosity,opencl_ilu_parallel);
+        prec = openclPreconditioner<Scalar, block_size>::create(PreconditionerType::BILU0, verbosity, opencl_ilu_parallel);
     }
     std::ostringstream out;
@@ -225,8 +225,10 @@ openclSolverBackend(int verbosity_,
 template<class Scalar, unsigned int block_size>
 openclSolverBackend<Scalar,block_size>::
-openclSolverBackend(int verbosity_, int maxit_,
-                    Scalar tolerance_, bool opencl_ilu_parallel_)
+openclSolverBackend(int verbosity_,
+                    int maxit_,
+                    Scalar tolerance_,
+                    bool opencl_ilu_parallel_)
     : Base(verbosity_, maxit_, tolerance_)
     , opencl_ilu_parallel(opencl_ilu_parallel_)
 {
@@ -245,7 +247,8 @@ setOpencl(std::shared_ptr<cl::Context>& context_,
 template<class Scalar, unsigned int block_size>
 void openclSolverBackend<Scalar,block_size>::
-gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res)
+gpu_pbicgstab(WellContributions<Scalar>& wellContribs,
+              BdaResult& res)
 {
     float it;
     Scalar rho, rhop, beta, alpha, omega, tmp1, tmp2;

View File

@@ -30,11 +30,11 @@
 #include <sstream>
+#include <thread>
+extern std::shared_ptr<std::thread> copyThread;
 #if HAVE_OPENMP
-#include <thread>
 #include <omp.h>
-extern std::shared_ptr<std::thread> copyThread;
 #endif //HAVE_OPENMP
 namespace Opm::Accelerator {
@@ -193,13 +193,18 @@ template <class Scalar, unsigned int block_size>
 void rocsparseBILU0<Scalar, block_size>::
 copy_system_to_gpu(Scalar *d_Avals) {
     Timer t;
-    if (this->useJacMatrix) {
+    bool use_multithreading = true;
 #if HAVE_OPENMP
-        if (omp_get_max_threads() > 1) {
-            copyThread->join();
-        }
+    if (omp_get_max_threads() == 1)
+        use_multithreading = false;
 #endif
+    if (this->useJacMatrix) {
+        if (use_multithreading) {
+            copyThread->join();
+        }
         HIP_CHECK(hipMemcpyAsync(d_Mrows, this->jacMat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, this->stream));
         HIP_CHECK(hipMemcpyAsync(d_Mcols, this->jacMat->colIndices, sizeof(rocsparse_int) * this->nnzbs_prec, hipMemcpyHostToDevice, this->stream));
         HIP_CHECK(hipMemcpyAsync(d_Mvals, this->jacMat->nnzValues, sizeof(Scalar) * this->nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, this->stream));
@@ -220,13 +225,18 @@ template <class Scalar, unsigned int block_size>
 void rocsparseBILU0<Scalar, block_size>::
 update_system_on_gpu(Scalar *d_Avals) {
     Timer t;
+    bool use_multithreading = true;
+#if HAVE_OPENMP
+    if (omp_get_max_threads() == 1)
+        use_multithreading = false;
+#endif
     if (this->useJacMatrix) {
-#if HAVE_OPENMP
-        if (omp_get_max_threads() > 1) {
+        if (use_multithreading) {
             copyThread->join();
         }
-#endif
         HIP_CHECK(hipMemcpyAsync(d_Mvals, this->jacMat->nnzValues, sizeof(Scalar) * this->nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, this->stream));
     } else {
         HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(Scalar) * nnz, hipMemcpyDeviceToDevice, this->stream));