Mirror of https://github.com/OPM/opm-simulators.git (synced 2025-02-02 09:50:30 -06:00)
Merge pull request #5256 from BigDataAccelerate/overlap_copy_jacmat_gpu

Overlap creation of jacobian matrix with GPU data transfers

Commit: 6f094c558a
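The change is a producer/consumer handoff: the host-side assembly of the block-Jacobi copy runs on a helper std::thread while the main thread issues the GPU transfers that do not depend on it, and the helper is joined only right before the one upload that does. Below is a minimal, self-contained sketch of that pattern; the Matrices type and the three helper functions are hypothetical placeholders, not the OPM interfaces.

// Sketch only (not the OPM API): overlap host-side block-Jacobi assembly with GPU uploads.
// The Matrices type and all function names are hypothetical placeholders.
#include <iostream>
#include <memory>
#include <thread>

struct Matrices {};  // stands in for the system matrix and its block-Jacobi copy

void buildBlockJacobiCopy(Matrices&)     { std::cout << "build block-Jacobi copy on the host\n"; }
void uploadIndependentData(const Matrices&) { std::cout << "async uploads that do not need the copy\n"; }
void uploadJacobiValues(const Matrices&)    { std::cout << "async upload of the block-Jacobi values\n"; }

void solveStep(Matrices& m, bool use_multithreading)
{
    std::shared_ptr<std::thread> copyThread;
    if (use_multithreading) {
        // Host assembly runs concurrently with the transfers issued below.
        copyThread = std::make_shared<std::thread>([&]() { buildBlockJacobiCopy(m); });
    } else {
        buildBlockJacobiCopy(m);
    }

    uploadIndependentData(m);  // independent of the block-Jacobi copy

    if (copyThread) {
        copyThread->join();    // the copy must be finished before its values are uploaded
    }
    uploadJacobiValues(m);
}

int main()
{
    Matrices m;
    solveStep(m, /*use_multithreading=*/true);
}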
@@ -43,6 +43,13 @@
 #include <opm/grid/polyhedralgrid.hh>
 
+#if HAVE_OPENMP
+#include <thread>
+#include <omp.h>
+
+std::shared_ptr<std::thread> copyThread;
+#endif // HAVE_OPENMP
+
 namespace Opm {
 namespace detail {
 
@@ -107,14 +114,28 @@ apply(Vector& rhs,
     }
 #endif
 
+    bool use_multithreading = false;
+#if HAVE_OPENMP
+    use_multithreading = omp_get_max_threads() > 1;
+#endif // HAVE_OPENMP
+
     if (numJacobiBlocks_ > 1) {
-        this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);
+        if(use_multithreading) {
+            //NOTE: copyThread can safely write to jacMat because in solve_system both matrix and *blockJacobiForGPUILU0_ diagonal entries
+            //are checked and potentially overwritten in replaceZeroDiagonal() by mainThread. However, no matter the thread writing sequence,
+            //the final entry in jacMat is correct.
+            copyThread = std::make_shared<std::thread>([&](){this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);});
+        }
+        else {
+            this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);
+        }
 
         // Const_cast needed since the CUDA stuff overwrites values for better matrix condition..
         bridge_->solve_system(&matrix, blockJacobiForGPUILU0_.get(),
                               numJacobiBlocks_, rhs, *wellContribs, result);
     }
     else
         bridge_->solve_system(&matrix, &matrix,
                               numJacobiBlocks_, rhs, *wellContribs, result);
     if (result.converged) {
         // get result vector x from non-Dune backend, iff solve was successful
@@ -38,6 +38,12 @@
 // otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
 #define COPY_ROW_BY_ROW 0
 
+#if HAVE_OPENMP
+#include <thread>
+#include <omp.h>
+extern std::shared_ptr<std::thread> copyThread;
+#endif // HAVE_OPENMP
+
 namespace Opm
 {
 namespace Accelerator
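The helper thread is created in one translation unit (the ISTL solver, where copyThread is defined) and joined in another (the cusparse/rocsparse backends, which see it through the extern declaration above). A compressed single-file sketch of that handoff follows; apart from copyThread, every name is a placeholder, and both sides live in one file only so the example compiles on its own.

// Sketch only: a globally shared copy thread, assigned by the code that prepares the
// system and joined by the code that uploads it. In the PR the definition lives in the
// ISTL solver translation unit and the backends use an 'extern' declaration.
#include <memory>
#include <thread>
#include <vector>

std::shared_ptr<std::thread> copyThread;   // shared handle: producer assigns, consumer joins

std::vector<double> jacobiValues;          // stands in for the block-Jacobi nonzero values

// Producer side (cf. apply()): spawn the host-side copy when multithreading is available.
void prepareSystem(bool use_multithreading)
{
    auto buildCopy = [] { jacobiValues.assign(1000, 1.0); };
    if (use_multithreading)
        copyThread = std::make_shared<std::thread>(buildCopy);
    else
        buildCopy();
}

// Consumer side (cf. copy_system_to_gpu()): wait only right before the data is needed.
void uploadSystem(bool use_multithreading)
{
    // ... transfers that do not depend on jacobiValues would be issued here ...
    if (use_multithreading && copyThread)
        copyThread->join();                // after this, jacobiValues is complete
    // ... upload jacobiValues ...
}

int main()
{
    const bool use_multithreading = std::thread::hardware_concurrency() > 1;
    prepareSystem(use_multithreading);
    uploadSystem(use_multithreading);
}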
@@ -306,6 +312,11 @@ template <unsigned int block_size>
 void cusparseSolverBackend<block_size>::copy_system_to_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix) {
     Timer t;
 
+    cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
+
 #if COPY_ROW_BY_ROW
     int sum = 0;
     for (int i = 0; i < Nb; ++i) {
@@ -317,25 +328,27 @@ void cusparseSolverBackend<block_size>::copy_system_to_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix) {
 #else
     cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
     if (useJacMatrix) {
+#if HAVE_OPENMP
+        if(omp_get_max_threads() > 1)
+            copyThread->join();
+#endif
         cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
     } else {
        cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
     }
 #endif
 
-    cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
     if (useJacMatrix) {
         cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int), cudaMemcpyHostToDevice, stream);
         cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
     }
-    cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
-    cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
 
-    if (verbosity > 2) {
+    if (verbosity >= 3) {
         cudaStreamSynchronize(stream);
+        c_copy += t.stop();
         std::ostringstream out;
-        out << "cusparseSolver::copy_system_to_gpu(): " << t.stop() << " s";
+        out << "---cusparseSolver::copy_system_to_gpu(): " << t.elapsed() << " s";
         OpmLog::info(out.str());
     }
 } // end copy_system_to_gpu()
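The reordering above exploits the fact that cudaMemcpyAsync enqueues work on the stream and returns to the calling thread, so the transfers that do not depend on the Jacobi values are issued before the join and proceed while the copy thread finishes its host-side assembly. A reduced CUDA sketch of that ordering, using pinned host buffers (so the copies are genuinely asynchronous) and placeholder buffer names rather than the cusparseSolverBackend members:

// Sketch only: issue independent async transfers first, join the host copy thread,
// then enqueue the transfer that depends on its result.
#include <cstdio>
#include <thread>
#include <cuda_runtime.h>

int main()
{
    const int N = 1 << 20;
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Pinned host memory so the async copies can truly overlap with CPU work.
    double *h_b, *h_jacVals;
    cudaMallocHost(&h_b, N * sizeof(double));
    cudaMallocHost(&h_jacVals, N * sizeof(double));

    double *d_b, *d_jacVals;
    cudaMalloc(&d_b, N * sizeof(double));
    cudaMalloc(&d_jacVals, N * sizeof(double));

    // CPU "copy thread": prepares the Jacobi values while the transfers below are in flight.
    std::thread copyThread([&] {
        for (int i = 0; i < N; ++i) h_jacVals[i] = 1.0;
    });

    // Independent work: these never read h_jacVals, so they are enqueued before the join.
    cudaMemcpyAsync(d_b, h_b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
    cudaMemsetAsync(d_jacVals, 0, N * sizeof(double), stream);

    copyThread.join();  // h_jacVals is complete from here on

    // Dependent transfer: only enqueued after the producer thread has finished.
    cudaMemcpyAsync(d_jacVals, h_jacVals, N * sizeof(double), cudaMemcpyHostToDevice, stream);

    cudaStreamSynchronize(stream);
    std::printf("uploads complete\n");

    cudaFree(d_b);
    cudaFree(d_jacVals);
    cudaFreeHost(h_b);
    cudaFreeHost(h_jacVals);
    cudaStreamDestroy(stream);
}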
@@ -346,6 +359,9 @@ template <unsigned int block_size>
 void cusparseSolverBackend<block_size>::update_system_on_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix) {
     Timer t;
 
+    cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
+
 #if COPY_ROW_BY_ROW
     int sum = 0;
     for (int i = 0; i < Nb; ++i) {
@@ -357,19 +373,23 @@ void cusparseSolverBackend<block_size>::update_system_on_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix) {
 #else
     cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
     if (useJacMatrix) {
+#if HAVE_OPENMP
+        if(omp_get_max_threads() > 1)
+            copyThread->join();
+#endif
         cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
     } else {
         cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
     }
 #endif
 
-    cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
-    cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
-
-    if (verbosity > 2) {
+    if (verbosity >= 3) {
         cudaStreamSynchronize(stream);
+        c_copy += t.stop();
         std::ostringstream out;
-        out << "cusparseSolver::update_system_on_gpu(): " << t.stop() << " s";
+        out << "-----cusparseSolver::update_system_on_gpu(): " << t.elapsed() << " s\n";
+        out << "---cusparseSolver::cum copy: " << c_copy << " s";
         OpmLog::info(out.str());
     }
 } // end update_system_on_gpu()
@@ -71,6 +71,8 @@ private:
     bool useJacMatrix = false;
     int nnzbs_prec;    // number of nonzero blocks in the matrix for preconditioner,
                        // could be jacMatrix or matrix
 
+    double c_copy = 0.0; // cumulative timer measuring the total time it takes to transfer the data to the GPU
+
     /// Solve linear system using ilu0-bicgstab
     /// \param[in] wellContribs   contains all WellContributions, to apply them separately, instead of adding them to matrix A
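The new c_copy member turns the per-call timing into a running total: each call adds its own transfer time to c_copy, and the verbose log prints both the per-call time and the cumulative total. A small stand-in using std::chrono, since the OPM Timer class itself is not part of this diff and its exact interface is assumed here:

// Sketch only: accumulate per-call transfer times into a running total, mirroring how
// c_copy is used in copy_system_to_gpu()/update_system_on_gpu(). Timer is a std::chrono
// stand-in, not Opm::Timer.
#include <chrono>
#include <cstdio>
#include <thread>

class Timer {
    std::chrono::steady_clock::time_point start_ = std::chrono::steady_clock::now();
public:
    double elapsed() const {  // seconds since construction
        return std::chrono::duration<double>(std::chrono::steady_clock::now() - start_).count();
    }
};

static double c_copy = 0.0;   // cumulative time spent in "transfers"

void copy_system_to_gpu_stub()
{
    Timer t;
    std::this_thread::sleep_for(std::chrono::milliseconds(5));  // pretend to copy data
    const double dt = t.elapsed();
    c_copy += dt;
    std::printf("copy_system_to_gpu(): %.4f s, cumulative copy: %.4f s\n", dt, c_copy);
}

int main()
{
    for (int i = 0; i < 3; ++i)
        copy_system_to_gpu_stub();
}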
@@ -87,6 +87,12 @@
 
 #include <cstddef>
 
+#if HAVE_OPENMP
+#include <thread>
+#include <omp.h>
+extern std::shared_ptr<std::thread> copyThread;
+#endif // HAVE_OPENMP
+
 namespace Opm
 {
 namespace Accelerator
@@ -431,21 +437,29 @@ void rocsparseSolverBackend<block_size>::copy_system_to_gpu(double *b) {
     HIP_CHECK(hipMemcpyAsync(d_Arows, mat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, stream));
     HIP_CHECK(hipMemcpyAsync(d_Acols, mat->colIndices, sizeof(rocsparse_int) * nnzb, hipMemcpyHostToDevice, stream));
     HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues, sizeof(double) * nnz, hipMemcpyHostToDevice, stream));
+    HIP_CHECK(hipMemsetAsync(d_x, 0, sizeof(double) * N, stream));
+    HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream));
+
     if (useJacMatrix) {
+#if HAVE_OPENMP
+        if(omp_get_max_threads() > 1)
+            copyThread->join();
+#endif
         HIP_CHECK(hipMemcpyAsync(d_Mrows, jacMat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, stream));
         HIP_CHECK(hipMemcpyAsync(d_Mcols, jacMat->colIndices, sizeof(rocsparse_int) * nnzbs_prec, hipMemcpyHostToDevice, stream));
         HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues, sizeof(double) * nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, stream));
     } else {
         HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(double) * nnz, hipMemcpyDeviceToDevice, stream));
     }
-    HIP_CHECK(hipMemsetAsync(d_x, 0, sizeof(double) * N, stream));
-    HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream));
 
     if (verbosity >= 3) {
         HIP_CHECK(hipStreamSynchronize(stream));
+        c_copy += t.stop();
         std::ostringstream out;
-        out << "rocsparseSolver::copy_system_to_gpu(): " << t.stop() << " s";
+        out << "-----rocsparseSolver::copy_system_to_gpu(): " << t.elapsed() << " s\n";
+        out << "---rocsparseSolver::cum copy: " << c_copy << " s";
         OpmLog::info(out.str());
     }
 } // end copy_system_to_gpu()
@@ -455,18 +469,26 @@ void rocsparseSolverBackend<block_size>::update_system_on_gpu(double *b) {
     Timer t;
 
     HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues, sizeof(double) * nnz, hipMemcpyHostToDevice, stream));
+    HIP_CHECK(hipMemsetAsync(d_x, 0, sizeof(double) * N, stream));
+    HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream));
+
     if (useJacMatrix) {
+#if HAVE_OPENMP
+        if (omp_get_max_threads() > 1)
+            copyThread->join();
+#endif
         HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues, sizeof(double) * nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, stream));
     } else {
         HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(double) * nnz, hipMemcpyDeviceToDevice, stream));
     }
-    HIP_CHECK(hipMemsetAsync(d_x, 0, sizeof(double) * N, stream));
-    HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream));
 
     if (verbosity >= 3) {
         HIP_CHECK(hipStreamSynchronize(stream));
+        c_copy += t.stop();
         std::ostringstream out;
-        out << "rocsparseSolver::update_system_on_gpu(): " << t.stop() << " s";
+        out << "-----rocsparseSolver::update_system_on_gpu(): " << t.elapsed() << " s\n";
+        out << "---rocsparseSolver::cum copy: " << c_copy << " s";
         OpmLog::info(out.str());
     }
 } // end update_system_on_gpu()
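Every HIP call in the rocsparse backend above is wrapped in HIP_CHECK. Its definition is outside this diff, so the following is only an assumed, typical form of such an error-checking macro, shown for context rather than as OPM's actual implementation:

// Sketch only: a typical error-checking wrapper for HIP runtime calls, assumed for
// illustration; the HIP_CHECK used by rocsparseSolverBackend is defined elsewhere.
#include <cstdio>
#include <cstdlib>
#include <hip/hip_runtime.h>

#define HIP_CHECK(call)                                                        \
    do {                                                                       \
        const hipError_t err_ = (call);                                        \
        if (err_ != hipSuccess) {                                              \
            std::fprintf(stderr, "HIP error %s at %s:%d\n",                    \
                         hipGetErrorString(err_), __FILE__, __LINE__);         \
            std::exit(EXIT_FAILURE);                                           \
        }                                                                      \
    } while (0)

int main()
{
    double* d_x = nullptr;
    HIP_CHECK(hipMalloc((void**)&d_x, 1024 * sizeof(double)));  // aborts with a message on failure
    HIP_CHECK(hipMemset(d_x, 0, 1024 * sizeof(double)));
    HIP_CHECK(hipFree(d_x));
    return 0;
}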
@@ -55,6 +55,8 @@ class rocsparseSolverBackend : public BdaSolver<block_size>
 
 private:
 
+    double c_copy = 0.0; // cumulative timer measuring the total time it takes to transfer the data to the GPU
+
     bool useJacMatrix = false;
 
     bool analysis_done = false;