OPT: overlap creation of jacMat with copy to GPU

Razvan Nane 2024-03-15 11:25:38 +01:00
parent a7ee48c2d5
commit 411a3978b6
5 changed files with 55 additions and 20 deletions
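The change starts the host-side block-Jacobi (jacMat) copy on its own std::thread and lets the GPU backends join that thread only right before they upload the block-Jacobi values, so the matrix construction overlaps with the other host-to-device transfers. A minimal sketch of the pattern, not the OPM code itself; apart from copyThread, the names (buildBlockJacobi, launchSolve, the vectors) are illustrative:

// Producer side: kick off the block-Jacobi copy asynchronously on the host.
#include <memory>
#include <thread>
#include <vector>

std::shared_ptr<std::thread> copyThread;       // shared with the GPU backend

void buildBlockJacobi(const std::vector<double>& matVals,
                      std::vector<double>& jacVals)
{
    jacVals = matVals;                         // stands in for copyMatToBlockJac()
}

void launchSolve(const std::vector<double>& matVals, std::vector<double>& jacVals)
{
    // Start filling jacVals while the solver call below proceeds.
    copyThread = std::make_shared<std::thread>([&]() { buildBlockJacobi(matVals, jacVals); });

    // The backend's copy_system_to_gpu()/update_system_on_gpu() joins copyThread
    // only immediately before it reads jacVals (see the sketch further down).
    // ... bridge/solver call goes here ...
}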

View File

@@ -43,6 +43,10 @@
 #include <opm/grid/polyhedralgrid.hh>
+#include <thread>
+std::shared_ptr<std::thread> copyThread;
 namespace Opm {
 namespace detail {
@@ -108,7 +112,8 @@ apply(Vector& rhs,
 #endif
         if (numJacobiBlocks_ > 1) {
-            this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);
+            copyThread = std::make_shared<std::thread>([&](){this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);});
             // Const_cast needed since the CUDA stuff overwrites values for better matrix condition..
             bridge_->solve_system(&matrix, blockJacobiForGPUILU0_.get(),
                                   numJacobiBlocks_, rhs, *wellContribs, result);

View File

@@ -30,6 +30,8 @@
 #include <opm/simulators/linalg/bda/BdaResult.hpp>
 #include <opm/simulators/linalg/bda/cuda/cuda_header.hpp>
+#include <thread>
 #include "cublas_v2.h"
 #include "cusparse_v2.h"
 // For more information about cusparse, check https://docs.nvidia.com/cuda/cusparse/index.html
@@ -38,6 +40,8 @@
 // otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
 #define COPY_ROW_BY_ROW 0
+extern std::shared_ptr<std::thread> copyThread;
 namespace Opm
 {
 namespace Accelerator
@@ -306,6 +310,11 @@ template <unsigned int block_size>
 void cusparseSolverBackend<block_size>::copy_system_to_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix) {
     Timer t;
+    cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
 #if COPY_ROW_BY_ROW
     int sum = 0;
     for (int i = 0; i < Nb; ++i) {
@@ -317,25 +326,24 @@ void cusparseSolverBackend<block_size>::copy_system_to_gpu(std::shared_ptr<Block
 #else
     cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
     if (useJacMatrix) {
+        copyThread->join();
         cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
     } else {
         cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
     }
 #endif
-    cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
     if (useJacMatrix) {
         cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int), cudaMemcpyHostToDevice, stream);
         cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
     }
-    cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
-    cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
-    if (verbosity > 2) {
+    if (verbosity >= 3) {
         cudaStreamSynchronize(stream);
+        c_copy += t.stop();
         std::ostringstream out;
-        out << "cusparseSolver::copy_system_to_gpu(): " << t.stop() << " s";
+        out << "---cusparseSolver::copy_system_to_gpu(): " << t.elapsed() << " s";
         OpmLog::info(out.str());
     }
 } // end copy_system_to_gpu()
@@ -346,6 +354,9 @@ template <unsigned int block_size>
 void cusparseSolverBackend<block_size>::update_system_on_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix) {
     Timer t;
+    cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
 #if COPY_ROW_BY_ROW
     int sum = 0;
     for (int i = 0; i < Nb; ++i) {
@@ -357,19 +368,20 @@ void cusparseSolverBackend<block_size>::update_system_on_gpu(std::shared_ptr<Blo
 #else
     cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
     if (useJacMatrix) {
+        copyThread->join();
         cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
     } else {
         cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
     }
 #endif
-    cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
-    cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
-    if (verbosity > 2) {
+    if (verbosity >= 3) {
         cudaStreamSynchronize(stream);
+        c_copy += t.stop();
         std::ostringstream out;
-        out << "cusparseSolver::update_system_on_gpu(): " << t.stop() << " s";
+        out << "-----cusparseSolver::update_system_on_gpu(): " << t.elapsed() << " s\n";
+        out << "---cusparseSolver::cum copy: " << c_copy << " s";
         OpmLog::info(out.str());
     }
 } // end update_system_on_gpu()
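For reference, the reordered copy path in both backends has the shape below: transfers that do not depend on the block-Jacobi matrix are enqueued first, and copyThread is joined only immediately before its output is uploaded, which is what buys the overlap. A sketch assuming a plain CUDA stream; the buffer names and sizes are placeholders rather than the backend's members:

#include <cstddef>
#include <cuda_runtime.h>
#include <memory>
#include <thread>

extern std::shared_ptr<std::thread> copyThread;   // filled by the producer thread

void copySystemToGpu(cudaStream_t stream,
                     const int* hCols, int* dCols, size_t nCols,
                     const double* hVals, double* dVals, size_t nVals,
                     const double* hJacVals, double* dJacVals, size_t nJacVals,
                     bool useJacMatrix)
{
    // These copies do not touch the block-Jacobi values, so they can be
    // enqueued while the producer thread is still filling hJacVals.
    cudaMemcpyAsync(dCols, hCols, nCols * sizeof(int), cudaMemcpyHostToDevice, stream);
    cudaMemcpyAsync(dVals, hVals, nVals * sizeof(double), cudaMemcpyHostToDevice, stream);

    if (useJacMatrix) {
        copyThread->join();                       // hJacVals is complete only after this
        cudaMemcpyAsync(dJacVals, hJacVals, nJacVals * sizeof(double),
                        cudaMemcpyHostToDevice, stream);
    } else {
        cudaMemcpyAsync(dJacVals, dVals, nVals * sizeof(double),
                        cudaMemcpyDeviceToDevice, stream);
    }
}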

View File

@@ -71,6 +71,8 @@ private:
     bool useJacMatrix = false;
     int nnzbs_prec; // number of nonzero blocks in the matrix for preconditioner
                     // could be jacMatrix or matrix
+    double c_copy = 0.0; // cumulative timer measuring the total time it takes to transfer the data to the GPU
     /// Solve linear system using ilu0-bicgstab
     /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
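The new c_copy member only accumulates the per-call transfer time so the verbose log can print a running total next to each call's own time. The real code uses Opm's Timer; a plain std::chrono version of the same accumulation would look like:

#include <chrono>
#include <iostream>

struct CopyTiming {
    double c_copy = 0.0;                          // cumulative seconds spent copying

    template <class F>
    void timed(F&& enqueueAndSync)                // enqueue copies, then sync the stream
    {
        const auto start = std::chrono::steady_clock::now();
        enqueueAndSync();
        const std::chrono::duration<double> dt = std::chrono::steady_clock::now() - start;
        c_copy += dt.count();
        std::cout << "copy: " << dt.count() << " s, cumulative: " << c_copy << " s\n";
    }
};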

View File

@@ -41,6 +41,8 @@
 #include <opm/simulators/linalg/bda/BdaResult.hpp>
+#include <thread>
 #include <hip/hip_runtime_api.h>
 #include <hip/hip_version.h>
@@ -87,6 +89,8 @@
 #include <cstddef>
+extern std::shared_ptr<std::thread> copyThread;
 namespace Opm
 {
 namespace Accelerator
@@ -431,21 +435,26 @@ void rocsparseSolverBackend<block_size>::copy_system_to_gpu(double *b) {
     HIP_CHECK(hipMemcpyAsync(d_Arows, mat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, stream));
     HIP_CHECK(hipMemcpyAsync(d_Acols, mat->colIndices, sizeof(rocsparse_int) * nnzb, hipMemcpyHostToDevice, stream));
     HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues, sizeof(double) * nnz, hipMemcpyHostToDevice, stream));
+    HIP_CHECK(hipMemsetAsync(d_x, 0, sizeof(double) * N, stream));
+    HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream));
     if (useJacMatrix) {
+        copyThread->join();
         HIP_CHECK(hipMemcpyAsync(d_Mrows, jacMat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, stream));
         HIP_CHECK(hipMemcpyAsync(d_Mcols, jacMat->colIndices, sizeof(rocsparse_int) * nnzbs_prec, hipMemcpyHostToDevice, stream));
         HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues, sizeof(double) * nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, stream));
     } else {
         HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(double) * nnz, hipMemcpyDeviceToDevice, stream));
     }
-    HIP_CHECK(hipMemsetAsync(d_x, 0, sizeof(double) * N, stream));
-    HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream));
     if (verbosity >= 3) {
         HIP_CHECK(hipStreamSynchronize(stream));
+        c_copy += t.stop();
         std::ostringstream out;
-        out << "rocsparseSolver::copy_system_to_gpu(): " << t.stop() << " s";
+        out << "-----rocsparseSolver::copy_system_to_gpu(): " << t.elapsed() << " s\n";
+        out << "---rocsparseSolver::cum copy: " << c_copy << " s";
         OpmLog::info(out.str());
     }
 } // end copy_system_to_gpu()
@@ -455,18 +464,23 @@ void rocsparseSolverBackend<block_size>::update_system_on_gpu(double *b) {
     Timer t;
     HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues, sizeof(double) * nnz, hipMemcpyHostToDevice, stream));
+    HIP_CHECK(hipMemsetAsync(d_x, 0, sizeof(double) * N, stream));
+    HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream));
     if (useJacMatrix) {
+        copyThread->join();
         HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues, sizeof(double) * nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, stream));
     } else {
         HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(double) * nnz, hipMemcpyDeviceToDevice, stream));
     }
-    HIP_CHECK(hipMemsetAsync(d_x, 0, sizeof(double) * N, stream));
-    HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream));
     if (verbosity >= 3) {
         HIP_CHECK(hipStreamSynchronize(stream));
+        c_copy += t.stop();
         std::ostringstream out;
-        out << "rocsparseSolver::update_system_on_gpu(): " << t.stop() << " s";
+        out << "-----rocsparseSolver::update_system_on_gpu(): " << t.elapsed() << " s\n";
+        out << "---rocsparseSolver::cum copy: " << c_copy << " s";
         OpmLog::info(out.str());
     }
 } // end update_system_on_gpu()

View File

@@ -55,6 +55,8 @@ class rocsparseSolverBackend : public BdaSolver<block_size>
 private:
+    double c_copy = 0.0; // cumulative timer measuring the total time it takes to transfer the data to the GPU
     bool useJacMatrix = false;
     bool analysis_done = false;