OPT: overlap jacMat creation with copy to GPU

This commit is contained in:
Razvan Nane
2024-03-15 11:25:38 +01:00
parent a7ee48c2d5
commit 411a3978b6
5 changed files with 55 additions and 20 deletions

View File

@@ -41,6 +41,8 @@
#include <opm/simulators/linalg/bda/BdaResult.hpp>
#include <thread>
#include <hip/hip_runtime_api.h>
#include <hip/hip_version.h>
@@ -87,6 +89,8 @@
#include <cstddef>
extern std::shared_ptr<std::thread> copyThread;
namespace Opm
{
namespace Accelerator
@@ -431,21 +435,26 @@ void rocsparseSolverBackend<block_size>::copy_system_to_gpu(double *b) {
HIP_CHECK(hipMemcpyAsync(d_Arows, mat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Acols, mat->colIndices, sizeof(rocsparse_int) * nnzb, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues, sizeof(double) * nnz, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemsetAsync(d_x, 0, sizeof(double) * N, stream));
HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream));
if (useJacMatrix) {
copyThread->join();
HIP_CHECK(hipMemcpyAsync(d_Mrows, jacMat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Mcols, jacMat->colIndices, sizeof(rocsparse_int) * nnzbs_prec, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues, sizeof(double) * nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, stream));
} else {
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(double) * nnz, hipMemcpyDeviceToDevice, stream));
}
HIP_CHECK(hipMemsetAsync(d_x, 0, sizeof(double) * N, stream));
HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream));
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
c_copy += t.stop();
std::ostringstream out;
out << "rocsparseSolver::copy_system_to_gpu(): " << t.stop() << " s";
OpmLog::info(out.str());
out << "-----rocsparseSolver::copy_system_to_gpu(): " << t.elapsed() << " s\n";
out << "---rocsparseSolver::cum copy: " << c_copy << " s";
OpmLog::info(out.str());
}
} // end copy_system_to_gpu()
@@ -455,18 +464,23 @@ void rocsparseSolverBackend<block_size>::update_system_on_gpu(double *b) {
Timer t;
HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues, sizeof(double) * nnz, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemsetAsync(d_x, 0, sizeof(double) * N, stream));
HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream));
if (useJacMatrix) {
copyThread->join();
HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues, sizeof(double) * nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, stream));
} else {
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(double) * nnz, hipMemcpyDeviceToDevice, stream));
}
HIP_CHECK(hipMemsetAsync(d_x, 0, sizeof(double) * N, stream));
HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream));
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
c_copy += t.stop();
std::ostringstream out;
out << "rocsparseSolver::update_system_on_gpu(): " << t.stop() << " s";
out << "-----rocsparseSolver::update_system_on_gpu(): " << t.elapsed() << " s\n";
out << "---rocsparseSolver::cum copy: " << c_copy << " s";
OpmLog::info(out.str());
}
} // end update_system_on_gpu()