WellContributions now copies vectors for MultisegmentWells, to reduce the number of copies

This commit is contained in:
T.D. (Tongdong) Qiu 2020-05-18 18:48:58 +02:00
parent 0f273ebdf6
commit f6036ffa8e
4 changed files with 44 additions and 35 deletions

View File

@@ -19,16 +19,10 @@
#include <cstdlib>
#include <cstring>
#include <config.h> // CMake
#include <fstream>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#if HAVE_UMFPACK
#include <dune/istl/umfpack.hh>
#include <opm/simulators/linalg/bda/cuda_header.hpp>
#endif // HAVE_UMFPACK
#include <opm/simulators/linalg/bda/MultisegmentWellContribution.hpp>
@@ -63,33 +57,22 @@ namespace Opm
z1.resize(Mb * dim_wells);
z2.resize(Mb * dim_wells);
// allocate pinned memory on host
cudaMallocHost(&h_x, sizeof(double) * N);
cudaMallocHost(&h_y, sizeof(double) * N);
umfpack_di_symbolic(M, M, Dcols.data(), Drows.data(), Dvals.data(), &UMFPACK_Symbolic, nullptr, nullptr);
umfpack_di_numeric(Dcols.data(), Drows.data(), Dvals.data(), UMFPACK_Symbolic, &UMFPACK_Numeric, nullptr, nullptr);
}
// Destructor: releases the resources acquired in the constructor.
MultisegmentWellContribution::~MultisegmentWellContribution()
{
// free the pinned host buffers allocated with cudaMallocHost in the constructor
cudaFreeHost(h_x);
cudaFreeHost(h_y);
// release the UMFPACK symbolic and numeric factorizations of the D matrix
umfpack_di_free_symbolic(&UMFPACK_Symbolic);
umfpack_di_free_numeric(&UMFPACK_Numeric);
}
// Apply the MultisegmentWellContribution, similar to MultisegmentWell::apply()
// h_x and h_y reside on host
// y -= (C^T * (D^-1 * (B * x)))
void MultisegmentWellContribution::apply(double *d_x, double *d_y)
void MultisegmentWellContribution::apply(double *h_x, double *h_y)
{
// copy vectors x and y from GPU to CPU
cudaMemcpyAsync(h_x, d_x, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
cudaMemcpyAsync(h_y, d_y, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
// reset z1 and z2
std::fill(z1.begin(), z1.end(), 0.0);
std::fill(z2.begin(), z2.end(), 0.0);
@@ -128,10 +111,6 @@ namespace Opm
}
}
}
// copy vector y from CPU to GPU
cudaMemcpyAsync(d_y, h_y, sizeof(double) * N, cudaMemcpyHostToDevice, stream);
cudaStreamSynchronize(stream);
}
void MultisegmentWellContribution::setCudaStream(cudaStream_t stream_)

View File

@@ -47,8 +47,7 @@ namespace Opm
unsigned int M; // number of rows, M == dim_wells*Mb
unsigned int Mb; // number of blockrows in C, D and B
cudaStream_t stream;
double *h_x = nullptr, *h_y = nullptr; // CUDA pinned memory for GPU memcpy
cudaStream_t stream; // not actually used yet, will be when MultisegmentWellContribution are applied on GPU
// C and B are stored in BCRS format, D is stored in CSC format (Dune::UMFPack)
// Sparsity pattern for C is not stored, since it is the same as B
@@ -96,11 +95,11 @@ namespace Opm
/// Destroy a MultisegmentWellContribution, and free memory
~MultisegmentWellContribution();
/// Apply the MultisegmentWellContribution on GPU
/// Apply the MultisegmentWellContribution on CPU
/// performs y -= (C^T * (D^-1 * (B*x))) for MultisegmentWell
/// \param[in] d_x vector x, must be on GPU
/// \param[inout] d_y vector y, must be on GPU
void apply(double *d_x, double *d_y);
/// \param[in] h_x vector x, must be on CPU
/// \param[inout] h_y vector y, must be on CPU
void apply(double *h_x, double *h_y);
};

View File

@@ -127,7 +127,6 @@ namespace Opm
}
void WellContributions::alloc()
{
if (num_std_wells > 0) {
@@ -145,6 +144,12 @@ namespace Opm
WellContributions::~WellContributions()
{
// free pinned memory for MultisegmentWellContributions
if (h_x) {
cudaFreeHost(h_x);
cudaFreeHost(h_y);
}
// delete MultisegmentWellContributions
for (auto ms : multisegments) {
delete ms;
@@ -169,8 +174,29 @@ namespace Opm
void WellContributions::apply(double *d_x, double *d_y)
{
// apply MultisegmentWells
for(MultisegmentWellContribution *well : multisegments){
well->apply(d_x, d_y);
if (num_ms_wells > 0) {
// allocate pinned memory on host if not yet done
if (h_x == nullptr) {
cudaMallocHost(&h_x, sizeof(double) * N);
cudaMallocHost(&h_y, sizeof(double) * N);
}
// make sure the stream is empty to start timing
cudaStreamSynchronize(stream);
// copy vectors x and y from GPU to CPU
cudaMemcpyAsync(h_x, d_x, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
cudaMemcpyAsync(h_y, d_y, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
// actually apply MultisegmentWells
for (MultisegmentWellContribution *well : multisegments) {
well->apply(h_x, h_y);
}
// copy vector y from CPU to GPU
cudaMemcpyAsync(d_y, h_y, sizeof(double) * N, cudaMemcpyHostToDevice, stream);
cudaStreamSynchronize(stream);
}
// apply StandardWells
@@ -243,6 +269,7 @@ namespace Opm
unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
double *Cvalues)
{
this->N = Nb * dim;
MultisegmentWellContribution *well = new MultisegmentWellContribution(dim, dim_wells, Nb, Mb, BnumBlocks, Bvalues, BcolIndices, BrowPointers, DnumBlocks, Dvalues, DcolPointers, DrowIndices, Cvalues);
multisegments.emplace_back(well);
++num_ms_wells;

View File

@@ -63,6 +63,7 @@ namespace Opm
unsigned int num_blocks_so_far = 0; // keep track of where next data is written
unsigned int num_std_wells_so_far = 0; // keep track of where next data is written
unsigned int *val_pointers = nullptr; // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
unsigned int N; // number of rows (not blockrows) in vectors x and y
bool allocated = false;
std::vector<MultisegmentWellContribution*> multisegments;
cudaStream_t stream;
@@ -77,6 +78,9 @@ namespace Opm
double *d_z2 = nullptr;
unsigned int *d_val_pointers = nullptr;
double *h_x = nullptr, *h_y = nullptr; // CUDA pinned memory for GPU memcpy
public:
/// StandardWell has C, D and B matrices that need to be copied
@@ -98,9 +102,9 @@ namespace Opm
/// Apply all Wells in this object
/// performs y -= (C^T * (D^-1 * (B*x))) for all Wells
/// \param[in] x vector x
/// \param[inout] y vector y
void apply(double *x, double *y);
/// \param[in] d_x vector x, must be on GPU
/// \param[inout] d_y vector y, must be on GPU
void apply(double *d_x, double *d_y);
/// Allocate memory for the StandardWells
void alloc();