mirror of
https://github.com/OPM/opm-simulators.git
synced 2025-01-07 23:13:01 -06:00
Now WellContributions copies vectors for MultisegmentWells to reduce number of copies
This commit is contained in:
parent
0f273ebdf6
commit
f6036ffa8e
@ -19,16 +19,10 @@
|
||||
|
||||
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <config.h> // CMake
|
||||
#include <fstream>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
|
||||
#if HAVE_UMFPACK
|
||||
#include <dune/istl/umfpack.hh>
|
||||
#include <opm/simulators/linalg/bda/cuda_header.hpp>
|
||||
#endif // HAVE_UMFPACK
|
||||
|
||||
#include <opm/simulators/linalg/bda/MultisegmentWellContribution.hpp>
|
||||
@ -63,33 +57,22 @@ namespace Opm
|
||||
z1.resize(Mb * dim_wells);
|
||||
z2.resize(Mb * dim_wells);
|
||||
|
||||
// allocate pinned memory on host
|
||||
cudaMallocHost(&h_x, sizeof(double) * N);
|
||||
cudaMallocHost(&h_y, sizeof(double) * N);
|
||||
|
||||
umfpack_di_symbolic(M, M, Dcols.data(), Drows.data(), Dvals.data(), &UMFPACK_Symbolic, nullptr, nullptr);
|
||||
umfpack_di_numeric(Dcols.data(), Drows.data(), Dvals.data(), UMFPACK_Symbolic, &UMFPACK_Numeric, nullptr, nullptr);
|
||||
}
|
||||
|
||||
MultisegmentWellContribution::~MultisegmentWellContribution()
|
||||
{
|
||||
cudaFreeHost(h_x);
|
||||
cudaFreeHost(h_y);
|
||||
|
||||
umfpack_di_free_symbolic(&UMFPACK_Symbolic);
|
||||
umfpack_di_free_numeric(&UMFPACK_Numeric);
|
||||
}
|
||||
|
||||
|
||||
// Apply the MultisegmentWellContribution, similar to MultisegmentWell::apply()
|
||||
// h_x and h_y reside on host
|
||||
// y -= (C^T * (D^-1 * (B * x)))
|
||||
void MultisegmentWellContribution::apply(double *d_x, double *d_y)
|
||||
void MultisegmentWellContribution::apply(double *h_x, double *h_y)
|
||||
{
|
||||
// copy vectors x and y from GPU to CPU
|
||||
cudaMemcpyAsync(h_x, d_x, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
|
||||
cudaMemcpyAsync(h_y, d_y, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
|
||||
cudaStreamSynchronize(stream);
|
||||
|
||||
// reset z1 and z2
|
||||
std::fill(z1.begin(), z1.end(), 0.0);
|
||||
std::fill(z2.begin(), z2.end(), 0.0);
|
||||
@ -128,10 +111,6 @@ namespace Opm
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// copy vector y from CPU to GPU
|
||||
cudaMemcpyAsync(d_y, h_y, sizeof(double) * N, cudaMemcpyHostToDevice, stream);
|
||||
cudaStreamSynchronize(stream);
|
||||
}
|
||||
|
||||
void MultisegmentWellContribution::setCudaStream(cudaStream_t stream_)
|
||||
|
@ -47,8 +47,7 @@ namespace Opm
|
||||
unsigned int M; // number of rows, M == dim_wells*Mb
|
||||
unsigned int Mb; // number of blockrows in C, D and B
|
||||
|
||||
cudaStream_t stream;
|
||||
double *h_x = nullptr, *h_y = nullptr; // CUDA pinned memory for GPU memcpy
|
||||
cudaStream_t stream; // not actually used yet, will be when MultisegmentWellContribution are applied on GPU
|
||||
|
||||
// C and B are stored in BCRS format, D is stored in CSC format (Dune::UMFPack)
|
||||
// Sparsity pattern for C is not stored, since it is the same as B
|
||||
@ -96,11 +95,11 @@ namespace Opm
|
||||
/// Destroy a MultisegmentWellContribution, and free memory
|
||||
~MultisegmentWellContribution();
|
||||
|
||||
/// Apply the MultisegmentWellContribution on GPU
|
||||
/// Apply the MultisegmentWellContribution on CPU
|
||||
/// performs y -= (C^T * (D^-1 * (B*x))) for MultisegmentWell
|
||||
/// \param[in] d_x vector x, must be on GPU
|
||||
/// \param[inout] d_y vector y, must be on GPU
|
||||
void apply(double *d_x, double *d_y);
|
||||
/// \param[in] h_x vector x, must be on CPU
|
||||
/// \param[inout] h_y vector y, must be on CPU
|
||||
void apply(double *h_x, double *h_y);
|
||||
|
||||
};
|
||||
|
||||
|
@ -127,7 +127,6 @@ namespace Opm
|
||||
|
||||
}
|
||||
|
||||
|
||||
void WellContributions::alloc()
|
||||
{
|
||||
if (num_std_wells > 0) {
|
||||
@ -145,6 +144,12 @@ namespace Opm
|
||||
|
||||
WellContributions::~WellContributions()
|
||||
{
|
||||
// free pinned memory for MultisegmentWellContributions
|
||||
if (h_x) {
|
||||
cudaFreeHost(h_x);
|
||||
cudaFreeHost(h_y);
|
||||
}
|
||||
|
||||
// delete MultisegmentWellContributions
|
||||
for (auto ms : multisegments) {
|
||||
delete ms;
|
||||
@ -169,8 +174,29 @@ namespace Opm
|
||||
void WellContributions::apply(double *d_x, double *d_y)
|
||||
{
|
||||
// apply MultisegmentWells
|
||||
for(MultisegmentWellContribution *well : multisegments){
|
||||
well->apply(d_x, d_y);
|
||||
if (num_ms_wells > 0) {
|
||||
// allocate pinned memory on host if not yet done
|
||||
if (h_x == nullptr) {
|
||||
cudaMallocHost(&h_x, sizeof(double) * N);
|
||||
cudaMallocHost(&h_y, sizeof(double) * N);
|
||||
}
|
||||
|
||||
// make sure the stream is empty to start timing
|
||||
cudaStreamSynchronize(stream);
|
||||
|
||||
// copy vectors x and y from GPU to CPU
|
||||
cudaMemcpyAsync(h_x, d_x, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
|
||||
cudaMemcpyAsync(h_y, d_y, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
|
||||
cudaStreamSynchronize(stream);
|
||||
|
||||
// actually apply MultisegmentWells
|
||||
for (MultisegmentWellContribution *well : multisegments) {
|
||||
well->apply(h_x, h_y);
|
||||
}
|
||||
|
||||
// copy vector y from CPU to GPU
|
||||
cudaMemcpyAsync(d_y, h_y, sizeof(double) * N, cudaMemcpyHostToDevice, stream);
|
||||
cudaStreamSynchronize(stream);
|
||||
}
|
||||
|
||||
// apply StandardWells
|
||||
@ -243,6 +269,7 @@ namespace Opm
|
||||
unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
|
||||
double *Cvalues)
|
||||
{
|
||||
this->N = Nb * dim;
|
||||
MultisegmentWellContribution *well = new MultisegmentWellContribution(dim, dim_wells, Nb, Mb, BnumBlocks, Bvalues, BcolIndices, BrowPointers, DnumBlocks, Dvalues, DcolPointers, DrowIndices, Cvalues);
|
||||
multisegments.emplace_back(well);
|
||||
++num_ms_wells;
|
||||
|
@ -63,6 +63,7 @@ namespace Opm
|
||||
unsigned int num_blocks_so_far = 0; // keep track of where next data is written
|
||||
unsigned int num_std_wells_so_far = 0; // keep track of where next data is written
|
||||
unsigned int *val_pointers = nullptr; // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
|
||||
unsigned int N; // number of rows (not blockrows) in vectors x and y
|
||||
bool allocated = false;
|
||||
std::vector<MultisegmentWellContribution*> multisegments;
|
||||
cudaStream_t stream;
|
||||
@ -77,6 +78,9 @@ namespace Opm
|
||||
double *d_z2 = nullptr;
|
||||
unsigned int *d_val_pointers = nullptr;
|
||||
|
||||
double *h_x = nullptr, *h_y = nullptr; // CUDA pinned memory for GPU memcpy
|
||||
|
||||
|
||||
public:
|
||||
|
||||
/// StandardWell has C, D and B matrices that need to be copied
|
||||
@ -98,9 +102,9 @@ namespace Opm
|
||||
|
||||
/// Apply all Wells in this object
|
||||
/// performs y -= (C^T * (D^-1 * (B*x))) for all Wells
|
||||
/// \param[in] x vector x
|
||||
/// \param[inout] y vector y
|
||||
void apply(double *x, double *y);
|
||||
/// \param[in] d_x vector x, must be on GPU
|
||||
/// \param[inout] d_y vector y, must be on GPU
|
||||
void apply(double *d_x, double *d_y);
|
||||
|
||||
/// Allocate memory for the StandardWells
|
||||
void alloc();
|
||||
|
Loading…
Reference in New Issue
Block a user