Mirror of https://github.com/OPM/opm-simulators.git
Add classes handling correct MPI implementation
Make some changes to Georg's original code: replace dynamically allocated arrays with std::vector. Implement a new class structure that handles which MPI communication implementation to use. Create extra scopes to avoid reuse of the index variable i. Update related tests: update test_cuowneroverlapcopy to account for the new class structure, and remove the line that invalidated the MPI tests for multiple processes.
This commit is contained in: parent eb6f9dc1f9, commit 7235f34f0e
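
As an overview of the new class structure described above, a minimal sketch (names taken from the diff below; bodies abbreviated, so this is not the full implementation):

namespace Opm::cuistl
{
// Abstract interface that hides whether GPU-aware MPI is used.
template <class field_type, class OwnerOverlapCopyCommunicationType>
class GPUSender
{
public:
    using X = CuVector<field_type>;
    // Communicates owner values to all processes; implemented by the derived senders.
    virtual void copyOwnerToAll(const X& source, X& dest) const = 0;
    // project(), dot() and norm() are provided on top of the owner/copy index sets.
};

// Stages the data on the CPU and communicates with regular MPI.
template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
class GPUObliviousMPISender : public GPUSender<field_type, OwnerOverlapCopyCommunicationType> { /* ... */ };

// Passes GPU buffers directly to MPI (requires CUDA-aware MPI).
template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
class GPUAwareMPISender : public GPUSender<field_type, OwnerOverlapCopyCommunicationType> { /* ... */ };

// Facade used by the solver; delegates everything to the chosen sender.
template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
class CuOwnerOverlapCopy { /* holds a std::shared_ptr<GPUSender<...>> */ };
} // namespace Opm::cuistl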
@@ -22,32 +22,46 @@
#include <memory>
#include <mutex>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <vector>

namespace Opm::cuistl
{
/**
* @brief CUDA compatible variant of Dune::OwnerOverlapCopyCommunication
*
* This class can essentially be seen as an adapter around Dune::OwnerOverlapCopyCommunication, and should work as
* a Dune::OwnerOverlapCopyCommunication on CuVectors
*
*
* @note This currently only has the functionality to parallelize the linear solve.
*
* @tparam field_type should be a field_type supported by CuVector (double, float)
* @tparam block_size the block size used (this is relevant for say figuring out the correct indices)
* @tparam OwnerOverlapCopyCommunicationType should mimic Dune::OwnerOverlapCopyCommunication.
* @brief GPUSender is a wrapper class for classes which will implement copyOwnerToAll
* This is implemented with the intention of creating communicators with a generic GPUSender
* to hide whether the implementation uses GPU-aware MPI or not
* @tparam field_type is float or double
* @tparam OwnerOverlapCopyCommunicationType is typically a Dune::LinearOperator::communication_type
*/
template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
class CuOwnerOverlapCopy
{
template<class field_type, class OwnerOverlapCopyCommunicationType>
class GPUSender {
public:
using X = CuVector<field_type>;

CuOwnerOverlapCopy(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy)
: m_cpuOwnerOverlapCopy(cpuOwnerOverlapCopy)
GPUSender(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy) : m_cpuOwnerOverlapCopy(cpuOwnerOverlapCopy){}

/**
* @brief copyOwnerToAll will copy source to the CPU, then call OwnerOverlapCopyCommunicationType::copyOwnerToAll on
* the copied data, and copy the result back to the GPU
* @param[in] source
* @param[out] dest
*/
virtual void copyOwnerToAll(const X& source, X& dest) const = 0;
virtual void initIndexSet() const = 0;

/**
* @brief project will project x to the owned subspace
*
* For each component i which is not owned, x_i will be set to 0
*
* @param[inout] x the vector to project
*/
void project(X& x) const
{
std::call_once(this->m_initializedIndices, [&]() { initIndexSet(); });
x.setZeroAtIndexSet(*m_indicesCopy);
}

/**
* @brief dot will carry out the dot product between x and y on the owned indices, then sum up the result across MPI
* processes.
@@ -74,247 +88,53 @@ public:
auto xDotX = field_type(0);
this->dot(x, x, xDotX);

using std::sqrt;
return sqrt(xDotX);
// using std::sqrt;
return std::sqrt(xDotX);
}

/**
* @brief project will project x to the owned subspace
*
* For each component i which is not owned, x_i will be set to 0
*
* @param[inout] x the vector to project
*/
void project(X& x) const
{
std::call_once(m_initializedIndices, [&]() { initIndexSet(); });
x.setZeroAtIndexSet(*m_indicesCopy);
}

/**
* @brief copyOwnerToAll will copy source to the CPU, then call OwnerOverlapCopyCommunicationType::copyOwnerToAll on
* the copied data, and copy the result back to the GPU
* @param[in] source
* @param[out] dest
*/
void copyOwnerToAll_orig(const X& source, X& dest) const
{
// TODO: [perf] Can we reduce copying from the GPU here?
// TODO: [perf] Maybe create a global buffer instead?
auto sourceAsDuneVector = source.template asDuneBlockVector<block_size>();
auto destAsDuneVector = dest.template asDuneBlockVector<block_size>();
m_cpuOwnerOverlapCopy.copyOwnerToAll(sourceAsDuneVector, destAsDuneVector);
dest.copyFromHost(destAsDuneVector);
}

// Georg's new code intended to use GPU direct
void copyOwnerToAll(const X& source, X& dest) const
{

printf("\n\nGPU DIRECT CODE IS RUN\n\n");
printf("Compile time check:\n");
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
printf("This MPI library has CUDA-aware support.\n", MPIX_CUDA_AWARE_SUPPORT);
#elif defined(MPIX_CUDA_AWARE_SUPPORT) && !MPIX_CUDA_AWARE_SUPPORT
printf("This MPI library does not have CUDA-aware support.\n");
#else
printf("This MPI library cannot determine if there is CUDA-aware support.\n");
#endif /* MPIX_CUDA_AWARE_SUPPORT */

printf("Run time check:\n");
#if defined(MPIX_CUDA_AWARE_SUPPORT)
if (1 == MPIX_Query_cuda_support()) {
printf("This MPI library has CUDA-aware support.\n");
} else {
printf("This MPI library does not have CUDA-aware support.\n");
}
#else /* !defined(MPIX_CUDA_AWARE_SUPPORT) */
printf("This MPI library cannot determine if there is CUDA-aware support.\n");
#endif /* MPIX_CUDA_AWARE_SUPPORT */

assert(&source == &dest); // In this context, source == dest!!!
std::call_once(m_initializedIndices, [&]() { initIndexSet(); });

int rank;
MPI_Comm_rank(m_cpuOwnerOverlapCopy.communicator(), &rank);
dest.prepareSendBuf(*m_GPUSendBuf, *m_commpair_indicesOwner);

// Start MPI stuff here...
// Note: This has been taken from DUNE's parallel/communicator.hh
MPI_Request* sendRequests = new MPI_Request[messageInformation_.size()];
MPI_Request* recvRequests = new MPI_Request[messageInformation_.size()];
size_t numberOfRealRecvRequests = 0;

typedef typename InformationMap::const_iterator const_iterator;
const const_iterator end = messageInformation_.end();
size_t i=0;
int* processMap = new int[messageInformation_.size()];
for(const_iterator info = messageInformation_.begin(); info != end; ++info, ++i) {
processMap[i]=info->first;
if(info->second.second.size_) {
MPI_Irecv(m_GPURecvBuf->data()+info->second.second.start_,
info->second.second.size_,
MPI_BYTE,
info->first,
commTag_,
m_cpuOwnerOverlapCopy.communicator(),
&recvRequests[i]);
numberOfRealRecvRequests += 1;
} else {
recvRequests[i]=MPI_REQUEST_NULL;
}
}

i=0;
for(const_iterator info = messageInformation_.begin(); info != end; ++info, ++i) {
if(info->second.first.size_) {
MPI_Issend(m_GPUSendBuf->data()+info->second.first.start_,
info->second.first.size_,
MPI_BYTE,
info->first,
commTag_,
m_cpuOwnerOverlapCopy.communicator(),
&sendRequests[i]);
} else {
sendRequests[i]=MPI_REQUEST_NULL;
}
}
i=0;
int finished = MPI_UNDEFINED;
MPI_Status status;
for(i=0; i< numberOfRealRecvRequests; i++) {
status.MPI_ERROR=MPI_SUCCESS;
MPI_Waitany(messageInformation_.size(), recvRequests, &finished, &status);

if(status.MPI_ERROR!=MPI_SUCCESS) {
std::cerr<< rank << ": MPI_Error occurred while receiving message from "<< processMap[finished] << std::endl;
assert(false);
}
}
MPI_Status recvStatus;
for(i=0; i< messageInformation_.size(); i++) {
if(MPI_SUCCESS!=MPI_Wait(&sendRequests[i], &recvStatus)) {
std::cerr << rank << ": MPI_Error occurred while sending message to " << processMap[finished] << std::endl;
assert(false);
}
}
delete[] processMap;
delete[] sendRequests;
delete[] recvRequests;
// ...End of MPI stuff

dest.syncFromRecvBuf(*m_GPURecvBuf, *m_commpair_indicesCopy);
}

private:
const OwnerOverlapCopyCommunicationType& m_cpuOwnerOverlapCopy;

protected:
// Used to call the initIndexSet. Note that this is kind of a
// premature optimization, in the sense that we could just initialize these indices
// always, but they are not always used.
mutable std::once_flag m_initializedIndices;
mutable std::unique_ptr<CuVector<int>> m_indicesCopy;
mutable std::unique_ptr<CuVector<int>> m_indicesOwner;

mutable std::unique_ptr<CuVector<int>> m_commpair_indicesCopy;
mutable std::unique_ptr<CuVector<int>> m_commpair_indicesOwner;
mutable std::unique_ptr<CuVector<field_type>> m_GPUSendBuf;
mutable std::unique_ptr<CuVector<field_type>> m_GPURecvBuf;

struct MessageInformation
{
MessageInformation() : start_(0), size_(0) {}
MessageInformation(size_t start, size_t size) : start_(start), size_(size) {}
size_t start_; // offset in elements of "field_type"
size_t size_; // size in bytes
mutable std::unique_ptr<CuVector<int>> m_indicesCopy;
const OwnerOverlapCopyCommunicationType& m_cpuOwnerOverlapCopy;
};

typedef std::map<int,std::pair<MessageInformation,MessageInformation> > InformationMap;
mutable InformationMap messageInformation_;
typedef std::map<int,std::pair<std::vector<int>,std::vector<int> > > IM;
mutable IM m_im;

constexpr static int commTag_ = 0; // So says DUNE

void buildCommPairIdxs() const
/**
* @brief Derived class of GPUSender that handles MPI calls that should NOT use GPU direct communication
* The implementation moves data from the GPU to the CPU and then sends it using regular MPI
* @tparam field_type is float or double
* @tparam block_size is the block size of the block elements in the matrix
* @tparam OwnerOverlapCopyCommunicationType is typically a Dune::LinearOperator::communication_type
*/
template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
class GPUObliviousMPISender : public GPUSender<field_type, OwnerOverlapCopyCommunicationType>
{
int rank;
MPI_Comm_rank(m_cpuOwnerOverlapCopy.communicator(), &rank);
auto &ri = m_cpuOwnerOverlapCopy.remoteIndices();
auto end = ri.end();
std::vector<int> commpair_indicesCopyOnCPU;
std::vector<int> commpair_indicesOwnerCPU;
public:
using X = CuVector<field_type>;

for(auto process = ri.begin(); process != end; ++process) {
int size = 0;
m_im[process->first] = std::pair(std::vector<int>(), std::vector<int>());
for(int send = 0; send < 2; ++send) {
auto remoteEnd = send ? process->second.first->end()
: process->second.second->end();
auto remote = send ? process->second.first->begin()
: process->second.second->begin();

while(remote != remoteEnd) {
if (send ? (remote->localIndexPair().local().attribute() == 1)
: (remote->attribute() == 1)) {
++size;
if (send) {
m_im[process->first].first.push_back(remote->localIndexPair().local().local());
} else {
m_im[process->first].second.push_back(remote->localIndexPair().local().local());
}
}
++remote;
}
}
GPUObliviousMPISender(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy)
: GPUSender<field_type, OwnerOverlapCopyCommunicationType>(cpuOwnerOverlapCopy)
{
}

int sendBufIdx = 0;
int recvBufIdx = 0;
for (auto it = m_im.begin(); it != m_im.end(); it++) {
int noSend = it->second.first.size();
int noRecv = it->second.second.size();

if (noSend + noRecv > 0) {
messageInformation_.insert(
std::make_pair(it->first,
std::make_pair(MessageInformation(
sendBufIdx * block_size,
noSend * block_size * sizeof(field_type)),
MessageInformation(
recvBufIdx * block_size,
noRecv * block_size * sizeof(field_type)))));

for(int x = 0; x < noSend; x++) {
for(int bs = 0; bs < block_size; bs++) {
commpair_indicesOwnerCPU.push_back(it->second.first[x] * block_size + bs);
}
}
for(int x = 0; x < noRecv; x++) {
for(int bs = 0; bs < block_size; bs++) {
commpair_indicesCopyOnCPU.push_back(it->second.second[x] * block_size + bs);
}
}
sendBufIdx += noSend;
recvBufIdx += noRecv;
}
void copyOwnerToAll(const X& source, X& dest) const override {
// TODO: [perf] Can we reduce copying from the GPU here?
// TODO: [perf] Maybe create a global buffer instead?
auto sourceAsDuneVector = source.template asDuneBlockVector<block_size>();
auto destAsDuneVector = dest.template asDuneBlockVector<block_size>();
this->m_cpuOwnerOverlapCopy.copyOwnerToAll(sourceAsDuneVector, destAsDuneVector);
dest.copyFromHost(destAsDuneVector);
}

m_commpair_indicesCopy = std::make_unique<CuVector<int>>(commpair_indicesCopyOnCPU);
m_commpair_indicesOwner = std::make_unique<CuVector<int>>(commpair_indicesOwnerCPU);

m_GPUSendBuf = std::make_unique<CuVector<field_type>>(sendBufIdx * block_size);
m_GPURecvBuf = std::make_unique<CuVector<field_type>>(recvBufIdx * block_size);
}

void initIndexSet() const
private:
void initIndexSet() const override
{
// We need indices that we will use in the project, dot and norm calls.
// TODO: [premature perf] Can this be run once per instance? Or do we need to rebuild every time?
const auto& pis = m_cpuOwnerOverlapCopy.indexSet();
const auto& pis = this->m_cpuOwnerOverlapCopy.indexSet();
std::vector<int> indicesCopyOnCPU;
std::vector<int> indicesOwnerCPU;
for (const auto& index : pis) {
@@ -331,11 +151,265 @@ private:
}
}

m_indicesCopy = std::make_unique<CuVector<int>>(indicesCopyOnCPU);
m_indicesOwner = std::make_unique<CuVector<int>>(indicesOwnerCPU);
this->m_indicesCopy = std::make_unique<CuVector<int>>(indicesCopyOnCPU);
this->m_indicesOwner = std::make_unique<CuVector<int>>(indicesOwnerCPU);
}
};

/**
* @brief Derived class of GPUSender that handles MPI calls made with CUDA-aware MPI
* The copyOwnerToAll function uses MPI calls referring to data that resides on the GPU in order
* to send it directly to other GPUs, skipping the staging step on the CPU
* @tparam field_type is float or double
* @tparam block_size is the block size of the block elements in the matrix
* @tparam OwnerOverlapCopyCommunicationType is typically a Dune::LinearOperator::communication_type
*/
template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
class GPUAwareMPISender : public GPUSender<field_type, OwnerOverlapCopyCommunicationType>
{
public:
using X = CuVector<field_type>;

GPUAwareMPISender(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy)
: GPUSender<field_type, OwnerOverlapCopyCommunicationType>(cpuOwnerOverlapCopy)
{
}

void copyOwnerToAll(const X& source, X& dest) const override
{

OPM_ERROR_IF(&source != &dest, "The provided CuVectors' address did not match"); // In this context, source == dest!!!
std::call_once(this->m_initializedIndices, [&]() { initIndexSet(); });

int rank = this->m_cpuOwnerOverlapCopy.communicator().rank();
dest.prepareSendBuf(*m_GPUSendBuf, *m_commpairIndicesOwner);

// Start MPI stuff here...
// Note: This has been taken from DUNE's parallel/communicator.hh
std::vector<MPI_Request> sendRequests(m_messageInformation.size());
std::vector<MPI_Request> recvRequests(m_messageInformation.size());
std::vector<int> processMap(m_messageInformation.size());
size_t numberOfRealRecvRequests = 0;

using const_iterator = typename InformationMap::const_iterator;
const const_iterator end = m_messageInformation.end();

{
size_t i = 0;
for(const_iterator info = m_messageInformation.begin(); info != end; ++info, ++i) {
processMap[i]=info->first;
if(info->second.second.size_) {
MPI_Irecv(m_GPURecvBuf->data()+info->second.second.start_,
info->second.second.size_,
MPI_BYTE,
info->first,
m_commTag,
this->m_cpuOwnerOverlapCopy.communicator(),
&recvRequests[i]);
numberOfRealRecvRequests += 1;
} else {
recvRequests[i]=MPI_REQUEST_NULL;
}
}
}

{
size_t i = 0;
for(const_iterator info = m_messageInformation.begin(); info != end; ++info, ++i) {
if(info->second.first.size_) {
MPI_Issend(m_GPUSendBuf->data()+info->second.first.start_,
info->second.first.size_,
MPI_BYTE,
info->first,
m_commTag,
this->m_cpuOwnerOverlapCopy.communicator(),
&sendRequests[i]);
} else {
sendRequests[i]=MPI_REQUEST_NULL;
}
}
}
int finished = MPI_UNDEFINED;
MPI_Status status;
for(size_t i = 0; i < numberOfRealRecvRequests; i++) {
status.MPI_ERROR=MPI_SUCCESS;
MPI_Waitany(m_messageInformation.size(), recvRequests.data(), &finished, &status);

if(status.MPI_ERROR!=MPI_SUCCESS) {
std::cerr<< rank << ": MPI_Error occurred while receiving message from "<< processMap[finished] << std::endl;
OPM_THROW(std::runtime_error, "MPI_Error while receiving message");
}
}
MPI_Status recvStatus;
for(size_t i = 0; i < m_messageInformation.size(); i++) {
if(MPI_SUCCESS!=MPI_Wait(&sendRequests[i], &recvStatus)) {
std::cerr << rank << ": MPI_Error occurred while sending message to " << processMap[finished] << std::endl;
OPM_THROW(std::runtime_error, "MPI_Error while sending message");
}
}
// ...End of MPI stuff

dest.syncFromRecvBuf(*m_GPURecvBuf, *m_commpairIndicesCopy);
}

private:
mutable std::unique_ptr<CuVector<int>> m_commpairIndicesCopy;
mutable std::unique_ptr<CuVector<int>> m_commpairIndicesOwner;
mutable std::unique_ptr<CuVector<field_type>> m_GPUSendBuf;
mutable std::unique_ptr<CuVector<field_type>> m_GPURecvBuf;

struct MessageInformation
{
MessageInformation() : start_(0), size_(0) {}
MessageInformation(size_t start, size_t size) : start_(start), size_(size) {}
size_t start_; // offset in elements of "field_type"
size_t size_; // size in bytes
};

using InformationMap = std::map<int,std::pair<MessageInformation,MessageInformation> >;
mutable InformationMap m_messageInformation;
using IM = std::map<int,std::pair<std::vector<int>,std::vector<int> > >;
mutable IM m_im;

constexpr static int m_commTag = 0; // So says DUNE

void buildCommPairIdxs() const
{
auto &ri = this->m_cpuOwnerOverlapCopy.remoteIndices();
std::vector<int> commpairIndicesCopyOnCPU;
std::vector<int> commpairIndicesOwnerCPU;

for(auto process : ri) {
int size = 0;
m_im[process.first] = std::pair(std::vector<int>(), std::vector<int>());
for(int send = 0; send < 2; ++send) {
auto remoteEnd = send ? process.second.first->end()
: process.second.second->end();
auto remote = send ? process.second.first->begin()
: process.second.second->begin();

while(remote != remoteEnd) {
if (send ? (remote->localIndexPair().local().attribute() == 1)
: (remote->attribute() == 1)) {
++size;
if (send) {
m_im[process.first].first.push_back(remote->localIndexPair().local().local());
} else {
m_im[process.first].second.push_back(remote->localIndexPair().local().local());
}
}
++remote;
}
}
}

int sendBufIdx = 0;
int recvBufIdx = 0;
for (auto it = m_im.begin(); it != m_im.end(); it++) {
int noSend = it->second.first.size();
int noRecv = it->second.second.size();

if (noSend + noRecv > 0) {
m_messageInformation.insert(
std::make_pair(it->first,
std::make_pair(MessageInformation(
sendBufIdx * block_size,
noSend * block_size * sizeof(field_type)),
MessageInformation(
recvBufIdx * block_size,
noRecv * block_size * sizeof(field_type)))));

for(int x = 0; x < noSend; x++) {
for(int bs = 0; bs < block_size; bs++) {
commpairIndicesOwnerCPU.push_back(it->second.first[x] * block_size + bs);
}
}
for(int x = 0; x < noRecv; x++) {
for(int bs = 0; bs < block_size; bs++) {
commpairIndicesCopyOnCPU.push_back(it->second.second[x] * block_size + bs);
}
}
sendBufIdx += noSend;
recvBufIdx += noRecv;
}
}

m_commpairIndicesCopy = std::make_unique<CuVector<int>>(commpairIndicesCopyOnCPU);
m_commpairIndicesOwner = std::make_unique<CuVector<int>>(commpairIndicesOwnerCPU);

m_GPUSendBuf = std::make_unique<CuVector<field_type>>(sendBufIdx * block_size);
m_GPURecvBuf = std::make_unique<CuVector<field_type>>(recvBufIdx * block_size);
}

void initIndexSet() const override
{
// We need indices that we will use in the project, dot and norm calls.
// TODO: [premature perf] Can this be run once per instance? Or do we need to rebuild every time?
const auto& pis = this->m_cpuOwnerOverlapCopy.indexSet();
std::vector<int> indicesCopyOnCPU;
std::vector<int> indicesOwnerCPU;
for (const auto& index : pis) {
if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::copy) {
for (int component = 0; component < block_size; ++component) {
indicesCopyOnCPU.push_back(index.local().local() * block_size + component);
}
}

if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::owner) {
for (int component = 0; component < block_size; ++component) {
indicesOwnerCPU.push_back(index.local().local() * block_size + component);
}
}
}

this->m_indicesCopy = std::make_unique<CuVector<int>>(indicesCopyOnCPU);
this->m_indicesOwner = std::make_unique<CuVector<int>>(indicesOwnerCPU);

buildCommPairIdxs();
}
};

/**
* @brief CUDA compatible variant of Dune::OwnerOverlapCopyCommunication
*
* This class can essentially be seen as an adapter around Dune::OwnerOverlapCopyCommunication, and should work as
* a Dune::OwnerOverlapCopyCommunication on CuVectors
*
* @note This currently only has the functionality to parallelize the linear solve.
*
* @tparam field_type should be a field_type supported by CuVector (double, float)
* @tparam block_size the block size used (this is relevant for say figuring out the correct indices)
* @tparam OwnerOverlapCopyCommunicationType should mimic Dune::OwnerOverlapCopyCommunication.
*/
template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
class CuOwnerOverlapCopy
{
public:
using X = CuVector<field_type>;

CuOwnerOverlapCopy(std::shared_ptr<GPUSender<field_type, OwnerOverlapCopyCommunicationType>> sender) : m_sender(sender){}

void copyOwnerToAll(const X& source, X& dest) const {
m_sender->copyOwnerToAll(source, dest);
}

void dot(const X& x, const X& y, field_type& output) const
{
m_sender->dot(x, y, output);
}

field_type norm(const X& x) const
{
return m_sender->norm(x);
}

void project(X& x) const
{
m_sender->project(x);
}

private:
std::shared_ptr<GPUSender<field_type, OwnerOverlapCopyCommunicationType>> m_sender;
};
} // namespace Opm::cuistl
#endif

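Taken together, the header above is used roughly as follows; a minimal sketch, assuming a fully built Dune::OwnerOverlapCopyCommunication<int> named cpuComm and a CuVector whose layout matches its index set (cpuComm, x and localSize are illustrative names, not from the diff):

using Comm = Dune::OwnerOverlapCopyCommunication<int>;
// Pick a sender; GPUObliviousMPISender works with any MPI library.
auto sender = std::make_shared<Opm::cuistl::GPUObliviousMPISender<double, 1, Comm>>(cpuComm);
auto cuComm = Opm::cuistl::CuOwnerOverlapCopy<double, 1, Comm>(sender);

Opm::cuistl::CuVector<double> x(localSize);
cuComm.copyOwnerToAll(x, x); // exchange overlap values
cuComm.project(x);           // zero out non-owned entries
double xDotX = 0.0;
cuComm.dot(x, x, xDotX);     // owner-only dot product, summed over all ranks
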
@@ -33,8 +33,6 @@
#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
#include <opm/simulators/linalg/cuistl/detail/has_function.hpp>


namespace Opm::cuistl
{
//! @brief Wraps a CUDA solver to work with CPU data.
@@ -163,10 +161,31 @@ private:
auto preconditionerReallyOnGPU = preconditionerAdapterAsHolder->getUnderlyingPreconditioner();
const auto& communication = m_opOnCPUWithMatrix.getCommunication();

// Temporary solution: use GPU direct communication based solely on these preprocessor statements
bool mpiMightBeSupportedDuringCompilation = true;
bool mpiMightBeSupportedDuringRuntime = true;

#if defined(MPIX_CUDA_AWARE_SUPPORT) && !MPIX_CUDA_AWARE_SUPPORT
mpiMightBeSupportedDuringCompilation = false;
#endif /* MPIX_CUDA_AWARE_SUPPORT */

#if defined(MPIX_CUDA_AWARE_SUPPORT) && !MPIX_Query_cuda_support
mpiMightBeSupportedDuringRuntime = false;
#endif /* MPIX_CUDA_AWARE_SUPPORT */

// TODO add typename Operator communication type as a named type with using
std::shared_ptr<Opm::cuistl::GPUSender<real_type, typename Operator::communication_type>> gpuComm;
if (mpiMightBeSupportedDuringCompilation && mpiMightBeSupportedDuringRuntime){
gpuComm = std::make_shared<Opm::cuistl::GPUAwareMPISender<real_type, block_size, typename Operator::communication_type>>(communication);
}
else{
gpuComm = std::make_shared<Opm::cuistl::GPUObliviousMPISender<real_type, block_size, typename Operator::communication_type>>(communication);
}

using CudaCommunication = CuOwnerOverlapCopy<real_type, block_size, typename Operator::communication_type>;
using SchwarzOperator
= Dune::OverlappingSchwarzOperator<CuSparseMatrix<real_type>, XGPU, XGPU, CudaCommunication>;
auto cudaCommunication = std::make_shared<CudaCommunication>(communication);
auto cudaCommunication = std::make_shared<CudaCommunication>(gpuComm);

auto mpiPreconditioner = std::make_shared<CuBlockPreconditioner<XGPU, XGPU, CudaCommunication>>(
preconditionerReallyOnGPU, cudaCommunication);

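Note that in the hunk above the run-time branch tests MPIX_Query_cuda_support inside an #if, where the preprocessor replaces the plain identifier with 0 instead of calling the function. An actual run-time query would look roughly like the sketch below; MPIX_CUDA_AWARE_SUPPORT and MPIX_Query_cuda_support() are Open MPI extensions declared in <mpi-ext.h>, and other MPI implementations may not provide them:

#include <mpi.h>
#if defined(OPEN_MPI) && OPEN_MPI
#include <mpi-ext.h> // declares MPIX_Query_cuda_support() and MPIX_CUDA_AWARE_SUPPORT
#endif

// Returns true only if the MPI library reports CUDA awareness at run time.
static bool mpiIsCudaAwareAtRuntime()
{
#if defined(MPIX_CUDA_AWARE_SUPPORT)
    return 1 == MPIX_Query_cuda_support();
#else
    return false; // cannot tell, so fall back to the GPU-oblivious path
#endif
}
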
@@ -29,7 +29,9 @@
#include <opm/simulators/linalg/cuistl/CuOwnerOverlapCopy.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
#include <opm/simulators/linalg/cuistl/set_device.hpp>
#include <random>
#include <mpi.h>

bool
init_unit_test_func()
@@ -41,6 +43,10 @@ int
main(int argc, char** argv)
{
[[maybe_unused]] const auto& helper = Dune::MPIHelper::instance(argc, argv);
int rank, totalRanks;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &totalRanks);
Opm::cuistl::setDevice(rank, totalRanks);
boost::unit_test::unit_test_main(&init_unit_test_func, argc, argv);
}

@@ -58,8 +64,10 @@ BOOST_AUTO_TEST_CASE(TestProject)
auto xCPU = std::vector<double> {{1.0, 2.0, 3.0}};
auto xGPU = Opm::cuistl::CuVector<double>(xCPU);

auto gpuComm = std::make_shared<Opm::cuistl::GPUObliviousMPISender<double, 1, Dune::OwnerOverlapCopyCommunication<int>>>(ownerOverlapCopy);

auto cuOwnerOverlapCopy
= Opm::cuistl::CuOwnerOverlapCopy<double, 1, Dune::OwnerOverlapCopyCommunication<int>>(ownerOverlapCopy);
= Opm::cuistl::CuOwnerOverlapCopy<double, 1, Dune::OwnerOverlapCopyCommunication<int>>(gpuComm);

cuOwnerOverlapCopy.project(xGPU);

@@ -88,8 +96,10 @@ BOOST_AUTO_TEST_CASE(TestDot)
auto xCPU = std::vector<double> {{1.0, 2.0, 3.0}};
auto xGPU = Opm::cuistl::CuVector<double>(xCPU);

auto gpuComm = std::make_shared<Opm::cuistl::GPUObliviousMPISender<double, 1, Dune::OwnerOverlapCopyCommunication<int>>>(ownerOverlapCopy);

auto cuOwnerOverlapCopy
= Opm::cuistl::CuOwnerOverlapCopy<double, 1, Dune::OwnerOverlapCopyCommunication<int>>(ownerOverlapCopy);
= Opm::cuistl::CuOwnerOverlapCopy<double, 1, Dune::OwnerOverlapCopyCommunication<int>>(gpuComm);

double outputDune = -1.0;
auto xDune = xGPU.asDuneBlockVector<1>();
@@ -100,5 +110,4 @@ BOOST_AUTO_TEST_CASE(TestDot)


BOOST_CHECK_EQUAL(outputDune, output);
BOOST_CHECK_EQUAL(4.0, output);
}
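Since both senders implement the same GPUSender interface, the updated tests could exercise either path by swapping only the sender construction; a sketch, reusing the ownerOverlapCopy object set up in the tests above (the useGpuAwareMpi flag is hypothetical, for example the result of a run-time query as sketched earlier):

using Comm = Dune::OwnerOverlapCopyCommunication<int>;
std::shared_ptr<Opm::cuistl::GPUSender<double, Comm>> gpuComm;
if (useGpuAwareMpi) {
    // Requires an MPI library with CUDA-aware support.
    gpuComm = std::make_shared<Opm::cuistl::GPUAwareMPISender<double, 1, Comm>>(ownerOverlapCopy);
} else {
    gpuComm = std::make_shared<Opm::cuistl::GPUObliviousMPISender<double, 1, Comm>>(ownerOverlapCopy);
}
auto cuOwnerOverlapCopy = Opm::cuistl::CuOwnerOverlapCopy<double, 1, Comm>(gpuComm);
// The remaining assertions (project/dot checks) stay unchanged.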