diff --git a/opm/simulators/linalg/cuistl/CuOwnerOverlapCopy.hpp b/opm/simulators/linalg/cuistl/CuOwnerOverlapCopy.hpp
index 36b41dcac..e6853f297 100644
--- a/opm/simulators/linalg/cuistl/CuOwnerOverlapCopy.hpp
+++ b/opm/simulators/linalg/cuistl/CuOwnerOverlapCopy.hpp
@@ -22,32 +22,46 @@
 #include
 #include
 #include
+#include
 namespace Opm::cuistl
 {
 /**
- * @brief CUDA compatiable variant of Dune::OwnerOverlapCopyCommunication
- *
- * This class can essentially be seen as an adapter around Dune::OwnerOverlapCopyCommunication, and should work as
- * a Dune::OwnerOverlapCopyCommunication on CuVectors
- *
- *
- * @note This currently only has the functionality to parallelize the linear solve.
- *
- * @tparam field_type should be a field_type supported by CuVector (double, float)
- * @tparam block_size the block size used (this is relevant for say figuring out the correct indices)
- * @tparam OwnerOverlapCopyCommunicationType should mimic Dune::OwnerOverlapCopyCommunication.
- */
-template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
-class CuOwnerOverlapCopy
-{
+ * @brief GPUSender is a wrapper class for classes which will implement copyOwnerToAll.
+ * It is intended for creating communicators from a generic GPUSender, hiding whether
+ * the implementation uses GPU-aware MPI or not.
+ * @tparam field_type is float or double
+ * @tparam OwnerOverlapCopyCommunicationType is typically a Dune::LinearOperator::communication_type
+ */
+template <class field_type, class OwnerOverlapCopyCommunicationType>
+class GPUSender
+{
 public:
     using X = CuVector<field_type>;
 
-    CuOwnerOverlapCopy(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy)
-        : m_cpuOwnerOverlapCopy(cpuOwnerOverlapCopy)
+    GPUSender(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy) : m_cpuOwnerOverlapCopy(cpuOwnerOverlapCopy){}
+
+    /**
+     * @brief copyOwnerToAll will copy source to the CPU, then call OwnerOverlapCopyCommunicationType::copyOwnerToAll on
+     * the copied data, and copy the result back to the GPU
+     * @param[in] source
+     * @param[out] dest
+     */
+    virtual void copyOwnerToAll(const X& source, X& dest) const = 0;
+    virtual void initIndexSet() const = 0;
+
+    /**
+     * @brief project will project x to the owned subspace
+     *
+     * For each component i which is not owned, x_i will be set to 0
+     *
+     * @param[inout] x the vector to project
+     */
+    void project(X& x) const
     {
+        std::call_once(this->m_initializedIndices, [&]() { initIndexSet(); });
+        x.setZeroAtIndexSet(*m_indicesCopy);
     }
+
     /**
      * @brief dot will carry out the dot product between x and y on the owned indices, then sum up the result across MPI
      * processes.
@@ -62,7 +76,7 @@ public:
         const auto dotAtRank = x.dot(y, *m_indicesOwner);
         output = m_cpuOwnerOverlapCopy.communicator().sum(dotAtRank);
     }
-
+
     /**
      * @brief norm computes the l^2-norm of x across processes.
     *
@@ -74,247 +88,53 @@ public:
         auto xDotX = field_type(0);
         this->dot(x, x, xDotX);
-        using std::sqrt;
-        return sqrt(xDotX);
+        // using std::sqrt;
+        return std::sqrt(xDotX);
     }
 
-    /**
-     * @brief project will project x to the owned subspace
-     *
-     * For each component i which is not owned, x_i will be set to 0
-     *
-     * @param[inout] x the vector to project
-     */
-    void project(X& x) const
-    {
-        std::call_once(m_initializedIndices, [&]() { initIndexSet(); });
-        x.setZeroAtIndexSet(*m_indicesCopy);
-    }
-
-    /**
-     * @brief copyOwnerToAll will copy source to the CPU, then call OwnerOverlapCopyCommunicationType::copyOwnerToAll on
-     * the copied data, and copy the result back to the GPU
-     * @param[in] source
-     * @param[out] dest
-     */
-    void copyOwnerToAll_orig(const X& source, X& dest) const
-    {
-        // TODO: [perf] Can we reduce copying from the GPU here?
-        // TODO: [perf] Maybe create a global buffer instead?
-        auto sourceAsDuneVector = source.template asDuneBlockVector<block_size>();
-        auto destAsDuneVector = dest.template asDuneBlockVector<block_size>();
-        m_cpuOwnerOverlapCopy.copyOwnerToAll(sourceAsDuneVector, destAsDuneVector);
-        dest.copyFromHost(destAsDuneVector);
-    }
-
-    // Georgs new code intended to use GPU direct
-    void copyOwnerToAll(const X& source, X& dest) const
-    {
-
-        printf("\n\nGPU DIRECT CODE IS RUN\n\n");
-        printf("Compile time check:\n");
-#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
-        printf("This MPI library has CUDA-aware support.\n", MPIX_CUDA_AWARE_SUPPORT);
-#elif defined(MPIX_CUDA_AWARE_SUPPORT) && !MPIX_CUDA_AWARE_SUPPORT
-        printf("This MPI library does not have CUDA-aware support.\n");
-#else
-        printf("This MPI library cannot determine if there is CUDA-aware support.\n");
-#endif /* MPIX_CUDA_AWARE_SUPPORT */
-
-        printf("Run time check:\n");
-#if defined(MPIX_CUDA_AWARE_SUPPORT)
-        if (1 == MPIX_Query_cuda_support()) {
-            printf("This MPI library has CUDA-aware support.\n");
-        } else {
-            printf("This MPI library does not have CUDA-aware support.\n");
-        }
-#else /* !defined(MPIX_CUDA_AWARE_SUPPORT) */
-        printf("This MPI library cannot determine if there is CUDA-aware support.\n");
-#endif /* MPIX_CUDA_AWARE_SUPPORT */
-
-
-        assert(&source == &dest); // In this context, source == dest!!!
-        std::call_once(m_initializedIndices, [&]() { initIndexSet(); });
-
-        int rank;
-        MPI_Comm_rank(m_cpuOwnerOverlapCopy.communicator(), &rank);
-        dest.prepareSendBuf(*m_GPUSendBuf, *m_commpair_indicesOwner);
-
-        // Start MPI stuff here...
-        // Note: This has been taken from DUNE's parallel/communicator.hh
-        MPI_Request* sendRequests = new MPI_Request[messageInformation_.size()];
-        MPI_Request* recvRequests = new MPI_Request[messageInformation_.size()];
-        size_t numberOfRealRecvRequests = 0;
-
-        typedef typename InformationMap::const_iterator const_iterator;
-        const const_iterator end = messageInformation_.end();
-        size_t i=0;
-        int* processMap = new int[messageInformation_.size()];
-        for(const_iterator info = messageInformation_.begin(); info != end; ++info, ++i) {
-            processMap[i]=info->first;
-            if(info->second.second.size_) {
-                MPI_Irecv(m_GPURecvBuf->data()+info->second.second.start_,
-                          info->second.second.size_,
-                          MPI_BYTE,
-                          info->first,
-                          commTag_,
-                          m_cpuOwnerOverlapCopy.communicator(),
-                          &recvRequests[i]);
-                numberOfRealRecvRequests += 1;
-            } else {
-                recvRequests[i]=MPI_REQUEST_NULL;
-            }
-        }
-
-        i=0;
-        for(const_iterator info = messageInformation_.begin(); info != end; ++info, ++i) {
-            if(info->second.first.size_) {
-                MPI_Issend(m_GPUSendBuf->data()+info->second.first.start_,
-                           info->second.first.size_,
-                           MPI_BYTE,
-                           info->first,
-                           commTag_,
-                           m_cpuOwnerOverlapCopy.communicator(),
-                           &sendRequests[i]);
-            } else {
-                sendRequests[i]=MPI_REQUEST_NULL;
-            }
-        }
-        i=0;
-        int finished = MPI_UNDEFINED;
-        MPI_Status status;
-        for(i=0; i< numberOfRealRecvRequests; i++) {
-            status.MPI_ERROR=MPI_SUCCESS;
-            MPI_Waitany(messageInformation_.size(), recvRequests, &finished, &status);
-
-            if(status.MPI_ERROR!=MPI_SUCCESS) {
-                std::cerr<< rank << ": MPI_Error occurred while receiving message from "<< processMap[finished] << std::endl;
-                assert(false);
-            }
-        }
-        MPI_Status recvStatus;
-        for(i=0; i< messageInformation_.size(); i++) {
-            if(MPI_SUCCESS!=MPI_Wait(&sendRequests[i], &recvStatus)) {
-                std::cerr << rank << ": MPI_Error occurred while sending message to " << processMap[finished] << std::endl;
-                assert(false);
-            }
-        }
-        delete[] processMap;
-        delete[] sendRequests;
-        delete[] recvRequests;
-        // ...End of MPI stuff
-
-        dest.syncFromRecvBuf(*m_GPURecvBuf, *m_commpair_indicesCopy);
-    }
-
-
-
-private:
-    const OwnerOverlapCopyCommunicationType& m_cpuOwnerOverlapCopy;
-
+protected:
     // Used to call the initIndexSet. Note that this is kind of a
     // premature optimization, in the sense that we could just initialize these indices
     // always, but they are not always used.
     mutable std::once_flag m_initializedIndices;
-    mutable std::unique_ptr<CuVector<int>> m_indicesCopy;
     mutable std::unique_ptr<CuVector<int>> m_indicesOwner;
+    mutable std::unique_ptr<CuVector<int>> m_indicesCopy;
+    const OwnerOverlapCopyCommunicationType& m_cpuOwnerOverlapCopy;
+};
 
-    mutable std::unique_ptr<CuVector<int>> m_commpair_indicesCopy;
-    mutable std::unique_ptr<CuVector<int>> m_commpair_indicesOwner;
-    mutable std::unique_ptr<CuVector<field_type>> m_GPUSendBuf;
-    mutable std::unique_ptr<CuVector<field_type>> m_GPURecvBuf;
+/**
+ * @brief Derived class of GPUSender that handles MPI calls that should NOT use GPU-direct communication.
+ * The implementation moves data from the GPU to the CPU and then sends it using regular MPI.
+ * @tparam field_type is float or double
+ * @tparam block_size is the block size of the block elements in the matrix
+ * @tparam OwnerOverlapCopyCommunicationType is typically a Dune::LinearOperator::communication_type
+ */
+template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
+class GPUObliviousMPISender : public GPUSender<field_type, OwnerOverlapCopyCommunicationType>
+{
+public:
+    using X = CuVector<field_type>;
 
-    struct MessageInformation
-    {
-        MessageInformation() : start_(0), size_(0) {}
-        MessageInformation(size_t start, size_t size) : start_(start), size_(size) {}
-        size_t start_; // offset in elements of "field_type"
-        size_t size_; // size in bytes
-    };
-
-    typedef std::map<int, std::pair<MessageInformation, MessageInformation>> InformationMap;
-    mutable InformationMap messageInformation_;
-    typedef std::map<int, std::pair<std::vector<int>, std::vector<int>>> IM;
-    mutable IM m_im;
-
-    constexpr static int commTag_ = 0; // So says DUNE
-
-    void buildCommPairIdxs() const
-    {
-        int rank;
-        MPI_Comm_rank(m_cpuOwnerOverlapCopy.communicator(), &rank);
-        auto &ri = m_cpuOwnerOverlapCopy.remoteIndices();
-        auto end = ri.end();
-        std::vector<int> commpair_indicesCopyOnCPU;
-        std::vector<int> commpair_indicesOwnerCPU;
-
-        for(auto process = ri.begin(); process != end; ++process) {
-            int size = 0;
-            m_im[process->first] = std::pair(std::vector<int>(), std::vector<int>());
-            for(int send = 0; send < 2; ++send) {
-                auto remoteEnd = send ? process->second.first->end()
-                                      : process->second.second->end();
-                auto remote = send ? process->second.first->begin()
-                                   : process->second.second->begin();
-
-                while(remote != remoteEnd) {
-                    if (send ?
-                            (remote->localIndexPair().local().attribute() == 1)
-                             : (remote->attribute() == 1)) {
-                        ++size;
-                        if (send) {
-                            m_im[process->first].first.push_back(remote->localIndexPair().local().local());
-                        } else {
-                            m_im[process->first].second.push_back(remote->localIndexPair().local().local());
-                        }
-                    }
-                    ++remote;
-                }
-            }
+    GPUObliviousMPISender(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy)
+        : GPUSender<field_type, OwnerOverlapCopyCommunicationType>(cpuOwnerOverlapCopy)
+    {
     }
 
-            int sendBufIdx = 0;
-            int recvBufIdx = 0;
-            for (auto it = m_im.begin(); it != m_im.end(); it++) {
-                int noSend = it->second.first.size();
-                int noRecv = it->second.second.size();
-
-                if (noSend + noRecv > 0) {
-                    messageInformation_.insert(
-                        std::make_pair(it->first,
-                                       std::make_pair(MessageInformation(
-                                                          sendBufIdx * block_size,
-                                                          noSend * block_size * sizeof(field_type)),
-                                                      MessageInformation(
-                                                          recvBufIdx * block_size,
-                                                          noRecv * block_size * sizeof(field_type)))));
-
-                    for(int x = 0; x < noSend; x++) {
-                        for(int bs = 0; bs < block_size; bs++) {
-                            commpair_indicesOwnerCPU.push_back(it->second.first[x] * block_size + bs);
-                        }
-                    }
-                    for(int x = 0; x < noRecv; x++) {
-                        for(int bs = 0; bs < block_size; bs++) {
-                            commpair_indicesCopyOnCPU.push_back(it->second.second[x] * block_size + bs);
-                        }
-                    }
-                    sendBufIdx += noSend;
-                    recvBufIdx += noRecv;
-                }
-            }
-
-        m_commpair_indicesCopy = std::make_unique<CuVector<int>>(commpair_indicesCopyOnCPU);
-        m_commpair_indicesOwner = std::make_unique<CuVector<int>>(commpair_indicesOwnerCPU);
-
-        m_GPUSendBuf = std::make_unique<CuVector<field_type>>(sendBufIdx * block_size);
-        m_GPURecvBuf = std::make_unique<CuVector<field_type>>(recvBufIdx * block_size);
+    void copyOwnerToAll(const X& source, X& dest) const override {
+        // TODO: [perf] Can we reduce copying from the GPU here?
+        // TODO: [perf] Maybe create a global buffer instead?
+        auto sourceAsDuneVector = source.template asDuneBlockVector<block_size>();
+        auto destAsDuneVector = dest.template asDuneBlockVector<block_size>();
+        this->m_cpuOwnerOverlapCopy.copyOwnerToAll(sourceAsDuneVector, destAsDuneVector);
+        dest.copyFromHost(destAsDuneVector);
     }
 
-    void initIndexSet() const
+private:
+    void initIndexSet() const override
     {
         // We need indices that we will use in the project, dot and norm calls.
         // TODO: [premature perf] Can this be run once per instance? Or do we need to rebuild every time?
-        const auto& pis = m_cpuOwnerOverlapCopy.indexSet();
+        const auto& pis = this->m_cpuOwnerOverlapCopy.indexSet();
         std::vector<int> indicesCopyOnCPU;
         std::vector<int> indicesOwnerCPU;
         for (const auto& index : pis) {
@@ -331,11 +151,265 @@ private:
             }
         }
 
-        m_indicesCopy = std::make_unique<CuVector<int>>(indicesCopyOnCPU);
-        m_indicesOwner = std::make_unique<CuVector<int>>(indicesOwnerCPU);
+        this->m_indicesCopy = std::make_unique<CuVector<int>>(indicesCopyOnCPU);
+        this->m_indicesOwner = std::make_unique<CuVector<int>>(indicesOwnerCPU);
+    }
+};
+
+/**
+ * @brief Derived class of GPUSender that handles MPI calls made with CUDA-aware MPI.
+ * The copyOwnerToAll function uses MPI calls referring to data that resides on the GPU in order
+ * to send it directly to other GPUs, skipping the staging step on the CPU.
+ * @tparam field_type is float or double
+ * @tparam block_size is the block size of the block elements in the matrix
+ * @tparam OwnerOverlapCopyCommunicationType is typically a Dune::LinearOperator::communication_type
+ */
+template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
+class GPUAwareMPISender : public GPUSender<field_type, OwnerOverlapCopyCommunicationType>
+{
+public:
+    using X = CuVector<field_type>;
+
+    GPUAwareMPISender(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy)
+        : GPUSender<field_type, OwnerOverlapCopyCommunicationType>(cpuOwnerOverlapCopy)
+    {
+    }
+
+    void copyOwnerToAll(const X& source, X& dest) const override
+    {
+        OPM_ERROR_IF(&source != &dest, "The provided CuVectors' addresses do not match"); // In this context, source == dest!!!
+        std::call_once(this->m_initializedIndices, [&]() { initIndexSet(); });
+
+        int rank = this->m_cpuOwnerOverlapCopy.communicator().rank();
+        dest.prepareSendBuf(*m_GPUSendBuf, *m_commpairIndicesOwner);
+
+        // Start MPI stuff here...
+        // Note: This has been taken from DUNE's parallel/communicator.hh
+        std::vector<MPI_Request> sendRequests(m_messageInformation.size());
+        std::vector<MPI_Request> recvRequests(m_messageInformation.size());
+        std::vector<int> processMap(m_messageInformation.size());
+        size_t numberOfRealRecvRequests = 0;
+
+        using const_iterator = typename InformationMap::const_iterator;
+        const const_iterator end = m_messageInformation.end();
+
+        {
+            size_t i = 0;
+            for(const_iterator info = m_messageInformation.begin(); info != end; ++info, ++i) {
+                processMap[i]=info->first;
+                if(info->second.second.size_) {
+                    MPI_Irecv(m_GPURecvBuf->data()+info->second.second.start_,
+                              info->second.second.size_,
+                              MPI_BYTE,
+                              info->first,
+                              m_commTag,
+                              this->m_cpuOwnerOverlapCopy.communicator(),
+                              &recvRequests[i]);
+                    numberOfRealRecvRequests += 1;
+                } else {
+                    recvRequests[i]=MPI_REQUEST_NULL;
+                }
+            }
+        }
+
+        {
+            size_t i = 0;
+            for(const_iterator info = m_messageInformation.begin(); info != end; ++info, ++i) {
+                if(info->second.first.size_) {
+                    MPI_Issend(m_GPUSendBuf->data()+info->second.first.start_,
+                               info->second.first.size_,
+                               MPI_BYTE,
+                               info->first,
+                               m_commTag,
+                               this->m_cpuOwnerOverlapCopy.communicator(),
+                               &sendRequests[i]);
+                } else {
+                    sendRequests[i]=MPI_REQUEST_NULL;
+                }
+            }
+        }
+        int finished = MPI_UNDEFINED;
+        MPI_Status status;
+        for(size_t i = 0; i < numberOfRealRecvRequests; i++) {
+            status.MPI_ERROR=MPI_SUCCESS;
+            MPI_Waitany(m_messageInformation.size(), recvRequests.data(), &finished, &status);
+
+            if(status.MPI_ERROR!=MPI_SUCCESS) {
+                std::cerr<< rank << ": MPI_Error occurred while receiving message from "<< processMap[finished] << std::endl;
+                OPM_THROW(std::runtime_error, "MPI_Error while receiving message");
+            }
+        }
+        MPI_Status recvStatus;
+        for(size_t i = 0; i < m_messageInformation.size(); i++) {
+            if(MPI_SUCCESS!=MPI_Wait(&sendRequests[i], &recvStatus)) {
+                std::cerr << rank << ": MPI_Error occurred while sending message to " <<
+                    processMap[finished] << std::endl;
+                OPM_THROW(std::runtime_error, "MPI_Error while sending message");
+            }
+        }
+        // ...End of MPI stuff
+
+        dest.syncFromRecvBuf(*m_GPURecvBuf, *m_commpairIndicesCopy);
+    }
+
+private:
+    mutable std::unique_ptr<CuVector<int>> m_commpairIndicesCopy;
+    mutable std::unique_ptr<CuVector<int>> m_commpairIndicesOwner;
+    mutable std::unique_ptr<CuVector<field_type>> m_GPUSendBuf;
+    mutable std::unique_ptr<CuVector<field_type>> m_GPURecvBuf;
+
+    struct MessageInformation
+    {
+        MessageInformation() : start_(0), size_(0) {}
+        MessageInformation(size_t start, size_t size) : start_(start), size_(size) {}
+        size_t start_; // offset in elements of "field_type"
+        size_t size_; // size in bytes
+    };
+
+    using InformationMap = std::map<int, std::pair<MessageInformation, MessageInformation>>;
+    mutable InformationMap m_messageInformation;
+    using IM = std::map<int, std::pair<std::vector<int>, std::vector<int>>>;
+    mutable IM m_im;
+
+    constexpr static int m_commTag = 0; // So says DUNE
+
+    void buildCommPairIdxs() const
+    {
+        auto &ri = this->m_cpuOwnerOverlapCopy.remoteIndices();
+        std::vector<int> commpairIndicesCopyOnCPU;
+        std::vector<int> commpairIndicesOwnerCPU;
+
+        for(auto process : ri) {
+            int size = 0;
+            m_im[process.first] = std::pair(std::vector<int>(), std::vector<int>());
+            for(int send = 0; send < 2; ++send) {
+                auto remoteEnd = send ? process.second.first->end()
+                                      : process.second.second->end();
+                auto remote = send ? process.second.first->begin()
+                                   : process.second.second->begin();
+
+                while(remote != remoteEnd) {
+                    if (send ? (remote->localIndexPair().local().attribute() == 1)
+                             : (remote->attribute() == 1)) {
+                        ++size;
+                        if (send) {
+                            m_im[process.first].first.push_back(remote->localIndexPair().local().local());
+                        } else {
+                            m_im[process.first].second.push_back(remote->localIndexPair().local().local());
+                        }
+                    }
+                    ++remote;
+                }
+            }
+        }
+
+        int sendBufIdx = 0;
+        int recvBufIdx = 0;
+        for (auto it = m_im.begin(); it != m_im.end(); it++) {
+            int noSend = it->second.first.size();
+            int noRecv = it->second.second.size();
+
+            if (noSend + noRecv > 0) {
+                m_messageInformation.insert(
+                    std::make_pair(it->first,
+                                   std::make_pair(MessageInformation(
+                                                      sendBufIdx * block_size,
+                                                      noSend * block_size * sizeof(field_type)),
+                                                  MessageInformation(
+                                                      recvBufIdx * block_size,
+                                                      noRecv * block_size * sizeof(field_type)))));
+
+                for(int x = 0; x < noSend; x++) {
+                    for(int bs = 0; bs < block_size; bs++) {
+                        commpairIndicesOwnerCPU.push_back(it->second.first[x] * block_size + bs);
+                    }
+                }
+                for(int x = 0; x < noRecv; x++) {
+                    for(int bs = 0; bs < block_size; bs++) {
+                        commpairIndicesCopyOnCPU.push_back(it->second.second[x] * block_size + bs);
+                    }
+                }
+                sendBufIdx += noSend;
+                recvBufIdx += noRecv;
+            }
+        }
+
+        m_commpairIndicesCopy = std::make_unique<CuVector<int>>(commpairIndicesCopyOnCPU);
+        m_commpairIndicesOwner = std::make_unique<CuVector<int>>(commpairIndicesOwnerCPU);
+
+        m_GPUSendBuf = std::make_unique<CuVector<field_type>>(sendBufIdx * block_size);
+        m_GPURecvBuf = std::make_unique<CuVector<field_type>>(recvBufIdx * block_size);
+    }
+
+    void initIndexSet() const override
+    {
+        // We need indices that we will use in the project, dot and norm calls.
+        // TODO: [premature perf] Can this be run once per instance? Or do we need to rebuild every time?
+        const auto& pis = this->m_cpuOwnerOverlapCopy.indexSet();
+        std::vector<int> indicesCopyOnCPU;
+        std::vector<int> indicesOwnerCPU;
+        for (const auto& index : pis) {
+            if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::copy) {
+                for (int component = 0; component < block_size; ++component) {
+                    indicesCopyOnCPU.push_back(index.local().local() * block_size + component);
+                }
+            }
+
+            if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::owner) {
+                for (int component = 0; component < block_size; ++component) {
+                    indicesOwnerCPU.push_back(index.local().local() * block_size + component);
+                }
+            }
+        }
+
+        this->m_indicesCopy = std::make_unique<CuVector<int>>(indicesCopyOnCPU);
+        this->m_indicesOwner = std::make_unique<CuVector<int>>(indicesOwnerCPU);
 
         buildCommPairIdxs();
     }
 };
+
+/**
+ * @brief CUDA-compatible variant of Dune::OwnerOverlapCopyCommunication
+ *
+ * This class can essentially be seen as an adapter around Dune::OwnerOverlapCopyCommunication, and should work as
+ * a Dune::OwnerOverlapCopyCommunication on CuVectors
+ *
+ * @note This currently only has the functionality to parallelize the linear solve.
+ *
+ * @tparam field_type should be a field_type supported by CuVector (double, float)
+ * @tparam block_size the block size used (this is relevant for, say, figuring out the correct indices)
+ * @tparam OwnerOverlapCopyCommunicationType should mimic Dune::OwnerOverlapCopyCommunication.
+ */
+template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
+class CuOwnerOverlapCopy
+{
+public:
+    using X = CuVector<field_type>;
+
+    CuOwnerOverlapCopy(std::shared_ptr<GPUSender<field_type, OwnerOverlapCopyCommunicationType>> sender) : m_sender(sender){}
+
+    void copyOwnerToAll(const X& source, X& dest) const {
+        m_sender->copyOwnerToAll(source, dest);
+    }
+
+    void dot(const X& x, const X& y, field_type& output) const
+    {
+        m_sender->dot(x, y, output);
+    }
+
+    field_type norm(const X& x) const
+    {
+        return m_sender->norm(x);
+    }
+
+    void project(X& x) const
+    {
+        m_sender->project(x);
+    }
+
+private:
+    std::shared_ptr<GPUSender<field_type, OwnerOverlapCopyCommunicationType>> m_sender;
+};
 } // namespace Opm::cuistl
 #endif
diff --git a/opm/simulators/linalg/cuistl/SolverAdapter.hpp b/opm/simulators/linalg/cuistl/SolverAdapter.hpp
index df7984226..322d2e7e9 100644
--- a/opm/simulators/linalg/cuistl/SolverAdapter.hpp
+++ b/opm/simulators/linalg/cuistl/SolverAdapter.hpp
@@ -33,8 +33,6 @@
 #include
 #include
-
-
 
 namespace Opm::cuistl
 {
 //! @brief Wraps a CUDA solver to work with CPU data.
@@ -163,10 +161,31 @@ private:
         auto preconditionerReallyOnGPU = preconditionerAdapterAsHolder->getUnderlyingPreconditioner();
         const auto& communication = m_opOnCPUWithMatrix.getCommunication();
 
+        // Temporary solution: decide whether to use GPU-direct communication solely based on these preprocessor statements
+        bool mpiMightBeSupportedDuringCompilation = true;
+        bool mpiMightBeSupportedDuringRuntime = true;
+
+        #if defined(MPIX_CUDA_AWARE_SUPPORT) && !MPIX_CUDA_AWARE_SUPPORT
+        mpiMightBeSupportedDuringCompilation = false;
+        #endif /* MPIX_CUDA_AWARE_SUPPORT */
+
+        #if defined(MPIX_CUDA_AWARE_SUPPORT) && !MPIX_Query_cuda_support
+        mpiMightBeSupportedDuringRuntime = false;
+        #endif /* MPIX_CUDA_AWARE_SUPPORT */
+
+        // TODO: add the Operator's communication type as a named type with 'using'
+        std::shared_ptr> gpuComm;
+        if (mpiMightBeSupportedDuringCompilation && mpiMightBeSupportedDuringRuntime) {
+            gpuComm = std::make_shared>(communication);
+        }
+        else {
+            gpuComm = std::make_shared>(communication);
+        }
+
         using CudaCommunication = CuOwnerOverlapCopy;
         using SchwarzOperator
             = Dune::OverlappingSchwarzOperator, XGPU, XGPU, CudaCommunication>;
-        auto cudaCommunication = std::make_shared<CudaCommunication>(communication);
+        auto cudaCommunication = std::make_shared<CudaCommunication>(gpuComm);
 
         auto mpiPreconditioner = std::make_shared>(
             preconditionerReallyOnGPU, cudaCommunication);
diff --git a/tests/cuistl/test_cuowneroverlapcopy.cpp b/tests/cuistl/test_cuowneroverlapcopy.cpp
index 74e127444..5024c92ac 100644
--- a/tests/cuistl/test_cuowneroverlapcopy.cpp
+++ b/tests/cuistl/test_cuowneroverlapcopy.cpp
@@ -29,7 +29,9 @@
 #include
 #include
 #include
+#include
 #include
+#include
 
 bool init_unit_test_func()
@@ -41,6 +43,10 @@ int main(int argc, char** argv)
 {
     [[maybe_unused]] const auto& helper = Dune::MPIHelper::instance(argc, argv);
+    int rank, totalRanks;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &totalRanks);
+    Opm::cuistl::setDevice(rank, totalRanks);
     boost::unit_test::unit_test_main(&init_unit_test_func, argc, argv);
 }
@@ -58,8 +64,10 @@ BOOST_AUTO_TEST_CASE(TestProject)
     auto xCPU = std::vector<double> {{1.0, 2.0, 3.0}};
     auto xGPU = Opm::cuistl::CuVector<double>(xCPU);
 
+    auto gpuComm = std::make_shared>>(ownerOverlapCopy);
+
     auto cuOwnerOverlapCopy
-        = Opm::cuistl::CuOwnerOverlapCopy>(ownerOverlapCopy);
+        = Opm::cuistl::CuOwnerOverlapCopy>(gpuComm);
 
     cuOwnerOverlapCopy.project(xGPU);
@@ -88,8 +96,10 @@ BOOST_AUTO_TEST_CASE(TestDot)
     auto xCPU = std::vector<double> {{1.0, 2.0, 3.0}};
     auto xGPU = Opm::cuistl::CuVector<double>(xCPU);
 
+    auto gpuComm = std::make_shared>>(ownerOverlapCopy);
+
     auto cuOwnerOverlapCopy
-        = Opm::cuistl::CuOwnerOverlapCopy>(ownerOverlapCopy);
+        = Opm::cuistl::CuOwnerOverlapCopy>(gpuComm);
 
     double outputDune = -1.0;
     auto xDune = xGPU.asDuneBlockVector<1>();
@@ -100,5 +110,4 @@
     BOOST_CHECK_EQUAL(outputDune, output);
-
     BOOST_CHECK_EQUAL(4.0, output);
 }
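
For reference, a minimal usage sketch of the new sender-based API, not part of the patch. It mirrors the updated unit test above and assumes an already-constructed Dune::OwnerOverlapCopyCommunication named ownerOverlapCopy, double precision and block size 1; the alias Comm is introduced here only for readability.

// Sketch only: wire a GPU-oblivious sender into CuOwnerOverlapCopy, as the test does.
// `ownerOverlapCopy` (a Dune::OwnerOverlapCopyCommunication<int>) is assumed to exist.
#include <opm/simulators/linalg/cuistl/CuOwnerOverlapCopy.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>

using Comm = Dune::OwnerOverlapCopyCommunication<int>;
auto gpuComm = std::make_shared<Opm::cuistl::GPUObliviousMPISender<double, 1, Comm>>(ownerOverlapCopy);
auto cuComm  = Opm::cuistl::CuOwnerOverlapCopy<double, 1, Comm>(gpuComm);

auto xGPU = Opm::cuistl::CuVector<double>(std::vector<double>{1.0, 2.0, 3.0});
cuComm.project(xGPU);            // zero the non-owned entries
double result = 0.0;
cuComm.dot(xGPU, xGPU, result);  // dot product over owned indices, summed across MPI ranks

Swapping GPUObliviousMPISender for GPUAwareMPISender is the only change needed to use GPU-direct transfers; SolverAdapter makes that choice automatically via the preprocessor checks shown above.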
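The detection logic added to SolverAdapter can also be read as a small helper, sketched below. This is illustrative only and not part of the patch: the function name mightUseGpuDirect is hypothetical, and it combines the compile-time macro check from SolverAdapter.hpp with the runtime MPIX_Query_cuda_support() query that appears in the removed debug code, whereas the patch itself relies on preprocessor statements alone.

// Illustrative sketch (not in the patch): is GPU-direct MPI plausibly available?
#include <mpi.h>
#if defined(OPEN_MPI)
#include <mpi-ext.h>  // defines MPIX_CUDA_AWARE_SUPPORT / MPIX_Query_cuda_support() on Open MPI
#endif

inline bool mightUseGpuDirect()
{
    bool compileTime = true;
    bool runTime = true;
#if defined(MPIX_CUDA_AWARE_SUPPORT) && !MPIX_CUDA_AWARE_SUPPORT
    compileTime = false;  // library built without CUDA awareness
#endif
#if defined(MPIX_CUDA_AWARE_SUPPORT)
    runTime = (1 == MPIX_Query_cuda_support());  // runtime query, as in the removed debug printout
#endif
    // If the macro is undefined we cannot tell, and conservatively report both flags as true,
    // matching the "might be supported" defaults used in SolverAdapter.hpp.
    return compileTime && runTime;
}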