Merge pull request #5145 from multitalentloes/add_gpu_direct

Add GPU Direct
Atgeirr Flø Rasmussen 2024-04-29 14:08:25 +02:00 committed by GitHub
commit da5f20a0d9
7 changed files with 427 additions and 64 deletions


@ -22,32 +22,46 @@
#include <memory>
#include <mutex>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <vector>
namespace Opm::cuistl
{
/**
* @brief GPUSender is a wrapper for classes that implement copyOwnerToAll
* Its purpose is to let communicators be written against a generic GPUSender,
* hiding whether the implementation uses GPU-aware MPI or not
* @tparam field_type is float or double
* @tparam OwnerOverlapCopyCommunicationType is typically a Dune::LinearOperator::communication_type
*/
template<class field_type, class OwnerOverlapCopyCommunicationType>
class GPUSender {
public:
using X = CuVector<field_type>;
GPUSender(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy) : m_cpuOwnerOverlapCopy(cpuOwnerOverlapCopy){}
/**
* @brief copyOwnerToAll will copy source to the CPU, then call OwnerOverlapCopyCommunicationType::copyOwnerToAll on
* the copied data, and copy the result back to the GPU
* @param[in] source
* @param[out] dest
*/
virtual void copyOwnerToAll(const X& source, X& dest) const = 0;
virtual void initIndexSet() const = 0;
/**
* @brief project will project x to the owned subspace
*
* For each component i which is not owned, x_i will be set to 0
*
* @param[inout] x the vector to project
*/
void project(X& x) const
{
std::call_once(m_initializedIndices, [&]() { initIndexSet(); });
x.setZeroAtIndexSet(*m_indicesCopy);
}
/**
* @brief dot will carry out the dot product between x and y on the owned indices, then sum up the result across MPI
* processes.
@ -72,59 +86,55 @@ public:
field_type norm(const X& x) const
{
auto xDotX = field_type(0);
dot(x, x, xDotX);
return std::sqrt(xDotX);
}
protected:
// Used to make sure initIndexSet is only called once. This is somewhat of a
// premature optimization: we could simply always initialize these indices,
// but they are not always needed.
mutable std::once_flag m_initializedIndices;
mutable std::unique_ptr<CuVector<int>> m_indicesOwner;
mutable std::unique_ptr<CuVector<int>> m_indicesCopy;
const OwnerOverlapCopyCommunicationType& m_cpuOwnerOverlapCopy;
};
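The non-virtual members project, dot and norm are shared by every sender; dot is documented above as a dot product over the owned indices followed by a sum across MPI processes. A hedged, host-side sketch of that semantics (a plain C++ reference for illustration, not the CUDA implementation; ownerDot is an illustrative name, not part of the PR):

#include <vector>

// Reference semantics of the owner-masked dot product provided by GPUSender::dot:
// only locally owned entries contribute; the partial results are afterwards
// summed over all MPI ranks via the CPU communication object.
template <class T>
T ownerDot(const std::vector<T>& x, const std::vector<T>& y, const std::vector<int>& ownerIndices)
{
    T partial = 0;
    for (const int i : ownerIndices) {
        partial += x[i] * y[i]; // copy/overlap entries are skipped entirely
    }
    return partial; // the MPI sum across ranks happens after this
}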
/**
* @brief Derived class of GPUSender that handles MPI calls that should NOT use GPU direct communication
* The implementation moves data from the GPU to the CPU and then sends it using regular MPI
* @tparam field_type is float or double
* @tparam block_size is the block size of the block elements in the matrix
* @tparam OwnerOverlapCopyCommunicationType is typically a Dune::LinearOperator::communication_type
*/
template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
class GPUObliviousMPISender : public GPUSender<field_type, OwnerOverlapCopyCommunicationType>
{
public:
using X = CuVector<field_type>;
GPUObliviousMPISender(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy)
: GPUSender<field_type, OwnerOverlapCopyCommunicationType>(cpuOwnerOverlapCopy)
{
}
void copyOwnerToAll(const X& source, X& dest) const override {
// TODO: [perf] Can we reduce copying from the GPU here?
// TODO: [perf] Maybe create a global buffer instead?
auto sourceAsDuneVector = source.template asDuneBlockVector<block_size>();
auto destAsDuneVector = dest.template asDuneBlockVector<block_size>();
this->m_cpuOwnerOverlapCopy.copyOwnerToAll(sourceAsDuneVector, destAsDuneVector);
dest.copyFromHost(destAsDuneVector);
}
private:
void initIndexSet() const override
{
// We need indices that we will use in the project, dot and norm calls.
// TODO: [premature perf] Can this be run once per instance? Or do we need to rebuild every time?
const auto& pis = this->m_cpuOwnerOverlapCopy.indexSet();
std::vector<int> indicesCopyOnCPU;
std::vector<int> indicesOwnerCPU;
for (const auto& index : pis) {
@ -141,9 +151,263 @@ private:
}
}
this->m_indicesCopy = std::make_unique<CuVector<int>>(indicesCopyOnCPU);
this->m_indicesOwner = std::make_unique<CuVector<int>>(indicesOwnerCPU);
}
};
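initIndexSet above flattens the blocked index set into scalar indices on the GPU vector: a local block index i contributes the block_size consecutive entries i*block_size + component. A small self-contained illustration of that expansion, assuming block_size = 3 and hypothetical owned blocks {0, 2}:

#include <iostream>
#include <vector>

int main()
{
    const int blockSize = 3;
    const std::vector<int> ownerBlocks {0, 2}; // hypothetical locally owned block indices

    // Same expansion as initIndexSet: each owned block contributes blockSize scalar indices.
    std::vector<int> indicesOwner;
    for (const int block : ownerBlocks) {
        for (int component = 0; component < blockSize; ++component) {
            indicesOwner.push_back(block * blockSize + component);
        }
    }

    for (const int idx : indicesOwner) {
        std::cout << idx << ' '; // prints: 0 1 2 6 7 8
    }
    std::cout << '\n';
}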
/**
* @brief Derived class of GPUSender that handles MPI calls using CUDA-aware MPI
* The copyOwnerToAll function issues MPI calls that refer to data residing on the GPU in order
* to send it directly to other GPUs, skipping the staging step on the CPU
* @tparam field_type is float or double
* @tparam block_size is the block size of the block elements in the matrix
* @tparam OwnerOverlapCopyCommunicationType is typically a Dune::LinearOperator::communication_type
*/
template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
class GPUAwareMPISender : public GPUSender<field_type, OwnerOverlapCopyCommunicationType>
{
public:
using X = CuVector<field_type>;
GPUAwareMPISender(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy)
: GPUSender<field_type, OwnerOverlapCopyCommunicationType>(cpuOwnerOverlapCopy)
{
}
void copyOwnerToAll(const X& source, X& dest) const override
{
OPM_ERROR_IF(&source != &dest, "The provided CuVectors' addresses do not match"); // In this context, source == dest is required
std::call_once(this->m_initializedIndices, [&]() { initIndexSet(); });
int rank = this->m_cpuOwnerOverlapCopy.communicator().rank();
dest.prepareSendBuf(*m_GPUSendBuf, *m_commpairIndicesOwner);
// Start MPI stuff here...
// Note: This has been taken from DUNE's parallel/communicator.hh
std::vector<MPI_Request> sendRequests(m_messageInformation.size());
std::vector<MPI_Request> recvRequests(m_messageInformation.size());
std::vector<int> processMap(m_messageInformation.size());
size_t numberOfRealRecvRequests = 0;
using const_iterator = typename InformationMap::const_iterator;
const const_iterator end = m_messageInformation.end();
{
size_t i = 0;
for(const_iterator info = m_messageInformation.begin(); info != end; ++info, ++i) {
processMap[i]=info->first;
if(info->second.second.m_size) {
MPI_Irecv(m_GPURecvBuf->data()+info->second.second.m_start,
detail::to_int(info->second.second.m_size),
MPI_BYTE,
info->first,
m_commTag,
this->m_cpuOwnerOverlapCopy.communicator(),
&recvRequests[i]);
numberOfRealRecvRequests += 1;
} else {
recvRequests[i]=MPI_REQUEST_NULL;
}
}
}
{
size_t i = 0;
for(const_iterator info = m_messageInformation.begin(); info != end; ++info, ++i) {
if(info->second.first.m_size) {
MPI_Issend(m_GPUSendBuf->data()+info->second.first.m_start,
detail::to_int(info->second.first.m_size),
MPI_BYTE,
info->first,
m_commTag,
this->m_cpuOwnerOverlapCopy.communicator(),
&sendRequests[i]);
} else {
sendRequests[i]=MPI_REQUEST_NULL;
}
}
}
int finished = MPI_UNDEFINED;
MPI_Status status;
for(size_t i = 0; i < numberOfRealRecvRequests; i++) {
status.MPI_ERROR=MPI_SUCCESS;
MPI_Waitany(m_messageInformation.size(), recvRequests.data(), &finished, &status);
if(status.MPI_ERROR!=MPI_SUCCESS) {
OPM_THROW(std::runtime_error, fmt::format("MPI_Error occurred while rank {} received a message from rank {}", rank, processMap[finished]));
}
}
MPI_Status recvStatus;
for(size_t i = 0; i < m_messageInformation.size(); i++) {
if(MPI_SUCCESS!=MPI_Wait(&sendRequests[i], &recvStatus)) {
OPM_THROW(std::runtime_error, fmt::format("MPI_Error occurred while rank {} sent a message to rank {}", rank, processMap[i]));
}
}
// ...End of MPI stuff
dest.syncFromRecvBuf(*m_GPURecvBuf, *m_commpairIndicesCopy);
}
private:
mutable std::unique_ptr<CuVector<int>> m_commpairIndicesCopy;
mutable std::unique_ptr<CuVector<int>> m_commpairIndicesOwner;
mutable std::unique_ptr<CuVector<field_type>> m_GPUSendBuf;
mutable std::unique_ptr<CuVector<field_type>> m_GPURecvBuf;
struct MessageInformation
{
MessageInformation() : m_start(0), m_size(0) {}
MessageInformation(size_t start, size_t size) : m_start(start), m_size(size) {}
size_t m_start; // offset in elements of "field_type"
size_t m_size; // size in bytes
};
using InformationMap = std::map<int,std::pair<MessageInformation,MessageInformation> >;
mutable InformationMap m_messageInformation;
using IM = std::map<int,std::pair<std::vector<int>,std::vector<int> > >;
mutable IM m_im;
constexpr static int m_commTag = 0; // So says DUNE
void buildCommPairIdxs() const
{
auto &ri = this->m_cpuOwnerOverlapCopy.remoteIndices();
std::vector<int> commpairIndicesCopyOnCPU;
std::vector<int> commpairIndicesOwnerCPU;
for(auto process : ri) {
int size = 0;
m_im[process.first] = std::pair(std::vector<int>(), std::vector<int>());
for(int send = 0; send < 2; ++send) {
auto remoteEnd = send ? process.second.first->end()
: process.second.second->end();
auto remote = send ? process.second.first->begin()
: process.second.second->begin();
while(remote != remoteEnd) {
if (send ? (remote->localIndexPair().local().attribute() == 1)
: (remote->attribute() == 1)) {
++size;
if (send) {
m_im[process.first].first.push_back(remote->localIndexPair().local().local());
} else {
m_im[process.first].second.push_back(remote->localIndexPair().local().local());
}
}
++remote;
}
}
}
int sendBufIdx = 0;
int recvBufIdx = 0;
for (auto it = m_im.begin(); it != m_im.end(); it++) {
int noSend = it->second.first.size();
int noRecv = it->second.second.size();
if (noSend + noRecv > 0) {
m_messageInformation.insert(
std::make_pair(it->first,
std::make_pair(MessageInformation(
sendBufIdx * block_size,
noSend * block_size * sizeof(field_type)),
MessageInformation(
recvBufIdx * block_size,
noRecv * block_size * sizeof(field_type)))));
for(int x = 0; x < noSend; x++) {
for(int bs = 0; bs < block_size; bs++) {
commpairIndicesOwnerCPU.push_back(it->second.first[x] * block_size + bs);
}
}
for(int x = 0; x < noRecv; x++) {
for(int bs = 0; bs < block_size; bs++) {
commpairIndicesCopyOnCPU.push_back(it->second.second[x] * block_size + bs);
}
}
sendBufIdx += noSend;
recvBufIdx += noRecv;
}
}
m_commpairIndicesCopy = std::make_unique<CuVector<int>>(commpairIndicesCopyOnCPU);
m_commpairIndicesOwner = std::make_unique<CuVector<int>>(commpairIndicesOwnerCPU);
m_GPUSendBuf = std::make_unique<CuVector<field_type>>(sendBufIdx * block_size);
m_GPURecvBuf = std::make_unique<CuVector<field_type>>(recvBufIdx * block_size);
}
void initIndexSet() const override
{
// We need indices that we will use in the project, dot and norm calls.
// TODO: [premature perf] Can this be run once per instance? Or do we need to rebuild every time?
const auto& pis = this->m_cpuOwnerOverlapCopy.indexSet();
std::vector<int> indicesCopyOnCPU;
std::vector<int> indicesOwnerCPU;
for (const auto& index : pis) {
if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::copy) {
for (int component = 0; component < block_size; ++component) {
indicesCopyOnCPU.push_back(index.local().local() * block_size + component);
}
}
if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::owner) {
for (int component = 0; component < block_size; ++component) {
indicesOwnerCPU.push_back(index.local().local() * block_size + component);
}
}
}
this->m_indicesCopy = std::make_unique<CuVector<int>>(indicesCopyOnCPU);
this->m_indicesOwner = std::make_unique<CuVector<int>>(indicesOwnerCPU);
buildCommPairIdxs();
}
};
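buildCommPairIdxs lays out one contiguous segment per neighbouring rank in the send and receive buffers; MessageInformation records each segment's start (in elements of field_type) and size (in bytes). The CUDA kernels behind prepareSendBuf and syncFromRecvBuf then only gather into and scatter out of those segments. A host-only sketch of that pack/unpack step, with the MPI exchange in between omitted and the index lists made up for illustration:

#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    // Hypothetical flattened indices produced by buildCommPairIdxs for one neighbour:
    // entries we own and must send, and overlap ("copy") entries we will receive into.
    const std::vector<int> ownerIndices {0, 1, 2};
    const std::vector<int> copyIndices {6, 7, 8};

    std::vector<double> x {10, 11, 12, 0, 0, 0, -1, -1, -1};

    // prepareSendBuf: gather the owned entries into a contiguous send buffer.
    std::vector<double> sendBuf(ownerIndices.size());
    for (std::size_t i = 0; i < ownerIndices.size(); ++i) {
        sendBuf[i] = x[ownerIndices[i]];
    }

    // ... MPI_Issend/MPI_Irecv would exchange these buffers directly from GPU memory here ...
    std::vector<double> recvBuf = sendBuf; // pretend the neighbour sent us the same values

    // syncFromRecvBuf: scatter the received contiguous buffer into the overlap entries.
    for (std::size_t i = 0; i < copyIndices.size(); ++i) {
        x[copyIndices[i]] = recvBuf[i];
    }

    assert(x[6] == 10 && x[7] == 11 && x[8] == 12);
}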
/**
* @brief CUDA compatible variant of Dune::OwnerOverlapCopyCommunication
*
* This class can essentially be seen as an adapter around Dune::OwnerOverlapCopyCommunication, and should work as
* a Dune::OwnerOverlapCopyCommunication on CuVectors
*
* @note This currently only has the functionality to parallelize the linear solve.
*
* @tparam field_type should be a field_type supported by CuVector (double, float)
* @tparam block_size the block size used (this is relevant for, e.g., figuring out the correct indices)
* @tparam OwnerOverlapCopyCommunicationType should mimic Dune::OwnerOverlapCopyCommunication.
*/
template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
class CuOwnerOverlapCopy
{
public:
using X = CuVector<field_type>;
CuOwnerOverlapCopy(std::shared_ptr<GPUSender<field_type, OwnerOverlapCopyCommunicationType>> sender) : m_sender(sender){}
void copyOwnerToAll(const X& source, X& dest) const {
m_sender->copyOwnerToAll(source, dest);
}
void dot(const X& x, const X& y, field_type& output) const
{
m_sender->dot(x, y, output);
}
field_type norm(const X& x) const
{
return m_sender->norm(x);
}
void project(X& x) const
{
m_sender->project(x);
}
private:
std::shared_ptr<GPUSender<field_type, OwnerOverlapCopyCommunicationType>> m_sender;
};
} // namespace Opm::cuistl
#endif
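Putting the header together: a caller constructs a GPUSender (here the GPU-oblivious variant) and hands it to CuOwnerOverlapCopy, which forwards project, dot, norm and copyOwnerToAll to it. A minimal usage sketch along the lines of the unit test further down in this PR; the ownerOverlapCopy setup is assumed to exist already:

#include <memory>
#include <vector>
#include <dune/istl/owneroverlapcopy.hh>
#include <opm/simulators/linalg/cuistl/CuOwnerOverlapCopy.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>

// Sketch only: 'ownerOverlapCopy' is a Dune::OwnerOverlapCopyCommunication<int>
// set up elsewhere (as in the unit test at the bottom of this PR).
void sketch(const Dune::OwnerOverlapCopyCommunication<int>& ownerOverlapCopy)
{
    using Sender = Opm::cuistl::GPUObliviousMPISender<double, 1, Dune::OwnerOverlapCopyCommunication<int>>;
    auto gpuComm = std::make_shared<Sender>(ownerOverlapCopy);

    auto cuComm = Opm::cuistl::CuOwnerOverlapCopy<double, 1, Dune::OwnerOverlapCopyCommunication<int>>(gpuComm);

    auto x = Opm::cuistl::CuVector<double>(std::vector<double> {1.0, 2.0, 3.0});
    cuComm.project(x);                 // zero out non-owned entries
    double xx = 0.0;
    cuComm.dot(x, x, xx);              // owner-only dot product, summed across ranks
    const double nrm = cuComm.norm(x); // sqrt of the same quantity
    (void)nrm;
}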


@ -286,6 +286,20 @@ CuVector<T>::copyToHost(std::vector<T>& data) const
{
copyToHost(data.data(), data.size());
}
template <typename T>
void
CuVector<T>::prepareSendBuf(CuVector<T>& buffer, const CuVector<int>& indexSet) const
{
return detail::prepareSendBuf(m_dataOnDevice, buffer.data(), indexSet.dim(), indexSet.data());
}
template <typename T>
void
CuVector<T>::syncFromRecvBuf(CuVector<T>& buffer, const CuVector<int>& indexSet) const
{
return detail::syncFromRecvBuf(m_dataOnDevice, buffer.data(), indexSet.dim(), indexSet.data());
}
template class CuVector<double>;
template class CuVector<float>;
template class CuVector<int>;
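The two new CuVector members are thin wrappers around the CUDA kernels added further down and form a gather/scatter pair; the number of gathered elements is indexSet.dim(). A hedged sketch of how they could be exercised (sizes, values and the function name roundTripSketch are made up for illustration):

#include <vector>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>

void roundTripSketch()
{
    auto x = Opm::cuistl::CuVector<double>(std::vector<double> {1.0, 2.0, 3.0, 4.0});
    auto indices = Opm::cuistl::CuVector<int>(std::vector<int> {0, 2, 3});

    auto buffer = Opm::cuistl::CuVector<double>(3); // one element per index
    x.prepareSendBuf(buffer, indices);  // gathers {1.0, 3.0, 4.0} into buffer
    x.syncFromRecvBuf(buffer, indices); // scatters the same values back, so x is unchanged
}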


@ -231,6 +231,9 @@ public:
*/
void copyToHost(std::vector<T>& data) const;
/**
* @brief prepareSendBuf gathers the entries of this vector at the positions given by @p indexSet into @p buffer
*/
void prepareSendBuf(CuVector<T>& buffer, const CuVector<int>& indexSet) const;
/**
* @brief syncFromRecvBuf scatters the entries of @p buffer back into this vector at the positions given by @p indexSet
*/
void syncFromRecvBuf(CuVector<T>& buffer, const CuVector<int>& indexSet) const;
/**
* @brief operator *= multiplies every element by scalar
* @param scalar the scalar to with which to multiply every element


@ -33,7 +33,11 @@
#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
#include <opm/simulators/linalg/cuistl/detail/has_function.hpp>
#ifdef OPEN_MPI
#if OPEN_MPI
#include "mpi-ext.h"
#endif
#endif
namespace Opm::cuistl
{
@ -163,10 +167,34 @@ private:
auto preconditionerReallyOnGPU = preconditionerAdapterAsHolder->getUnderlyingPreconditioner();
const auto& communication = m_opOnCPUWithMatrix.getCommunication();
// Temporary solution: decide whether to use GPU direct communication based solely on these preprocessor statements
bool mpiSupportsCudaAwareAtCompileTime = false;
bool mpiSupportsCudaAwareAtRunTime = false;
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
mpiSupportsCudaAwareAtCompileTime = true;
#endif /* MPIX_CUDA_AWARE_SUPPORT */
#if defined(MPIX_CUDA_AWARE_SUPPORT)
if (1 == MPIX_Query_cuda_support()) {
mpiSupportsCudaAwareAtRunTime = true;
}
#endif /* MPIX_CUDA_AWARE_SUPPORT */
// TODO add typename Operator communication type as a named type with using
std::shared_ptr<Opm::cuistl::GPUSender<real_type, typename Operator::communication_type>> gpuComm;
if (mpiSupportsCudaAwareAtCompileTime && mpiSupportsCudaAwareAtRunTime){
gpuComm = std::make_shared<Opm::cuistl::GPUAwareMPISender<real_type, block_size, typename Operator::communication_type>>(communication);
}
else{
gpuComm = std::make_shared<Opm::cuistl::GPUObliviousMPISender<real_type, block_size, typename Operator::communication_type>>(communication);
}
using CudaCommunication = CuOwnerOverlapCopy<real_type, block_size, typename Operator::communication_type>;
using SchwarzOperator
= Dune::OverlappingSchwarzOperator<CuSparseMatrix<real_type>, XGPU, XGPU, CudaCommunication>;
auto cudaCommunication = std::make_shared<CudaCommunication>(gpuComm);
auto mpiPreconditioner = std::make_shared<CuBlockPreconditioner<XGPU, XGPU, CudaCommunication>>(
preconditionerReallyOnGPU, cudaCommunication);
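The selection above combines a compile-time check (the MPIX_CUDA_AWARE_SUPPORT macro from Open MPI's mpi-ext.h) with a runtime query, and only picks the GPU-aware sender when both agree. The same detection, condensed into a hedged standalone helper; mpiIsCudaAware is an illustrative name, and non-Open-MPI builds simply fall back to false:

#include <mpi.h>
#ifdef OPEN_MPI
#if OPEN_MPI
#include "mpi-ext.h" // provides MPIX_CUDA_AWARE_SUPPORT and MPIX_Query_cuda_support()
#endif
#endif

// Returns true only if the MPI library was built with CUDA support *and*
// reports it as enabled at runtime.
inline bool mpiIsCudaAware()
{
    bool compileTime = false;
    bool runTime = false;
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
    compileTime = true;
#endif
#if defined(MPIX_CUDA_AWARE_SUPPORT)
    runTime = (1 == MPIX_Query_cuda_support());
#endif
    return compileTime && runTime;
}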


@ -91,6 +91,27 @@ namespace
const auto threads = getThreads(numberOfElements);
return (numberOfElements + threads - 1) / threads;
}
template <class T>
__global__ void
prepareSendBufKernel(const T* a, T* buffer, size_t numberOfElements, const int* indices)
{
const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
if (globalIndex < numberOfElements) {
buffer[globalIndex] = a[indices[globalIndex]];
}
}
template <class T>
__global__ void
syncFromRecvBufKernel(T* a, T* buffer, size_t numberOfElements, const int* indices)
{
const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
if (globalIndex < numberOfElements) {
a[indices[globalIndex]] = buffer[globalIndex];
}
}
} // namespace
template <class T>
@ -132,6 +153,25 @@ template double innerProductAtIndices(const double*, const double*, double* buff
template float innerProductAtIndices(const float*, const float*, float* buffer, size_t, const int*);
template int innerProductAtIndices(const int*, const int*, int* buffer, size_t, const int*);
template <class T>
void prepareSendBuf(const T* deviceA, T* buffer, size_t numberOfElements, const int* indices)
{
prepareSendBufKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(deviceA, buffer, numberOfElements, indices);
cudaDeviceSynchronize(); // Make sure the send buffer is fully populated before MPI reads it.
}
template void prepareSendBuf(const double* deviceA, double* buffer, size_t numberOfElements, const int* indices);
template void prepareSendBuf(const float* deviceA, float* buffer, size_t numberOfElements, const int* indices);
template void prepareSendBuf(const int* deviceA, int* buffer, size_t numberOfElements, const int* indices);
template <class T>
void syncFromRecvBuf(T* deviceA, T* buffer, size_t numberOfElements, const int* indices)
{
syncFromRecvBufKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(deviceA, buffer, numberOfElements, indices);
// cudaDeviceSynchronize(); // Probably not needed: later device work is stream-ordered after this kernel.
}
template void syncFromRecvBuf(double* deviceA, double* buffer, size_t numberOfElements, const int* indices);
template void syncFromRecvBuf(float* deviceA, float* buffer, size_t numberOfElements, const int* indices);
template void syncFromRecvBuf(int* deviceA, int* buffer, size_t numberOfElements, const int* indices);
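Each launcher uses getBlocks(n) blocks of getThreads(n) threads, i.e. a ceiling division so every index gets exactly one thread, with the in-kernel bounds check guarding the last partial block. A plain C++ reference of the grid arithmetic and of what prepareSendBufKernel computes; the thread count of 1024 and the names blocksFor/prepareSendBufReference are illustrative:

#include <cstddef>

// Grid size: ceiling division, as in getBlocks() above.
// e.g. 1000 elements with 1024 threads -> 1 block; 1025 elements -> 2 blocks.
std::size_t blocksFor(std::size_t numberOfElements, std::size_t threadsPerBlock = 1024)
{
    return (numberOfElements + threadsPerBlock - 1) / threadsPerBlock;
}

// CPU reference for prepareSendBufKernel: buffer[i] = a[indices[i]].
template <class T>
void prepareSendBufReference(const T* a, T* buffer, std::size_t numberOfElements, const int* indices)
{
    for (std::size_t i = 0; i < numberOfElements; ++i) {
        buffer[i] = a[indices[i]];
    }
}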
template <class T>
void


@ -55,6 +55,11 @@ void setZeroAtIndexSet(T* deviceData, size_t numberOfElements, const int* indice
template <class T>
T innerProductAtIndices(const T* deviceA, const T* deviceB, T* buffer, size_t numberOfElements, const int* indices);
template <class T>
void prepareSendBuf(const T* deviceA, T* buffer, size_t numberOfElements, const int* indices);
template <class T>
void syncFromRecvBuf(T* deviceA, T* buffer, size_t numberOfElements, const int* indices);
/**
* @brief Compute the weighted matrix-vector product where the matrix is diagonal and the diagonal is given as a vector,
* meaning we compute the Hadamard product.


@ -29,7 +29,9 @@
#include <opm/simulators/linalg/cuistl/CuOwnerOverlapCopy.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
#include <opm/simulators/linalg/cuistl/set_device.hpp>
#include <random>
#include <mpi.h>
bool
init_unit_test_func()
@ -41,6 +43,10 @@ int
main(int argc, char** argv)
{
[[maybe_unused]] const auto& helper = Dune::MPIHelper::instance(argc, argv);
int rank, totalRanks;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &totalRanks);
Opm::cuistl::setDevice(rank, totalRanks);
boost::unit_test::unit_test_main(&init_unit_test_func, argc, argv);
}
@ -58,8 +64,10 @@ BOOST_AUTO_TEST_CASE(TestProject)
auto xCPU = std::vector<double> {{1.0, 2.0, 3.0}};
auto xGPU = Opm::cuistl::CuVector<double>(xCPU);
auto gpuComm = std::make_shared<Opm::cuistl::GPUObliviousMPISender<double, 1, Dune::OwnerOverlapCopyCommunication<int>>>(ownerOverlapCopy);
auto cuOwnerOverlapCopy
= Opm::cuistl::CuOwnerOverlapCopy<double, 1, Dune::OwnerOverlapCopyCommunication<int>>(gpuComm);
cuOwnerOverlapCopy.project(xGPU);
@ -88,8 +96,10 @@ BOOST_AUTO_TEST_CASE(TestDot)
auto xCPU = std::vector<double> {{1.0, 2.0, 3.0}};
auto xGPU = Opm::cuistl::CuVector<double>(xCPU);
auto gpuComm = std::make_shared<Opm::cuistl::GPUObliviousMPISender<double, 1, Dune::OwnerOverlapCopyCommunication<int>>>(ownerOverlapCopy);
auto cuOwnerOverlapCopy
= Opm::cuistl::CuOwnerOverlapCopy<double, 1, Dune::OwnerOverlapCopyCommunication<int>>(gpuComm);
double outputDune = -1.0;
auto xDune = xGPU.asDuneBlockVector<1>();
@ -100,5 +110,4 @@ BOOST_AUTO_TEST_CASE(TestDot)
BOOST_CHECK_EQUAL(outputDune, output);
BOOST_CHECK_EQUAL(4.0, output);
}
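The tests above exercise the GPU-oblivious sender explicitly. A hedged sketch of how the GPU-aware path could be driven through the same interface, reusing the ownerOverlapCopy and xGPU set up by those tests; actually exchanging data through copyOwnerToAll additionally requires an MPI library that accepts device pointers, which the existing tests do not assume:

// Sketch only: same wiring as TestProject/TestDot, but with the CUDA-aware sender.
auto gpuAwareComm = std::make_shared<
    Opm::cuistl::GPUAwareMPISender<double, 1, Dune::OwnerOverlapCopyCommunication<int>>>(ownerOverlapCopy);
auto cuOwnerOverlapCopyDirect
    = Opm::cuistl::CuOwnerOverlapCopy<double, 1, Dune::OwnerOverlapCopyCommunication<int>>(gpuAwareComm);

// project/dot/norm are inherited from GPUSender and behave the same as in the
// GPU-oblivious case; only copyOwnerToAll takes the GPU direct route.
cuOwnerOverlapCopyDirect.project(xGPU);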