Mirror of https://github.com/OPM/opm-simulators.git
Add classes handling correct MPI implementation
Make some changes to Georg's original code: replace dynamically allocated arrays with std::vector. Implement a new class structure that handles which MPI communication implementation to use. Create extra scopes to avoid reuse of the index variable i. Update related tests: update test_cuowneroverlapcopy to account for the new class structure, and remove the line that invalidated the MPI tests for multiple processes.
This commit is contained in: parent eb6f9dc1f9, commit 7235f34f0e
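
As an overview of the new class structure described above, a minimal sketch (names taken from the diff below; bodies abbreviated, so this is not the full implementation):

namespace Opm::cuistl
{
// Abstract interface that hides whether GPU-aware MPI is used.
template <class field_type, class OwnerOverlapCopyCommunicationType>
class GPUSender
{
public:
    using X = CuVector<field_type>;
    // Communicates owner values to all processes; implemented by the derived senders.
    virtual void copyOwnerToAll(const X& source, X& dest) const = 0;
    // project(), dot() and norm() are provided on top of the owner/copy index sets.
};

// Stages the data on the CPU and communicates with regular MPI.
template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
class GPUObliviousMPISender : public GPUSender<field_type, OwnerOverlapCopyCommunicationType> { /* ... */ };

// Passes GPU buffers directly to MPI (requires CUDA-aware MPI).
template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
class GPUAwareMPISender : public GPUSender<field_type, OwnerOverlapCopyCommunicationType> { /* ... */ };

// Facade used by the solver; delegates everything to the chosen sender.
template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
class CuOwnerOverlapCopy { /* holds a std::shared_ptr<GPUSender<...>> */ };
} // namespace Opm::cuistl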
@@ -22,32 +22,46 @@
#include <memory>
#include <mutex>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <vector>

namespace Opm::cuistl
{
/**
* @brief CUDA compatible variant of Dune::OwnerOverlapCopyCommunication
*
* This class can essentially be seen as an adapter around Dune::OwnerOverlapCopyCommunication, and should work as
* a Dune::OwnerOverlapCopyCommunication on CuVectors
*
*
* @note This currently only has the functionality to parallelize the linear solve.
*
* @tparam field_type should be a field_type supported by CuVector (double, float)
* @tparam block_size the block size used (this is relevant for say figuring out the correct indices)
* @tparam OwnerOverlapCopyCommunicationType should mimic Dune::OwnerOverlapCopyCommunication.
* @brief GPUSender is a wrapper class for classes which will implement copyOwnerToAll
* This is implemented with the intention of creating communicators with a generic GPUSender
* to hide whether the implementation uses GPU-aware MPI or not
* @tparam field_type is float or double
* @tparam OwnerOverlapCopyCommunicationType is typically a Dune::LinearOperator::communication_type
*/
template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
class CuOwnerOverlapCopy
{
template<class field_type, class OwnerOverlapCopyCommunicationType>
class GPUSender {
public:
using X = CuVector<field_type>;

CuOwnerOverlapCopy(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy)
: m_cpuOwnerOverlapCopy(cpuOwnerOverlapCopy)
GPUSender(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy) : m_cpuOwnerOverlapCopy(cpuOwnerOverlapCopy){}

/**
* @brief copyOwnerToAll will copy source to the CPU, then call OwnerOverlapCopyCommunicationType::copyOwnerToAll on
* the copied data, and copy the result back to the GPU
* @param[in] source
* @param[out] dest
*/
virtual void copyOwnerToAll(const X& source, X& dest) const = 0;
virtual void initIndexSet() const = 0;

/**
* @brief project will project x to the owned subspace
*
* For each component i which is not owned, x_i will be set to 0
*
* @param[inout] x the vector to project
*/
void project(X& x) const
{
std::call_once(this->m_initializedIndices, [&]() { initIndexSet(); });
x.setZeroAtIndexSet(*m_indicesCopy);
}

/**
* @brief dot will carry out the dot product between x and y on the owned indices, then sum up the result across MPI
* processes.
@@ -74,247 +88,53 @@ public:
auto xDotX = field_type(0);
this->dot(x, x, xDotX);

using std::sqrt;
return sqrt(xDotX);
// using std::sqrt;
return std::sqrt(xDotX);
}

/**
* @brief project will project x to the owned subspace
*
* For each component i which is not owned, x_i will be set to 0
*
* @param[inout] x the vector to project
*/
void project(X& x) const
{
std::call_once(m_initializedIndices, [&]() { initIndexSet(); });
x.setZeroAtIndexSet(*m_indicesCopy);
}

/**
* @brief copyOwnerToAll will copy source to the CPU, then call OwnerOverlapCopyCommunicationType::copyOwnerToAll on
* the copied data, and copy the result back to the GPU
* @param[in] source
* @param[out] dest
*/
void copyOwnerToAll_orig(const X& source, X& dest) const
{
// TODO: [perf] Can we reduce copying from the GPU here?
// TODO: [perf] Maybe create a global buffer instead?
auto sourceAsDuneVector = source.template asDuneBlockVector<block_size>();
auto destAsDuneVector = dest.template asDuneBlockVector<block_size>();
m_cpuOwnerOverlapCopy.copyOwnerToAll(sourceAsDuneVector, destAsDuneVector);
dest.copyFromHost(destAsDuneVector);
}

// Georg's new code intended to use GPU direct
void copyOwnerToAll(const X& source, X& dest) const
{

printf("\n\nGPU DIRECT CODE IS RUN\n\n");
printf("Compile time check:\n");
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
printf("This MPI library has CUDA-aware support.\n", MPIX_CUDA_AWARE_SUPPORT);
#elif defined(MPIX_CUDA_AWARE_SUPPORT) && !MPIX_CUDA_AWARE_SUPPORT
printf("This MPI library does not have CUDA-aware support.\n");
#else
printf("This MPI library cannot determine if there is CUDA-aware support.\n");
#endif /* MPIX_CUDA_AWARE_SUPPORT */

printf("Run time check:\n");
#if defined(MPIX_CUDA_AWARE_SUPPORT)
if (1 == MPIX_Query_cuda_support()) {
printf("This MPI library has CUDA-aware support.\n");
} else {
printf("This MPI library does not have CUDA-aware support.\n");
}
#else /* !defined(MPIX_CUDA_AWARE_SUPPORT) */
printf("This MPI library cannot determine if there is CUDA-aware support.\n");
#endif /* MPIX_CUDA_AWARE_SUPPORT */

assert(&source == &dest); // In this context, source == dest!!!
std::call_once(m_initializedIndices, [&]() { initIndexSet(); });

int rank;
MPI_Comm_rank(m_cpuOwnerOverlapCopy.communicator(), &rank);
dest.prepareSendBuf(*m_GPUSendBuf, *m_commpair_indicesOwner);

// Start MPI stuff here...
// Note: This has been taken from DUNE's parallel/communicator.hh
MPI_Request* sendRequests = new MPI_Request[messageInformation_.size()];
MPI_Request* recvRequests = new MPI_Request[messageInformation_.size()];
size_t numberOfRealRecvRequests = 0;

typedef typename InformationMap::const_iterator const_iterator;
const const_iterator end = messageInformation_.end();
size_t i=0;
int* processMap = new int[messageInformation_.size()];
for(const_iterator info = messageInformation_.begin(); info != end; ++info, ++i) {
processMap[i]=info->first;
if(info->second.second.size_) {
MPI_Irecv(m_GPURecvBuf->data()+info->second.second.start_,
info->second.second.size_,
MPI_BYTE,
info->first,
commTag_,
m_cpuOwnerOverlapCopy.communicator(),
&recvRequests[i]);
numberOfRealRecvRequests += 1;
} else {
recvRequests[i]=MPI_REQUEST_NULL;
}
}

i=0;
for(const_iterator info = messageInformation_.begin(); info != end; ++info, ++i) {
if(info->second.first.size_) {
MPI_Issend(m_GPUSendBuf->data()+info->second.first.start_,
info->second.first.size_,
MPI_BYTE,
info->first,
commTag_,
m_cpuOwnerOverlapCopy.communicator(),
&sendRequests[i]);
} else {
sendRequests[i]=MPI_REQUEST_NULL;
}
}
i=0;
int finished = MPI_UNDEFINED;
MPI_Status status;
for(i=0; i< numberOfRealRecvRequests; i++) {
status.MPI_ERROR=MPI_SUCCESS;
MPI_Waitany(messageInformation_.size(), recvRequests, &finished, &status);

if(status.MPI_ERROR!=MPI_SUCCESS) {
std::cerr<< rank << ": MPI_Error occurred while receiving message from "<< processMap[finished] << std::endl;
assert(false);
}
}
MPI_Status recvStatus;
for(i=0; i< messageInformation_.size(); i++) {
if(MPI_SUCCESS!=MPI_Wait(&sendRequests[i], &recvStatus)) {
std::cerr << rank << ": MPI_Error occurred while sending message to " << processMap[finished] << std::endl;
assert(false);
}
}
delete[] processMap;
delete[] sendRequests;
delete[] recvRequests;
// ...End of MPI stuff

dest.syncFromRecvBuf(*m_GPURecvBuf, *m_commpair_indicesCopy);
}

private:
const OwnerOverlapCopyCommunicationType& m_cpuOwnerOverlapCopy;

protected:
// Used to call the initIndexSet. Note that this is kind of a
// premature optimization, in the sense that we could just initialize these indices
// always, but they are not always used.
mutable std::once_flag m_initializedIndices;
mutable std::unique_ptr<CuVector<int>> m_indicesCopy;
mutable std::unique_ptr<CuVector<int>> m_indicesOwner;

mutable std::unique_ptr<CuVector<int>> m_commpair_indicesCopy;
mutable std::unique_ptr<CuVector<int>> m_commpair_indicesOwner;
mutable std::unique_ptr<CuVector<field_type>> m_GPUSendBuf;
mutable std::unique_ptr<CuVector<field_type>> m_GPURecvBuf;

struct MessageInformation
{
MessageInformation() : start_(0), size_(0) {}
MessageInformation(size_t start, size_t size) : start_(start), size_(size) {}
size_t start_; // offset in elements of "field_type"
size_t size_; // size in bytes
mutable std::unique_ptr<CuVector<int>> m_indicesCopy;
const OwnerOverlapCopyCommunicationType& m_cpuOwnerOverlapCopy;
};

typedef std::map<int,std::pair<MessageInformation,MessageInformation> > InformationMap;
mutable InformationMap messageInformation_;
typedef std::map<int,std::pair<std::vector<int>,std::vector<int> > > IM;
mutable IM m_im;

constexpr static int commTag_ = 0; // So says DUNE

void buildCommPairIdxs() const
/**
* @brief Derived class of GPUSender that handles MPI calls that should NOT use GPU direct communication
* The implementation moves data from the GPU to the CPU and then sends it using regular MPI
* @tparam field_type is float or double
* @tparam block_size is the block size of the block elements in the matrix
* @tparam OwnerOverlapCopyCommunicationType is typically a Dune::LinearOperator::communication_type
*/
template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
class GPUObliviousMPISender : public GPUSender<field_type, OwnerOverlapCopyCommunicationType>
{
int rank;
MPI_Comm_rank(m_cpuOwnerOverlapCopy.communicator(), &rank);
auto &ri = m_cpuOwnerOverlapCopy.remoteIndices();
auto end = ri.end();
std::vector<int> commpair_indicesCopyOnCPU;
std::vector<int> commpair_indicesOwnerCPU;
public:
using X = CuVector<field_type>;

for(auto process = ri.begin(); process != end; ++process) {
int size = 0;
m_im[process->first] = std::pair(std::vector<int>(), std::vector<int>());
for(int send = 0; send < 2; ++send) {
auto remoteEnd = send ? process->second.first->end()
: process->second.second->end();
auto remote = send ? process->second.first->begin()
: process->second.second->begin();

while(remote != remoteEnd) {
if (send ? (remote->localIndexPair().local().attribute() == 1)
: (remote->attribute() == 1)) {
++size;
if (send) {
m_im[process->first].first.push_back(remote->localIndexPair().local().local());
} else {
m_im[process->first].second.push_back(remote->localIndexPair().local().local());
}
}
++remote;
}
}
GPUObliviousMPISender(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy)
: GPUSender<field_type, OwnerOverlapCopyCommunicationType>(cpuOwnerOverlapCopy)
{
}

int sendBufIdx = 0;
int recvBufIdx = 0;
for (auto it = m_im.begin(); it != m_im.end(); it++) {
int noSend = it->second.first.size();
int noRecv = it->second.second.size();

if (noSend + noRecv > 0) {
messageInformation_.insert(
std::make_pair(it->first,
std::make_pair(MessageInformation(
sendBufIdx * block_size,
noSend * block_size * sizeof(field_type)),
MessageInformation(
recvBufIdx * block_size,
noRecv * block_size * sizeof(field_type)))));

for(int x = 0; x < noSend; x++) {
for(int bs = 0; bs < block_size; bs++) {
commpair_indicesOwnerCPU.push_back(it->second.first[x] * block_size + bs);
}
}
for(int x = 0; x < noRecv; x++) {
for(int bs = 0; bs < block_size; bs++) {
commpair_indicesCopyOnCPU.push_back(it->second.second[x] * block_size + bs);
}
}
sendBufIdx += noSend;
recvBufIdx += noRecv;
}
void copyOwnerToAll(const X& source, X& dest) const override {
// TODO: [perf] Can we reduce copying from the GPU here?
// TODO: [perf] Maybe create a global buffer instead?
auto sourceAsDuneVector = source.template asDuneBlockVector<block_size>();
auto destAsDuneVector = dest.template asDuneBlockVector<block_size>();
this->m_cpuOwnerOverlapCopy.copyOwnerToAll(sourceAsDuneVector, destAsDuneVector);
dest.copyFromHost(destAsDuneVector);
}

m_commpair_indicesCopy = std::make_unique<CuVector<int>>(commpair_indicesCopyOnCPU);
m_commpair_indicesOwner = std::make_unique<CuVector<int>>(commpair_indicesOwnerCPU);

m_GPUSendBuf = std::make_unique<CuVector<field_type>>(sendBufIdx * block_size);
m_GPURecvBuf = std::make_unique<CuVector<field_type>>(recvBufIdx * block_size);
}

void initIndexSet() const
private:
void initIndexSet() const override
{
// We need indices that we will use in the project, dot and norm calls.
// TODO: [premature perf] Can this be run once per instance? Or do we need to rebuild every time?
const auto& pis = m_cpuOwnerOverlapCopy.indexSet();
const auto& pis = this->m_cpuOwnerOverlapCopy.indexSet();
std::vector<int> indicesCopyOnCPU;
std::vector<int> indicesOwnerCPU;
for (const auto& index : pis) {
@@ -331,11 +151,265 @@ private:
}
}

m_indicesCopy = std::make_unique<CuVector<int>>(indicesCopyOnCPU);
m_indicesOwner = std::make_unique<CuVector<int>>(indicesOwnerCPU);
this->m_indicesCopy = std::make_unique<CuVector<int>>(indicesCopyOnCPU);
this->m_indicesOwner = std::make_unique<CuVector<int>>(indicesOwnerCPU);
}
};

/**
* @brief Derived class of GPUSender that handles MPI calls made with CUDA-aware MPI
* The copyOwnerToAll function uses MPI calls referring to data that resides on the GPU in order
* to send it directly to other GPUs, skipping the staging step on the CPU
* @tparam field_type is float or double
* @tparam block_size is the block size of the block elements in the matrix
* @tparam OwnerOverlapCopyCommunicationType is typically a Dune::LinearOperator::communication_type
*/
template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
class GPUAwareMPISender : public GPUSender<field_type, OwnerOverlapCopyCommunicationType>
{
public:
using X = CuVector<field_type>;

GPUAwareMPISender(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy)
: GPUSender<field_type, OwnerOverlapCopyCommunicationType>(cpuOwnerOverlapCopy)
{
}

void copyOwnerToAll(const X& source, X& dest) const override
{

OPM_ERROR_IF(&source != &dest, "The provided CuVectors' address did not match"); // In this context, source == dest!!!
std::call_once(this->m_initializedIndices, [&]() { initIndexSet(); });

int rank = this->m_cpuOwnerOverlapCopy.communicator().rank();
dest.prepareSendBuf(*m_GPUSendBuf, *m_commpairIndicesOwner);

// Start MPI stuff here...
// Note: This has been taken from DUNE's parallel/communicator.hh
std::vector<MPI_Request> sendRequests(m_messageInformation.size());
std::vector<MPI_Request> recvRequests(m_messageInformation.size());
std::vector<int> processMap(m_messageInformation.size());
size_t numberOfRealRecvRequests = 0;

using const_iterator = typename InformationMap::const_iterator;
const const_iterator end = m_messageInformation.end();

{
size_t i = 0;
for(const_iterator info = m_messageInformation.begin(); info != end; ++info, ++i) {
processMap[i]=info->first;
if(info->second.second.size_) {
MPI_Irecv(m_GPURecvBuf->data()+info->second.second.start_,
info->second.second.size_,
MPI_BYTE,
info->first,
m_commTag,
this->m_cpuOwnerOverlapCopy.communicator(),
&recvRequests[i]);
numberOfRealRecvRequests += 1;
} else {
recvRequests[i]=MPI_REQUEST_NULL;
}
}
}

{
size_t i = 0;
for(const_iterator info = m_messageInformation.begin(); info != end; ++info, ++i) {
if(info->second.first.size_) {
MPI_Issend(m_GPUSendBuf->data()+info->second.first.start_,
info->second.first.size_,
MPI_BYTE,
info->first,
m_commTag,
this->m_cpuOwnerOverlapCopy.communicator(),
&sendRequests[i]);
} else {
sendRequests[i]=MPI_REQUEST_NULL;
}
}
}
int finished = MPI_UNDEFINED;
MPI_Status status;
for(size_t i = 0; i < numberOfRealRecvRequests; i++) {
status.MPI_ERROR=MPI_SUCCESS;
MPI_Waitany(m_messageInformation.size(), recvRequests.data(), &finished, &status);

if(status.MPI_ERROR!=MPI_SUCCESS) {
std::cerr<< rank << ": MPI_Error occurred while receiving message from "<< processMap[finished] << std::endl;
OPM_THROW(std::runtime_error, "MPI_Error while receiving message");
}
}
MPI_Status recvStatus;
for(size_t i = 0; i < m_messageInformation.size(); i++) {
if(MPI_SUCCESS!=MPI_Wait(&sendRequests[i], &recvStatus)) {
std::cerr << rank << ": MPI_Error occurred while sending message to " << processMap[finished] << std::endl;
OPM_THROW(std::runtime_error, "MPI_Error while sending message");
}
}
// ...End of MPI stuff

dest.syncFromRecvBuf(*m_GPURecvBuf, *m_commpairIndicesCopy);
}

private:
mutable std::unique_ptr<CuVector<int>> m_commpairIndicesCopy;
mutable std::unique_ptr<CuVector<int>> m_commpairIndicesOwner;
mutable std::unique_ptr<CuVector<field_type>> m_GPUSendBuf;
mutable std::unique_ptr<CuVector<field_type>> m_GPURecvBuf;

struct MessageInformation
{
MessageInformation() : start_(0), size_(0) {}
MessageInformation(size_t start, size_t size) : start_(start), size_(size) {}
size_t start_; // offset in elements of "field_type"
size_t size_; // size in bytes
};

using InformationMap = std::map<int,std::pair<MessageInformation,MessageInformation> >;
mutable InformationMap m_messageInformation;
using IM = std::map<int,std::pair<std::vector<int>,std::vector<int> > >;
mutable IM m_im;

constexpr static int m_commTag = 0; // So says DUNE

void buildCommPairIdxs() const
{
auto &ri = this->m_cpuOwnerOverlapCopy.remoteIndices();
std::vector<int> commpairIndicesCopyOnCPU;
std::vector<int> commpairIndicesOwnerCPU;

for(auto process : ri) {
int size = 0;
m_im[process.first] = std::pair(std::vector<int>(), std::vector<int>());
for(int send = 0; send < 2; ++send) {
auto remoteEnd = send ? process.second.first->end()
: process.second.second->end();
auto remote = send ? process.second.first->begin()
: process.second.second->begin();

while(remote != remoteEnd) {
if (send ? (remote->localIndexPair().local().attribute() == 1)
: (remote->attribute() == 1)) {
++size;
if (send) {
m_im[process.first].first.push_back(remote->localIndexPair().local().local());
} else {
m_im[process.first].second.push_back(remote->localIndexPair().local().local());
}
}
++remote;
}
}
}

int sendBufIdx = 0;
int recvBufIdx = 0;
for (auto it = m_im.begin(); it != m_im.end(); it++) {
int noSend = it->second.first.size();
int noRecv = it->second.second.size();

if (noSend + noRecv > 0) {
m_messageInformation.insert(
std::make_pair(it->first,
std::make_pair(MessageInformation(
sendBufIdx * block_size,
noSend * block_size * sizeof(field_type)),
MessageInformation(
recvBufIdx * block_size,
noRecv * block_size * sizeof(field_type)))));

for(int x = 0; x < noSend; x++) {
for(int bs = 0; bs < block_size; bs++) {
commpairIndicesOwnerCPU.push_back(it->second.first[x] * block_size + bs);
}
}
for(int x = 0; x < noRecv; x++) {
for(int bs = 0; bs < block_size; bs++) {
commpairIndicesCopyOnCPU.push_back(it->second.second[x] * block_size + bs);
}
}
sendBufIdx += noSend;
recvBufIdx += noRecv;
}
}

m_commpairIndicesCopy = std::make_unique<CuVector<int>>(commpairIndicesCopyOnCPU);
m_commpairIndicesOwner = std::make_unique<CuVector<int>>(commpairIndicesOwnerCPU);

m_GPUSendBuf = std::make_unique<CuVector<field_type>>(sendBufIdx * block_size);
m_GPURecvBuf = std::make_unique<CuVector<field_type>>(recvBufIdx * block_size);
}

void initIndexSet() const override
{
// We need indices that we will use in the project, dot and norm calls.
// TODO: [premature perf] Can this be run once per instance? Or do we need to rebuild every time?
const auto& pis = this->m_cpuOwnerOverlapCopy.indexSet();
std::vector<int> indicesCopyOnCPU;
std::vector<int> indicesOwnerCPU;
for (const auto& index : pis) {
if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::copy) {
for (int component = 0; component < block_size; ++component) {
indicesCopyOnCPU.push_back(index.local().local() * block_size + component);
}
}

if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::owner) {
for (int component = 0; component < block_size; ++component) {
indicesOwnerCPU.push_back(index.local().local() * block_size + component);
}
}
}

this->m_indicesCopy = std::make_unique<CuVector<int>>(indicesCopyOnCPU);
this->m_indicesOwner = std::make_unique<CuVector<int>>(indicesOwnerCPU);

buildCommPairIdxs();
}
};

/**
* @brief CUDA compatible variant of Dune::OwnerOverlapCopyCommunication
*
* This class can essentially be seen as an adapter around Dune::OwnerOverlapCopyCommunication, and should work as
* a Dune::OwnerOverlapCopyCommunication on CuVectors
*
* @note This currently only has the functionality to parallelize the linear solve.
*
* @tparam field_type should be a field_type supported by CuVector (double, float)
* @tparam block_size the block size used (this is relevant for say figuring out the correct indices)
* @tparam OwnerOverlapCopyCommunicationType should mimic Dune::OwnerOverlapCopyCommunication.
*/
template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
class CuOwnerOverlapCopy
{
public:
using X = CuVector<field_type>;

CuOwnerOverlapCopy(std::shared_ptr<GPUSender<field_type, OwnerOverlapCopyCommunicationType>> sender) : m_sender(sender){}

void copyOwnerToAll(const X& source, X& dest) const {
m_sender->copyOwnerToAll(source, dest);
}

void dot(const X& x, const X& y, field_type& output) const
{
m_sender->dot(x, y, output);
}

field_type norm(const X& x) const
{
return m_sender->norm(x);
}

void project(X& x) const
{
m_sender->project(x);
}

private:
std::shared_ptr<GPUSender<field_type, OwnerOverlapCopyCommunicationType>> m_sender;
};
} // namespace Opm::cuistl
#endif

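Taken together, the header above is used roughly as follows; a minimal sketch, assuming a fully built Dune::OwnerOverlapCopyCommunication<int> named cpuComm and a CuVector whose layout matches its index set (cpuComm, x and localSize are illustrative names, not from the diff):

using Comm = Dune::OwnerOverlapCopyCommunication<int>;
// Pick a sender; GPUObliviousMPISender works with any MPI library.
auto sender = std::make_shared<Opm::cuistl::GPUObliviousMPISender<double, 1, Comm>>(cpuComm);
auto cuComm = Opm::cuistl::CuOwnerOverlapCopy<double, 1, Comm>(sender);

Opm::cuistl::CuVector<double> x(localSize);
cuComm.copyOwnerToAll(x, x); // exchange overlap values
cuComm.project(x);           // zero out non-owned entries
double xDotX = 0.0;
cuComm.dot(x, x, xDotX);     // owner-only dot product, summed over all ranks
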
@@ -33,8 +33,6 @@
#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
#include <opm/simulators/linalg/cuistl/detail/has_function.hpp>


namespace Opm::cuistl
{
//! @brief Wraps a CUDA solver to work with CPU data.
@@ -163,10 +161,31 @@ private:
auto preconditionerReallyOnGPU = preconditionerAdapterAsHolder->getUnderlyingPreconditioner();
const auto& communication = m_opOnCPUWithMatrix.getCommunication();

// Temporary solution: use GPU direct communication based solely on these preprocessor statements
bool mpiMightBeSupportedDuringCompilation = true;
bool mpiMightBeSupportedDuringRuntime = true;

#if defined(MPIX_CUDA_AWARE_SUPPORT) && !MPIX_CUDA_AWARE_SUPPORT
mpiMightBeSupportedDuringCompilation = false;
#endif /* MPIX_CUDA_AWARE_SUPPORT */

#if defined(MPIX_CUDA_AWARE_SUPPORT) && !MPIX_Query_cuda_support
mpiMightBeSupportedDuringRuntime = false;
#endif /* MPIX_CUDA_AWARE_SUPPORT */

// TODO add typename Operator communication type as a named type with using
std::shared_ptr<Opm::cuistl::GPUSender<real_type, typename Operator::communication_type>> gpuComm;
if (mpiMightBeSupportedDuringCompilation && mpiMightBeSupportedDuringRuntime){
gpuComm = std::make_shared<Opm::cuistl::GPUAwareMPISender<real_type, block_size, typename Operator::communication_type>>(communication);
}
else{
gpuComm = std::make_shared<Opm::cuistl::GPUObliviousMPISender<real_type, block_size, typename Operator::communication_type>>(communication);
}

using CudaCommunication = CuOwnerOverlapCopy<real_type, block_size, typename Operator::communication_type>;
using SchwarzOperator
= Dune::OverlappingSchwarzOperator<CuSparseMatrix<real_type>, XGPU, XGPU, CudaCommunication>;
auto cudaCommunication = std::make_shared<CudaCommunication>(communication);
auto cudaCommunication = std::make_shared<CudaCommunication>(gpuComm);

auto mpiPreconditioner = std::make_shared<CuBlockPreconditioner<XGPU, XGPU, CudaCommunication>>(
preconditionerReallyOnGPU, cudaCommunication);

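Note that in the hunk above the run-time branch tests MPIX_Query_cuda_support inside an #if, where the preprocessor replaces the plain identifier with 0 instead of calling the function. An actual run-time query would look roughly like the sketch below; MPIX_CUDA_AWARE_SUPPORT and MPIX_Query_cuda_support() are Open MPI extensions declared in <mpi-ext.h>, and other MPI implementations may not provide them:

#include <mpi.h>
#if defined(OPEN_MPI) && OPEN_MPI
#include <mpi-ext.h> // declares MPIX_Query_cuda_support() and MPIX_CUDA_AWARE_SUPPORT
#endif

// Returns true only if the MPI library reports CUDA awareness at run time.
static bool mpiIsCudaAwareAtRuntime()
{
#if defined(MPIX_CUDA_AWARE_SUPPORT)
    return 1 == MPIX_Query_cuda_support();
#else
    return false; // cannot tell, so fall back to the GPU-oblivious path
#endif
}
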
@@ -29,7 +29,9 @@
#include <opm/simulators/linalg/cuistl/CuOwnerOverlapCopy.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
#include <opm/simulators/linalg/cuistl/set_device.hpp>
#include <random>
#include <mpi.h>

bool
init_unit_test_func()
@@ -41,6 +43,10 @@ int
main(int argc, char** argv)
{
[[maybe_unused]] const auto& helper = Dune::MPIHelper::instance(argc, argv);
int rank, totalRanks;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &totalRanks);
Opm::cuistl::setDevice(rank, totalRanks);
boost::unit_test::unit_test_main(&init_unit_test_func, argc, argv);
}

@@ -58,8 +64,10 @@ BOOST_AUTO_TEST_CASE(TestProject)
auto xCPU = std::vector<double> {{1.0, 2.0, 3.0}};
auto xGPU = Opm::cuistl::CuVector<double>(xCPU);

auto gpuComm = std::make_shared<Opm::cuistl::GPUObliviousMPISender<double, 1, Dune::OwnerOverlapCopyCommunication<int>>>(ownerOverlapCopy);

auto cuOwnerOverlapCopy
= Opm::cuistl::CuOwnerOverlapCopy<double, 1, Dune::OwnerOverlapCopyCommunication<int>>(ownerOverlapCopy);
= Opm::cuistl::CuOwnerOverlapCopy<double, 1, Dune::OwnerOverlapCopyCommunication<int>>(gpuComm);

cuOwnerOverlapCopy.project(xGPU);

@@ -88,8 +96,10 @@ BOOST_AUTO_TEST_CASE(TestDot)
auto xCPU = std::vector<double> {{1.0, 2.0, 3.0}};
auto xGPU = Opm::cuistl::CuVector<double>(xCPU);

auto gpuComm = std::make_shared<Opm::cuistl::GPUObliviousMPISender<double, 1, Dune::OwnerOverlapCopyCommunication<int>>>(ownerOverlapCopy);

auto cuOwnerOverlapCopy
= Opm::cuistl::CuOwnerOverlapCopy<double, 1, Dune::OwnerOverlapCopyCommunication<int>>(ownerOverlapCopy);
= Opm::cuistl::CuOwnerOverlapCopy<double, 1, Dune::OwnerOverlapCopyCommunication<int>>(gpuComm);

double outputDune = -1.0;
auto xDune = xGPU.asDuneBlockVector<1>();
@@ -100,5 +110,4 @@ BOOST_AUTO_TEST_CASE(TestDot)


BOOST_CHECK_EQUAL(outputDune, output);
BOOST_CHECK_EQUAL(4.0, output);
}
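Since both senders implement the same GPUSender interface, the updated tests could exercise either path by swapping only the sender construction; a sketch, reusing the ownerOverlapCopy object set up in the tests above (the useGpuAwareMpi flag is hypothetical, for example the result of a run-time query as sketched earlier):

using Comm = Dune::OwnerOverlapCopyCommunication<int>;
std::shared_ptr<Opm::cuistl::GPUSender<double, Comm>> gpuComm;
if (useGpuAwareMpi) {
    // Requires an MPI library with CUDA-aware support.
    gpuComm = std::make_shared<Opm::cuistl::GPUAwareMPISender<double, 1, Comm>>(ownerOverlapCopy);
} else {
    gpuComm = std::make_shared<Opm::cuistl::GPUObliviousMPISender<double, 1, Comm>>(ownerOverlapCopy);
}
auto cuOwnerOverlapCopy = Opm::cuistl::CuOwnerOverlapCopy<double, 1, Comm>(gpuComm);
// The remaining assertions (project/dot checks) stay unchanged.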