From ceb15e22e3d3ade3eb1f306d1db82c7c9b70adc1 Mon Sep 17 00:00:00 2001 From: Kjetil Olsen Lye Date: Wed, 31 May 2023 14:51:00 +0200 Subject: [PATCH] Expose CuISTL solver in FlexibleSolver. --- CMakeLists.txt | 2 + CMakeLists_files.cmake | 14 +- opm/simulators/linalg/FlexibleSolver_impl.hpp | 14 ++ .../linalg/cuistl/CuBlockPreconditioner.hpp | 125 ++++++++++ .../linalg/cuistl/CuOwnerOverlapCopy.hpp | 149 ++++++++++++ .../linalg/cuistl/PreconditionerAdapter.hpp | 11 +- .../linalg/cuistl/PreconditionerHolder.hpp | 43 ++++ .../linalg/cuistl/SolverAdapter.hpp | 214 ++++++++++++++++++ .../linalg/cuistl/detail/has_function.hpp | 37 +++ tests/cuistl/test_cuowneroverlapcopy.cpp | 104 +++++++++ tests/cuistl/test_solver_adapter.cpp | 117 ++++++++++ 11 files changed, 822 insertions(+), 8 deletions(-) create mode 100644 opm/simulators/linalg/cuistl/CuBlockPreconditioner.hpp create mode 100644 opm/simulators/linalg/cuistl/CuOwnerOverlapCopy.hpp create mode 100644 opm/simulators/linalg/cuistl/PreconditionerHolder.hpp create mode 100644 opm/simulators/linalg/cuistl/SolverAdapter.hpp create mode 100644 tests/cuistl/test_cuowneroverlapcopy.cpp create mode 100644 tests/cuistl/test_solver_adapter.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index b96d01690..5d3a208b4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -552,6 +552,8 @@ if(CUDA_FOUND) cuvector cusparsematrix cuseqilu0 + cuowneroverlapcopy + solver_adapter PROPERTIES LABELS gpu_cuda) endif() diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake index 9a76bb4a7..cd5f1b340 100644 --- a/CMakeLists_files.cmake +++ b/CMakeLists_files.cmake @@ -172,16 +172,13 @@ if(CUDA_FOUND) list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/has_function.hpp) list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/preconditioner_should_call_post_pre.hpp) list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp) -<<<<<<< HEAD list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/CuSeqILU0.hpp) list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/fix_zero_diagonal.hpp) list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/PreconditionerConvertFieldTypeAdapter.hpp) -======= - list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/PreconditionerConvertFieldTypeAdapter.hpp) - - ->>>>>>> 05ee1a855 (Added conversion preconditioner.) 
- + list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/CuOwnerOverlapCopy.hpp) + list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/SolverAdapter.hpp) + list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/CuBlockPreconditioner.hpp) + list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/PreconditionerHolder.hpp) endif() if(OPENCL_FOUND) @@ -271,6 +268,9 @@ if(CUDA_FOUND) list(APPEND TEST_SOURCE_FILES tests/cuistl/test_safe_conversion.cpp) list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cuseqilu0.cpp) list(APPEND TEST_SOURCE_FILES tests/cuistl/test_converttofloatadapter.cpp) + list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cuowneroverlapcopy.cpp) + list(APPEND TEST_SOURCE_FILES tests/cuistl/test_solver_adapter.cpp) + endif() if(OPENCL_FOUND) diff --git a/opm/simulators/linalg/FlexibleSolver_impl.hpp b/opm/simulators/linalg/FlexibleSolver_impl.hpp index 8c75cc346..a377ef47d 100644 --- a/opm/simulators/linalg/FlexibleSolver_impl.hpp +++ b/opm/simulators/linalg/FlexibleSolver_impl.hpp @@ -37,6 +37,10 @@ #include #include +#if HAVE_CUDA +#include +#endif + namespace Dune { /// Create a sequential solver. @@ -184,6 +188,16 @@ namespace Dune bool dummy = false; using MatrixType = std::remove_const_tgetmat())>>; linsolver_ = std::make_shared>(linearoperator_for_solver_->getmat(), verbosity, dummy); +#endif +#if HAVE_CUDA + } else if (solver_type == "cubicgstab") { + linsolver_.reset(new Opm::cuistl::SolverAdapter( + *linearoperator_for_solver_, + scalarproduct_, + preconditioner_, + tol, // desired residual reduction factor + maxiter, // maximum number of iterations + verbosity)); #endif } else { OPM_THROW(std::invalid_argument, diff --git a/opm/simulators/linalg/cuistl/CuBlockPreconditioner.hpp b/opm/simulators/linalg/cuistl/CuBlockPreconditioner.hpp new file mode 100644 index 000000000..c5cfde3a3 --- /dev/null +++ b/opm/simulators/linalg/cuistl/CuBlockPreconditioner.hpp @@ -0,0 +1,125 @@ +/* + Copyright 2022-2023 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ +#ifndef OPM_CUISTL_CUBLOCKPRECONDITIONER_HPP +#define OPM_CUISTL_CUBLOCKPRECONDITIONER_HPP + +#include +#include +#include +#include +#include + +namespace Opm::cuistl +{ +//! @brief Is an adaptation of Dune::BlockPreconditioner that works within the CuISTL framework. +//! +//! @note We aim to intgrate this into OwningBlockPreconditioner (or a relative thereof). +template > +class CuBlockPreconditioner : public Dune::PreconditionerWithUpdate, public PreconditionerHolder +{ +public: + using domain_type = X; + using range_type = Y; + using field_type = typename X::field_type; + using communication_type = C; + + + //! @brief Constructor. + //! + //! constructor gets all parameters to operate the prec. + //! @param p The sequential preconditioner. + //! @param c The communication object for syncing overlap and copy + //! data points. (E.~g. OwnerOverlapCopyCommunication ) + //! 
+    CuBlockPreconditioner(const std::shared_ptr<P>& p, const std::shared_ptr<const communication_type>& c)
+        : m_preconditioner(p)
+        , m_communication(c)
+    {
+    }
+
+    CuBlockPreconditioner(const std::shared_ptr<P>& p, const communication_type& c)
+        : m_preconditioner(p)
+        , m_communication(Dune::stackobject_to_shared_ptr(c))
+    {
+    }
+
+    //! \brief Prepare the preconditioner.
+    //!
+    //! \copydoc Preconditioner::pre(X&,Y&)
+    virtual void pre(X& x, Y& b) override
+    {
+        // TODO: [perf] Do we always need to copy, or only when we need the pre step?
+        m_communication->copyOwnerToAll(x, x); // make Dirichlet values consistent
+        if constexpr (detail::shouldCallPreconditionerPre<P>()) {
+            m_preconditioner->pre(x, b);
+        }
+    }
+
+    //! \brief Apply the preconditioner
+    //!
+    //! \copydoc Preconditioner::apply(X&,const Y&)
+    virtual void apply(X& v, const Y& d) override
+    {
+        m_preconditioner->apply(v, d);
+        m_communication->copyOwnerToAll(v, v);
+    }
+
+
+    virtual void update() override
+    {
+        m_preconditioner->update();
+    }
+
+    virtual void post(X& x) override
+    {
+        if constexpr (detail::shouldCallPreconditionerPost<P>()) {
+            m_preconditioner->post(x);
+        }
+    }
+
+    //! Category of the preconditioner (see SolverCategory::Category)
+    virtual Dune::SolverCategory::Category category() const override
+    {
+        return Dune::SolverCategory::overlapping;
+    }
+
+    static constexpr bool shouldCallPre()
+    {
+        return detail::shouldCallPreconditionerPre<P>();
+    }
+    static constexpr bool shouldCallPost()
+    {
+        return detail::shouldCallPreconditionerPost<P>();
+    }
+
+    virtual std::shared_ptr<Dune::PreconditionerWithUpdate<X, Y>> getUnderlyingPreconditioner() override
+    {
+        return m_preconditioner;
+    }
+
+
+private:
+    //! \brief a sequential preconditioner
+    std::shared_ptr<P>
m_preconditioner; + + //! \brief the communication object + std::shared_ptr m_communication; +}; +} // namespace Opm::cuistl +#endif diff --git a/opm/simulators/linalg/cuistl/CuOwnerOverlapCopy.hpp b/opm/simulators/linalg/cuistl/CuOwnerOverlapCopy.hpp new file mode 100644 index 000000000..a39b98b5e --- /dev/null +++ b/opm/simulators/linalg/cuistl/CuOwnerOverlapCopy.hpp @@ -0,0 +1,149 @@ +/* + Copyright 2022-2023 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ +#ifndef OPM_CUISTL_CUOWNEROVERLAPCOPY_HPP +#define OPM_CUISTL_CUOWNEROVERLAPCOPY_HPP +#include +#include +#include +#include + +namespace Opm::cuistl +{ +/** + * @brief CUDA compatiable variant of Dune::OwnerOverlapCopyCommunication + * + * This class can essentially be seen as an adapter around Dune::OwnerOverlapCopyCommunication, and should work as + * a Dune::OwnerOverlapCopyCommunication on CuVectors + * + * + * @note This currently only has the functionality to parallelize the linear solve. + * + * @tparam field_type should be a field_type supported by CuVector (double, float) + * @tparam block_size the block size used (this is relevant for say figuring out the correct indices) + * @tparam OwnerOverlapCopyCommunicationType should mimic Dune::OwnerOverlapCopyCommunication. + */ +template +class CuOwnerOverlapCopy +{ +public: + using X = CuVector; + + CuOwnerOverlapCopy(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy) + : m_cpuOwnerOverlapCopy(cpuOwnerOverlapCopy) + { + } + /** + * @brief dot will carry out the dot product between x and y on the owned indices, then sum up the result across MPI + * processes. + * @param[out] output result will be stored here + * + * @note This uses the same interface as its DUNE equivalent. + */ + void dot(const X& x, const X& y, field_type& output) const + { + std::call_once(m_initializedIndices, [&]() { initIndexSet(); }); + + const auto dotAtRank = x.dot(y, *m_indicesOwner); + output = m_cpuOwnerOverlapCopy.communicator().sum(dotAtRank); + } + + /** + * @brief norm computes the l^2-norm of x across processes. + * + * This will compute the dot product of x with itself on owned indices, then + * sum the result across process and return the square root of the sum. 
+ */ + field_type norm(const X& x) const + { + auto xDotX = field_type(0); + this->dot(x, x, xDotX); + + using std::sqrt; + return sqrt(xDotX); + } + + /** + * @brief project will project x to the owned subspace + * + * For each component i which is not owned, x_i will be set to 0 + * + * @param[inout] x the vector to project + */ + void project(X& x) const + { + std::call_once(m_initializedIndices, [&]() { initIndexSet(); }); + x.setZeroAtIndexSet(*m_indicesCopy); + } + + /** + * @brief copyOwnerToAll will copy source to the CPU, then call OwnerOverlapCopyCommunicationType::copyOwnerToAll on + * the copied data, and copy the result back to the GPU + * @param[in] source + * @param[out] dest + */ + void copyOwnerToAll(const X& source, X& dest) const + { + // TODO: [perf] Can we reduce copying from the GPU here? + // TODO: [perf] Maybe create a global buffer instead? + auto sourceAsDuneVector = source.template asDuneBlockVector(); + auto destAsDuneVector = dest.template asDuneBlockVector(); + m_cpuOwnerOverlapCopy.copyOwnerToAll(sourceAsDuneVector, destAsDuneVector); + dest.copyFromHost(destAsDuneVector); + } + + + +private: + const OwnerOverlapCopyCommunicationType& m_cpuOwnerOverlapCopy; + + // Used to call the initIndexSet. Note that this is kind of a + // premature optimization, in the sense that we could just initialize these indices + // always, but they are not always used. + mutable std::once_flag m_initializedIndices; + mutable std::unique_ptr> m_indicesCopy; + mutable std::unique_ptr> m_indicesOwner; + + + void initIndexSet() const + { + // We need indices that we we will use in the project, dot and norm calls. + // TODO: [premature perf] Can this be run once per instance? Or do we need to rebuild every time? + const auto& pis = m_cpuOwnerOverlapCopy.indexSet(); + std::vector indicesCopyOnCPU; + std::vector indicesOwnerCPU; + for (const auto& index : pis) { + if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::copy) { + for (int component = 0; component < block_size; ++component) { + indicesCopyOnCPU.push_back(index.local().local() * block_size + component); + } + } + + if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::owner) { + for (int component = 0; component < block_size; ++component) { + indicesOwnerCPU.push_back(index.local().local() * block_size + component); + } + } + } + + m_indicesCopy = std::make_unique>(indicesCopyOnCPU); + m_indicesOwner = std::make_unique>(indicesOwnerCPU); + } +}; +} // namespace Opm::cuistl +#endif diff --git a/opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp b/opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp index 88e83011d..f4f382538 100644 --- a/opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp +++ b/opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -36,7 +37,9 @@ namespace Opm::cuistl //! \tparam Y the range type (should be on the CPU). Typicall a Dune::BlockVector //! \tparam CudaPreconditionerType the preconditioner taking CuVector as arguments to apply template -class PreconditionerAdapter : public Dune::PreconditionerWithUpdate +class PreconditionerAdapter + : public Dune::PreconditionerWithUpdate, + public PreconditionerHolder, CuVector> { public: //! \brief The domain type of the preconditioner. @@ -114,6 +117,12 @@ public: return detail::shouldCallPreconditionerPre(); } + virtual std::shared_ptr, CuVector>> + getUnderlyingPreconditioner() override + { + return m_underlyingPreconditioner; + } + private: //! 
\brief the underlying preconditioner to use std::shared_ptr m_underlyingPreconditioner; diff --git a/opm/simulators/linalg/cuistl/PreconditionerHolder.hpp b/opm/simulators/linalg/cuistl/PreconditionerHolder.hpp new file mode 100644 index 000000000..bd0732414 --- /dev/null +++ b/opm/simulators/linalg/cuistl/PreconditionerHolder.hpp @@ -0,0 +1,43 @@ +/* + Copyright 2022-2023 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ +#ifndef OPM_CUISTL_PRECONDITIONERHOLDER_HPP +#define OPM_CUISTL_PRECONDITIONERHOLDER_HPP +#include + + +namespace Opm::cuistl +{ +//! \brief Common interface for adapters that hold preconditioners. +//! +//! \note The goal is that this class will be made useless after further +//! restructuring of the solver interfaces (FlexibleSolver and friends), but +//! for this is needed as an intermediate layer. See specifically SolverAdapter.hpp +//! for how this is used. +template +class PreconditionerHolder +{ +public: + /** + * @brief getUnderlyingPreconditioner gets the underlying preconditioner (preconditioner being held) + */ + virtual std::shared_ptr> getUnderlyingPreconditioner() = 0; +}; +} // end namespace Opm::cuistl + +#endif diff --git a/opm/simulators/linalg/cuistl/SolverAdapter.hpp b/opm/simulators/linalg/cuistl/SolverAdapter.hpp new file mode 100644 index 000000000..95589015a --- /dev/null +++ b/opm/simulators/linalg/cuistl/SolverAdapter.hpp @@ -0,0 +1,214 @@ +/* + Copyright 2022-2023 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ +#ifndef OPM_SOLVERADAPTER_HPP +#define OPM_SOLVERADAPTER_HPP + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +namespace Opm::cuistl +{ +//! @brief Wraps a CUDA solver to work with CPU data. +//! +//! @tparam Operator the Dune::LinearOperator to use +//! @tparam UnderlyingSolver a Dune solver like class, eg Dune::BiCGSTABSolver +//! @tparam X the outer type to use (eg. 
Dune::BlockVector>) +template class UnderlyingSolver, class X> +class SolverAdapter : public Dune::IterativeSolver +{ +public: + using typename Dune::IterativeSolver::domain_type; + using typename Dune::IterativeSolver::range_type; + using typename Dune::IterativeSolver::field_type; + using typename Dune::IterativeSolver::real_type; + using typename Dune::IterativeSolver::scalar_real_type; + static constexpr auto block_size = domain_type::block_type::dimension; + using XGPU = Opm::cuistl::CuVector; + + // TODO: Use a std::forward + SolverAdapter(Operator& op, + std::shared_ptr> sp, + std::shared_ptr> prec, + scalar_real_type reduction, + int maxit, + int verbose) + : Dune::IterativeSolver(op, *sp, *prec, reduction, maxit, verbose) + , m_opOnCPUWithMatrix(op) + , m_matrix(CuSparseMatrix::fromMatrix(op.getmat())) + , m_underlyingSolver(constructSolver(prec, reduction, maxit, verbose)) + { + } + + virtual void apply(X& x, X& b, double reduction, Dune::InverseOperatorResult& res) override + { + // TODO: Can we do this without reimplementing the other function? + // TODO: [perf] Do we need to update the matrix every time? Probably yes + m_matrix.updateNonzeroValues(m_opOnCPUWithMatrix.getmat()); + + if (!m_inputBuffer) { + m_inputBuffer.reset(new XGPU(b.dim())); + m_outputBuffer.reset(new XGPU(x.dim())); + } + + m_inputBuffer->copyFromHost(b); + // TODO: [perf] do we need to copy x here? + m_outputBuffer->copyFromHost(x); + + m_underlyingSolver.apply(*m_outputBuffer, *m_inputBuffer, reduction, res); + + // TODO: [perf] do we need to copy b here? + m_inputBuffer->copyToHost(b); + m_outputBuffer->copyToHost(x); + } + virtual void apply(X& x, X& b, Dune::InverseOperatorResult& res) override + { + // TODO: [perf] Do we need to update the matrix every time? Probably yes + m_matrix.updateNonzeroValues(m_opOnCPUWithMatrix.getmat()); + + if (!m_inputBuffer) { + m_inputBuffer.reset(new XGPU(b.dim())); + m_outputBuffer.reset(new XGPU(x.dim())); + } + + m_inputBuffer->copyFromHost(b); + // TODO: [perf] do we need to copy x here? + m_outputBuffer->copyFromHost(x); + + m_underlyingSolver.apply(*m_outputBuffer, *m_inputBuffer, res); + + // TODO: [perf] do we need to copy b here? + m_inputBuffer->copyToHost(b); + m_outputBuffer->copyToHost(x); + } + +private: + Operator& m_opOnCPUWithMatrix; + CuSparseMatrix m_matrix; + + UnderlyingSolver m_underlyingSolver; + + + // TODO: Use a std::forward + UnderlyingSolver constructSolver(std::shared_ptr> prec, + scalar_real_type reduction, + int maxit, + int verbose) + { + + OPM_ERROR_IF( + detail::is_a_well_operator::value, + "Currently we only support operators of type MatrixAdapter in the CUDA solver. " + "Use --matrix-add-well-contributions=true. " + "Using WellModelMatrixAdapter with SolverAdapter is not well-defined so it will not work well, or at all."); + + + + + + if constexpr (detail::has_communication::value) { + // TODO: See the below TODO over the definition of precHolder in the other branch + // TODO: We are currently double wrapping preconditioners in the preconditioner factory to be extra + // compatible + // with CPU. Probably a cleaner way eventually would be to do more modifications to the flexible + // solver to accomodate the pure GPU better. + auto precAsHolder = std::dynamic_pointer_cast>(prec); + if (!precAsHolder) { + OPM_THROW(std::invalid_argument, + "The preconditioner needs to be a CUDA preconditioner (eg. CuILU0) wrapped in a " + "Opm::cuistl::PreconditionerAdapter wrapped in a " + "Opm::cuistl::CuBlockPreconditioner. 
If you are unsure what this means, set " + "preconditioner to 'CUILU0'"); // TODO: Suggest a better preconditioner + } + + auto preconditionerAdapter = precAsHolder->getUnderlyingPreconditioner(); + auto preconditionerAdapterAsHolder + = std::dynamic_pointer_cast>(preconditionerAdapter); + if (!preconditionerAdapterAsHolder) { + OPM_THROW(std::invalid_argument, + "The preconditioner needs to be a CUDA preconditioner (eg. CuILU0) wrapped in a " + "Opm::cuistl::PreconditionerAdapter wrapped in a " + "Opm::cuistl::CuBlockPreconditioner. If you are unsure what this means, set " + "preconditioner to 'CUILU0'"); // TODO: Suggest a better preconditioner + } + // We need to get the underlying preconditioner: + auto preconditionerReallyOnGPU = preconditionerAdapterAsHolder->getUnderlyingPreconditioner(); + const auto& communication = m_opOnCPUWithMatrix.getCommunication(); + + using CudaCommunication = CuOwnerOverlapCopy; + using SchwarzOperator + = Dune::OverlappingSchwarzOperator, XGPU, XGPU, CudaCommunication>; + auto cudaCommunication = std::make_shared(communication); + + auto mpiPreconditioner = std::make_shared>( + preconditionerReallyOnGPU, cudaCommunication); + + auto scalarProduct = std::make_shared>( + cudaCommunication, m_opOnCPUWithMatrix.category()); + + + // NOTE: Ownsership of cudaCommunication is handled by mpiPreconditioner. However, just to make sure we + // remember + // this, we add this check. So remember that we hold one count in this scope, and one in the + // CuBlockPreconditioner instance. We accomedate for the fact that it could be passed around in + // CuBlockPreconditioner, hence we do not test for != 2 + OPM_ERROR_IF(cudaCommunication.use_count() < 2, "Internal error. Shared pointer not owned properly."); + auto overlappingCudaOperator = std::make_shared(m_matrix, *cudaCommunication); + + return UnderlyingSolver( + overlappingCudaOperator, scalarProduct, mpiPreconditioner, reduction, maxit, verbose); + } else { + // TODO: Fix the reliance on casting here. This is a code smell to a certain degree, and assumes + // a certain setup beforehand. The only reason we do it this way is to minimize edits to the + // flexible solver. We could design it differently, but keep this for the time being until + // we figure out how we want to GPU-ify the rest of the system. + auto precAsHolder = std::dynamic_pointer_cast>(prec); + if (!precAsHolder) { + OPM_THROW(std::invalid_argument, + "The preconditioner needs to be a CUDA preconditioner wrapped in a " + "Opm::cuistl::PreconditionerHolder (eg. CuILU0)."); + } + auto preconditionerOnGPU = precAsHolder->getUnderlyingPreconditioner(); + + auto matrixOperator + = std::make_shared, XGPU, XGPU>>(m_matrix); + auto scalarProduct = std::make_shared>(); + return UnderlyingSolver( + matrixOperator, scalarProduct, preconditionerOnGPU, reduction, maxit, verbose); + } + } + + std::unique_ptr m_inputBuffer; + std::unique_ptr m_outputBuffer; +}; +} // namespace Opm::cuistl + +#endif diff --git a/opm/simulators/linalg/cuistl/detail/has_function.hpp b/opm/simulators/linalg/cuistl/detail/has_function.hpp index 2a7b10501..ebc395d78 100644 --- a/opm/simulators/linalg/cuistl/detail/has_function.hpp +++ b/opm/simulators/linalg/cuistl/detail/has_function.hpp @@ -84,5 +84,42 @@ public: static constexpr bool value = std::is_same_v(0)), std::true_type>; }; + +/** + * @brief The has_communication class checks if the type has the member function getCommunication. 
+ * + * This is used in the SolverAdapter to check if the operator has a communication, and it will then select a different + * path accordingly. + */ +template +class has_communication +{ + template + static std::true_type test(decltype(&U::getCommunication)); + template + static std::false_type test(...); + +public: + static constexpr bool value = std::is_same_v(0)), std::true_type>; +}; + +/** + * @brief The is_a_well_operator class tries to guess if the operator is a well type operator + * + * @note This is mainly done in the solver adapter to detect incompatible runtime arguments. When the GPU linear solve + * paths supports wells, this class can be removed. + */ +template +class is_a_well_operator +{ + template + static std::true_type test(decltype(&U::addWellPressureEquations)); + template + static std::false_type test(...); + +public: + static constexpr bool value = std::is_same_v(0)), std::true_type>; +}; + } // namespace Opm::cuistl::detail #endif diff --git a/tests/cuistl/test_cuowneroverlapcopy.cpp b/tests/cuistl/test_cuowneroverlapcopy.cpp new file mode 100644 index 000000000..74e127444 --- /dev/null +++ b/tests/cuistl/test_cuowneroverlapcopy.cpp @@ -0,0 +1,104 @@ +/* + Copyright 2023 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ +#include + +#define BOOST_TEST_MODULE TestCuOwnerOverlapCopy +#define BOOST_TEST_NO_MAIN + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +bool +init_unit_test_func() +{ + return true; +} + +int +main(int argc, char** argv) +{ + [[maybe_unused]] const auto& helper = Dune::MPIHelper::instance(argc, argv); + boost::unit_test::unit_test_main(&init_unit_test_func, argc, argv); +} + +BOOST_AUTO_TEST_CASE(TestProject) +{ + + // We're going to have three points: Centered is owned, left and right is copied. We assume a periodic domain + // ([0,1]/0~1) + auto indexInfo = Dune::IndexInfoFromGrid(); + indexInfo.addLocalIndex(std::make_tuple(0, 0, Dune::OwnerOverlapCopyAttributeSet::copy)); + indexInfo.addLocalIndex(std::make_tuple(1, 1, Dune::OwnerOverlapCopyAttributeSet::owner)); + indexInfo.addLocalIndex(std::make_tuple(2, 2, Dune::OwnerOverlapCopyAttributeSet::copy)); + + auto ownerOverlapCopy = Dune::OwnerOverlapCopyCommunication(indexInfo, MPI_COMM_WORLD); + auto xCPU = std::vector {{1.0, 2.0, 3.0}}; + auto xGPU = Opm::cuistl::CuVector(xCPU); + + auto cuOwnerOverlapCopy + = Opm::cuistl::CuOwnerOverlapCopy>(ownerOverlapCopy); + + cuOwnerOverlapCopy.project(xGPU); + + auto resultOfProject = xGPU.asStdVector(); + + BOOST_CHECK_EQUAL(0.0, resultOfProject[0]); + BOOST_CHECK_EQUAL(2.0, resultOfProject[1]); + BOOST_CHECK_EQUAL(0.0, resultOfProject[2]); +} + + +BOOST_AUTO_TEST_CASE(TestDot) +{ + + // We're going to have three points: Centered is owned, left and right is copied. 
We assume a periodic domain + // ([0,1]/0~1) + auto indexInfo = Dune::IndexInfoFromGrid(); + indexInfo.addLocalIndex(std::make_tuple(0, 0, Dune::OwnerOverlapCopyAttributeSet::copy)); + indexInfo.addLocalIndex(std::make_tuple(1, 1, Dune::OwnerOverlapCopyAttributeSet::owner)); + indexInfo.addLocalIndex(std::make_tuple(2, 2, Dune::OwnerOverlapCopyAttributeSet::copy)); + + indexInfo.addRemoteIndex(std::make_tuple(0, 0, Dune::OwnerOverlapCopyAttributeSet::copy)); + indexInfo.addRemoteIndex(std::make_tuple(0, 1, Dune::OwnerOverlapCopyAttributeSet::owner)); + indexInfo.addRemoteIndex(std::make_tuple(0, 2, Dune::OwnerOverlapCopyAttributeSet::copy)); + auto ownerOverlapCopy = Dune::OwnerOverlapCopyCommunication(indexInfo, MPI_COMM_WORLD); + auto xCPU = std::vector {{1.0, 2.0, 3.0}}; + auto xGPU = Opm::cuistl::CuVector(xCPU); + + auto cuOwnerOverlapCopy + = Opm::cuistl::CuOwnerOverlapCopy>(ownerOverlapCopy); + + double outputDune = -1.0; + auto xDune = xGPU.asDuneBlockVector<1>(); + ownerOverlapCopy.dot(xDune, xDune, outputDune); + + double output = -1.0; + cuOwnerOverlapCopy.dot(xGPU, xGPU, output); + + + BOOST_CHECK_EQUAL(outputDune, output); + BOOST_CHECK_EQUAL(4.0, output); +} diff --git a/tests/cuistl/test_solver_adapter.cpp b/tests/cuistl/test_solver_adapter.cpp new file mode 100644 index 000000000..46661d891 --- /dev/null +++ b/tests/cuistl/test_solver_adapter.cpp @@ -0,0 +1,117 @@ +/* + Copyright 2022-2023 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ +#include + +#define BOOST_TEST_MODULE TestSolverAdapter + +#include +#include +#include +#include +#include + +static const constexpr int dim = 3; +using Matrix = Dune::BCRSMatrix>; +using Vector = Dune::BlockVector>; +using Moperator = Dune::MatrixAdapter; +using PrecondFactory = Opm::PreconditionerFactory; +using SolverAdapter = Opm::cuistl::SolverAdapter; + +namespace +{ +auto +createSolverAdapterWithMatrix(const size_t N = 10) +{ + + + const int nonZeroes = N * 3 - 2; + + // We need to hold the matrix in memory somehow, but we don't want to deference + // a pointer all the time (quality of life...): + auto matrixPtr = std::make_shared(N, N, nonZeroes, Matrix::row_wise); + auto& matrix = *matrixPtr; + for (auto row = matrix.createbegin(); row != matrix.createend(); ++row) { + // Add nonzeros for left neighbour, diagonal and right neighbour + if (row.index() > 0) { + row.insert(row.index() - 1); + } + row.insert(row.index()); + if (row.index() < matrix.N() - 1) { + row.insert(row.index() + 1); + } + } + // This might not be the most elegant way of filling in a Dune sparse matrix, but it works. 
+ for (size_t i = 0; i < N; ++i) { + for (int k = 0; k < dim; ++k) { + matrix[i][i][k][k] = -2; + } + if (i < N - 1) { + for (int k = 0; k < dim; ++k) { + matrix[i][i + 1][k][k] = 1; + } + } + + if (i > 0) { + for (int k = 0; k < dim; ++k) { + matrix[i][i - 1][k][k] = 1; + } + } + } + auto op = std::make_shared(matrix); + auto sp = std::make_shared>(); + auto prm = Opm::PropertyTree(); + prm.put("relaxation", 1.0); + prm.put("type", "CUILU0"); + auto prec = PrecondFactory::create(*op, prm); + auto solverAdapter = std::make_shared(*op, sp, prec, 1.0, 10, 0); + + return std::make_tuple(matrixPtr, solverAdapter, op); +} +} // namespace + +BOOST_AUTO_TEST_CASE(TestCreation) +{ + BOOST_CHECK_NO_THROW(createSolverAdapterWithMatrix();); +} + +BOOST_AUTO_TEST_CASE(TestSolve) +{ + const size_t N = 10; + auto [matrix, solverAdapter, op] = createSolverAdapterWithMatrix(N); + + Vector xActual(N), xInitial(N), b(N); + for (size_t i = 0; i < N; ++i) { + for (size_t j = 0; j < dim; ++j) { + xActual[i][j] = 1.0; + xInitial[i][j] = 0.1 * i; + } + } + + matrix->mv(xActual, b); + Dune::InverseOperatorResult res; + solverAdapter->apply(xInitial, b, res); + + for (size_t i = 0; i < N; ++i) { + for (size_t j = 0; j < dim; ++j) { + // This should actually be up to rounding exact since ILU is just the inverse + // for this matrix. + BOOST_CHECK_CLOSE(xActual[i][j], xInitial[i][j], 1e-13); + } + } +}
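
Usage note (illustrative, not part of the diff): the new HAVE_CUDA branch in FlexibleSolver_impl.hpp is taken when the linear-solver property tree requests the "cubicgstab" solver, and the error messages in SolverAdapter.hpp suggest pairing it with the CUILU0 preconditioner. The sketch below shows such a configuration; apart from "cubicgstab" and "CUILU0", which appear in this patch, the key names ("solver", "tol", "maxiter", "verbosity", "preconditioner.*") are assumed to follow the existing FlexibleSolver conventions and may need adjusting.

#include <opm/simulators/linalg/PropertyTree.hpp>

#include <string>

// Sketch of a property tree that should route FlexibleSolver to the new GPU path.
// Key names other than the "cubicgstab"/"CUILU0" values are assumptions.
Opm::PropertyTree makeGpuSolverConfig()
{
    Opm::PropertyTree prm;
    prm.put("solver", std::string("cubicgstab"));           // handled by the new HAVE_CUDA branch
    prm.put("tol", 1e-2);                                    // desired residual reduction factor
    prm.put("maxiter", 200);
    prm.put("verbosity", 0);
    prm.put("preconditioner.type", std::string("CUILU0"));   // CUDA ILU0 behind PreconditionerAdapter
    prm.put("preconditioner.relaxation", 1.0);
    return prm;
}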
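
Similarly, a minimal sketch of using the new CuOwnerOverlapCopy wrapper on its own, mirroring the single-rank setup from tests/cuistl/test_cuowneroverlapcopy.cpp (one owned point flanked by two copy points). MPI is assumed to be initialized already (e.g. through Dune::MPIHelper, as in the test's main), and the int/int Dune index types are illustrative.

#include <mpi.h>

#include <dune/istl/owneroverlapcopy.hh>
#include <opm/simulators/linalg/cuistl/CuOwnerOverlapCopy.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>

#include <tuple>
#include <vector>

// Returns the process-consistent l^2 norm of a small vector. Only the owned
// index (the middle one) contributes, so on a single rank this yields 2.0.
double computeConsistentNorm()
{
    using Communication = Dune::OwnerOverlapCopyCommunication<int, int>;

    // One owned point (local index 1) and two copied points (0 and 2).
    Dune::IndexInfoFromGrid<int, int> indexInfo;
    indexInfo.addLocalIndex(std::make_tuple(0, 0, Dune::OwnerOverlapCopyAttributeSet::copy));
    indexInfo.addLocalIndex(std::make_tuple(1, 1, Dune::OwnerOverlapCopyAttributeSet::owner));
    indexInfo.addLocalIndex(std::make_tuple(2, 2, Dune::OwnerOverlapCopyAttributeSet::copy));
    Communication cpuComm(indexInfo, MPI_COMM_WORLD);

    // Move the data to the GPU and wrap the CPU communication object.
    std::vector<double> xCPU {1.0, 2.0, 3.0};
    Opm::cuistl::CuVector<double> xGPU(xCPU);
    Opm::cuistl::CuOwnerOverlapCopy<double, 1, Communication> gpuComm(cpuComm);

    return gpuComm.norm(xGPU);
}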