Merge pull request #4636 from kjetilly/cuistl_multigpu

Path to multigpu: Cuistl multigpu
2025-02-25 18:55:30 -06:00 · 2023-06-01 07:23:39 +02:00 · 2023-06-01 07:23:39 +02:00 · ee6edd83a0
commit ee6edd83a0
parent 4faa5e1fe7 e35318b6bb
12 changed files with 839 additions and 4 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -552,6 +552,8 @@ if(CUDA_FOUND)
                       cuvector
                       cusparsematrix
                       cuseqilu0
                       cuowneroverlapcopy
                       solver_adapter
                       PROPERTIES LABELS gpu_cuda)
 endif()
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@ -175,7 +175,10 @@ if(CUDA_FOUND)
  list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/CuSeqILU0.hpp)
  list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/fix_zero_diagonal.hpp)
  list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/PreconditionerConvertFieldTypeAdapter.hpp)
-
+  list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/CuOwnerOverlapCopy.hpp)
  list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/SolverAdapter.hpp)
  list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/CuBlockPreconditioner.hpp)
  list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/PreconditionerHolder.hpp)
 endif()
 if(OPENCL_FOUND)
@ -265,6 +268,9 @@ if(CUDA_FOUND)
  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_safe_conversion.cpp)
  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cuseqilu0.cpp)
  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_converttofloatadapter.cpp)
  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cuowneroverlapcopy.cpp)
  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_solver_adapter.cpp)
 endif()
 if(OPENCL_FOUND)
--- a/opm/simulators/linalg/FlexibleSolver_impl.hpp
+++ b/opm/simulators/linalg/FlexibleSolver_impl.hpp
@ -37,6 +37,10 @@
 #include <dune/istl/owneroverlapcopy.hh>
 #include <dune/istl/paamg/pinfo.hh>
 #if HAVE_CUDA
 #include <opm/simulators/linalg/cuistl/SolverAdapter.hpp>
 #endif
 namespace Dune
 {
    /// Create a sequential solver.
@ -167,7 +171,7 @@ namespace Dune
                                                                                  *scalarproduct_,
                                                                                  *preconditioner_,
                                                                                  tol,// desired residual reduction factor
-                                                                                  restart, 
+                                                                                  restart,
                                                                                  maxiter, // maximum number of iterations
                                                                                  verbosity);
        } else if (solver_type == "flexgmres") {
@ -176,7 +180,7 @@ namespace Dune
                                                                                          *scalarproduct_,
                                                                                          *preconditioner_,
                                                                                          tol,// desired residual reduction factor
-                                                                                          restart, 
+                                                                                          restart,
                                                                                          maxiter, // maximum number of iterations
                                                                                          verbosity);
 #if HAVE_SUITESPARSE_UMFPACK
@ -184,6 +188,16 @@ namespace Dune
            bool dummy = false;
            using MatrixType = std::remove_const_t<std::remove_reference_t<decltype(linearoperator_for_solver_->getmat())>>;
            linsolver_ = std::make_shared<Dune::UMFPack<MatrixType>>(linearoperator_for_solver_->getmat(), verbosity, dummy);
 #endif
 #if HAVE_CUDA
        } else if (solver_type == "cubicgstab") {
            linsolver_.reset(new Opm::cuistl::SolverAdapter<Operator, Dune::BiCGSTABSolver, VectorType>(
                *linearoperator_for_solver_,
                *scalarproduct_,
                preconditioner_,
                tol, // desired residual reduction factor
                maxiter, // maximum number of iterations
                verbosity));
 #endif
        } else {
            OPM_THROW(std::invalid_argument,
--- a/opm/simulators/linalg/PreconditionerFactory_impl.hpp
+++ b/opm/simulators/linalg/PreconditionerFactory_impl.hpp
@ -45,6 +45,8 @@
 #include <opm/simulators/linalg/cuistl/CuSeqILU0.hpp>
 #include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
 #include <opm/simulators/linalg/cuistl/PreconditionerConvertFieldTypeAdapter.hpp>
 #include <opm/simulators/linalg/cuistl/CuBlockPreconditioner.hpp>
 #endif
@ -224,6 +226,19 @@ struct StandardPreconditioners
            return std::make_shared<OwningTwoLevelPreconditioner<O, V, LevelTransferPolicy, Comm>>(op, prm, weightsCalculator, pressureIndex, comm);
          });
        }
 #if HAVE_CUDA
        F::addCreator("CUILU0", [](const O& op, const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
            const double w = prm.get<double>("relaxation", 1.0);
            using field_type = typename V::field_type;
            using CuILU0 = typename Opm::cuistl::CuSeqILU0<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
            auto cuILU0 = std::make_shared<CuILU0>(op.getmat(), w);
            auto adapted = std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CuILU0>>(cuILU0);
            auto wrapped = std::make_shared<Opm::cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
            return wrapped;
        });
 #endif
    }
--- a/opm/simulators/linalg/cuistl/CuBlockPreconditioner.hpp
+++ b/opm/simulators/linalg/cuistl/CuBlockPreconditioner.hpp
@ -0,0 +1,125 @@
 /*
  Copyright 2022-2023 SINTEF AS
  This file is part of the Open Porous Media project (OPM).
  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
 #ifndef OPM_CUISTL_CUBLOCKPRECONDITIONER_HPP
 #define OPM_CUISTL_CUBLOCKPRECONDITIONER_HPP
 #include <dune/common/shared_ptr.hh>
 #include <memory>
 #include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
 #include <opm/simulators/linalg/cuistl/PreconditionerHolder.hpp>
 #include <opm/simulators/linalg/cuistl/detail/preconditioner_should_call_post_pre.hpp>
 namespace Opm::cuistl
 {
 //! @brief Is an adaptation of Dune::BlockPreconditioner that works within the CuISTL framework.
 //!
 //! @note We aim to intgrate this into OwningBlockPreconditioner (or a relative thereof).
 template <class X, class Y, class C, class P = Dune::PreconditionerWithUpdate<X, Y>>
 class CuBlockPreconditioner : public Dune::PreconditionerWithUpdate<X, Y>, public PreconditionerHolder<X, Y>
 {
 public:
    using domain_type = X;
    using range_type = Y;
    using field_type = typename X::field_type;
    using communication_type = C;
    //! @brief Constructor.
    //!
    //! constructor gets all parameters to operate the prec.
    //! @param p The sequential preconditioner.
    //! @param c The communication object for syncing overlap and copy
    //! data points. (E.~g. OwnerOverlapCopyCommunication )
    //!
    CuBlockPreconditioner(const std::shared_ptr<P>& p, const std::shared_ptr<const communication_type>& c)
        : m_preconditioner(p)
        , m_communication(c)
    {
    }
    CuBlockPreconditioner(const std::shared_ptr<P>& p, const communication_type& c)
        : m_preconditioner(p)
        , m_communication(Dune::stackobject_to_shared_ptr(c))
    {
    }
    //! \brief Prepare the preconditioner.
    //!
    //! \copydoc Preconditioner::pre(X&,Y&)
    virtual void pre(X& x, Y& b) override
    {
        // TODO: [perf] Do we always need to copy, or only when we need the pre step?
        m_communication->copyOwnerToAll(x, x); // make Dirichlet values consistent
        if constexpr (detail::shouldCallPreconditionerPre<P>()) {
            m_preconditioner->pre(x, b);
        }
    }
    //! \brief Apply the preconditioner
    //!
    //! \copydoc Preconditioner::apply(X&,const Y&)
    virtual void apply(X& v, const Y& d) override
    {
        m_preconditioner->apply(v, d);
        m_communication->copyOwnerToAll(v, v);
    }
    virtual void update() override
    {
        m_preconditioner->update();
    }
    virtual void post(X& x) override
    {
        if constexpr (detail::shouldCallPreconditionerPost<P>()) {
            m_preconditioner->post(x);
        }
    }
    //! Category of the preconditioner (see SolverCategory::Category)
    virtual Dune::SolverCategory::Category category() const override
    {
        return Dune::SolverCategory::overlapping;
    }
    static constexpr bool shouldCallPre()
    {
        return detail::shouldCallPreconditionerPost<P>();
    }
    static constexpr bool shouldCallPost()
    {
        return detail::shouldCallPreconditionerPre<P>();
    }
    virtual std::shared_ptr<Dune::PreconditionerWithUpdate<X, Y>> getUnderlyingPreconditioner() override
    {
        return m_preconditioner;
    }
 private:
    //! \brief a sequential preconditioner
    std::shared_ptr<P> m_preconditioner;
    //! \brief the communication object
    std::shared_ptr<const communication_type> m_communication;
 };
 } // namespace Opm::cuistl
 #endif
--- a/opm/simulators/linalg/cuistl/CuOwnerOverlapCopy.hpp
+++ b/opm/simulators/linalg/cuistl/CuOwnerOverlapCopy.hpp
@ -0,0 +1,149 @@
 /*
  Copyright 2022-2023 SINTEF AS
  This file is part of the Open Porous Media project (OPM).
  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
 #ifndef OPM_CUISTL_CUOWNEROVERLAPCOPY_HPP
 #define OPM_CUISTL_CUOWNEROVERLAPCOPY_HPP
 #include <dune/istl/owneroverlapcopy.hh>
 #include <memory>
 #include <mutex>
 #include <opm/simulators/linalg/cuistl/CuVector.hpp>
 namespace Opm::cuistl
 {
 /**
 * @brief CUDA compatiable variant of Dune::OwnerOverlapCopyCommunication
 *
 * This class can essentially be seen as an adapter around Dune::OwnerOverlapCopyCommunication, and should work as
 * a Dune::OwnerOverlapCopyCommunication on CuVectors
 *
 *
 * @note This currently only has the functionality to parallelize the linear solve.
 *
 * @tparam field_type should be a field_type supported by CuVector (double, float)
 * @tparam block_size the block size used (this is relevant for say figuring out the correct indices)
 * @tparam OwnerOverlapCopyCommunicationType should mimic Dune::OwnerOverlapCopyCommunication.
 */
 template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
 class CuOwnerOverlapCopy
 {
 public:
    using X = CuVector<field_type>;
    CuOwnerOverlapCopy(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy)
        : m_cpuOwnerOverlapCopy(cpuOwnerOverlapCopy)
    {
    }
    /**
     * @brief dot will carry out the dot product between x and y on the owned indices, then sum up the result across MPI
     * processes.
     * @param[out] output result will be stored here
     *
     * @note This uses the same interface as its DUNE equivalent.
     */
    void dot(const X& x, const X& y, field_type& output) const
    {
        std::call_once(m_initializedIndices, [&]() { initIndexSet(); });
        const auto dotAtRank = x.dot(y, *m_indicesOwner);
        output = m_cpuOwnerOverlapCopy.communicator().sum(dotAtRank);
    }
    /**
     * @brief norm computes the l^2-norm of x across processes.
     *
     * This will compute the dot product of x with itself on owned indices, then
     * sum the result across process and return the square root of the sum.
     */
    field_type norm(const X& x) const
    {
        auto xDotX = field_type(0);
        this->dot(x, x, xDotX);
        using std::sqrt;
        return sqrt(xDotX);
    }
    /**
     * @brief project will project x to the owned subspace
     *
     * For each component i which is not owned, x_i will be set to 0
     *
     * @param[inout] x the vector to project
     */
    void project(X& x) const
    {
        std::call_once(m_initializedIndices, [&]() { initIndexSet(); });
        x.setZeroAtIndexSet(*m_indicesCopy);
    }
    /**
     * @brief copyOwnerToAll will copy source to the CPU, then call OwnerOverlapCopyCommunicationType::copyOwnerToAll on
     * the copied data, and copy the result back to the GPU
     * @param[in] source
     * @param[out] dest
     */
    void copyOwnerToAll(const X& source, X& dest) const
    {
        // TODO: [perf] Can we reduce copying from the GPU here?
        // TODO: [perf] Maybe create a global buffer instead?
        auto sourceAsDuneVector = source.template asDuneBlockVector<block_size>();
        auto destAsDuneVector = dest.template asDuneBlockVector<block_size>();
        m_cpuOwnerOverlapCopy.copyOwnerToAll(sourceAsDuneVector, destAsDuneVector);
        dest.copyFromHost(destAsDuneVector);
    }
 private:
    const OwnerOverlapCopyCommunicationType& m_cpuOwnerOverlapCopy;
    // Used to call the initIndexSet. Note that this is kind of a
    // premature optimization, in the sense that we could just initialize these indices
    // always, but they are not always used.
    mutable std::once_flag m_initializedIndices;
    mutable std::unique_ptr<CuVector<int>> m_indicesCopy;
    mutable std::unique_ptr<CuVector<int>> m_indicesOwner;
    void initIndexSet() const
    {
        // We need indices that we we will use in the project, dot and norm calls.
        // TODO: [premature perf] Can this be run once per instance? Or do we need to rebuild every time?
        const auto& pis = m_cpuOwnerOverlapCopy.indexSet();
        std::vector<int> indicesCopyOnCPU;
        std::vector<int> indicesOwnerCPU;
        for (const auto& index : pis) {
            if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::copy) {
                for (int component = 0; component < block_size; ++component) {
                    indicesCopyOnCPU.push_back(index.local().local() * block_size + component);
                }
            }
            if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::owner) {
                for (int component = 0; component < block_size; ++component) {
                    indicesOwnerCPU.push_back(index.local().local() * block_size + component);
                }
            }
        }
        m_indicesCopy = std::make_unique<CuVector<int>>(indicesCopyOnCPU);
        m_indicesOwner = std::make_unique<CuVector<int>>(indicesOwnerCPU);
    }
 };
 } // namespace Opm::cuistl
 #endif
--- a/opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp
+++ b/opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp
@ -22,6 +22,7 @@
 #include <dune/istl/preconditioner.hh>
 #include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
 #include <opm/simulators/linalg/cuistl/CuVector.hpp>
 #include <opm/simulators/linalg/cuistl/PreconditionerHolder.hpp>
 #include <opm/simulators/linalg/cuistl/detail/preconditioner_should_call_post_pre.hpp>
@ -36,7 +37,9 @@ namespace Opm::cuistl
 //! \tparam Y the range type (should be on the CPU). Typicall a Dune::BlockVector
 //! \tparam CudaPreconditionerType the preconditioner taking CuVector<real_type> as arguments to apply
 template <class X, class Y, class CudaPreconditionerType>
-class PreconditionerAdapter : public Dune::PreconditionerWithUpdate<X, Y>
+class PreconditionerAdapter
    : public Dune::PreconditionerWithUpdate<X, Y>,
      public PreconditionerHolder<CuVector<typename X::field_type>, CuVector<typename Y::field_type>>
 {
 public:
    //! \brief The domain type of the preconditioner.
@ -114,6 +117,12 @@ public:
        return detail::shouldCallPreconditionerPre<CudaPreconditionerType>();
    }
    virtual std::shared_ptr<Dune::PreconditionerWithUpdate<CuVector<field_type>, CuVector<field_type>>>
    getUnderlyingPreconditioner() override
    {
        return m_underlyingPreconditioner;
    }
 private:
    //! \brief the underlying preconditioner to use
    std::shared_ptr<CudaPreconditionerType> m_underlyingPreconditioner;
--- a/opm/simulators/linalg/cuistl/PreconditionerHolder.hpp
+++ b/opm/simulators/linalg/cuistl/PreconditionerHolder.hpp
@ -0,0 +1,43 @@
 /*
  Copyright 2022-2023 SINTEF AS
  This file is part of the Open Porous Media project (OPM).
  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
 #ifndef OPM_CUISTL_PRECONDITIONERHOLDER_HPP
 #define OPM_CUISTL_PRECONDITIONERHOLDER_HPP
 #include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
 namespace Opm::cuistl
 {
 //! \brief Common interface for adapters that hold preconditioners.
 //!
 //! \note The goal is that this class will be made useless after further
 //! restructuring of the solver interfaces (FlexibleSolver and friends), but
 //! for this is needed as an intermediate layer. See specifically SolverAdapter.hpp
 //! for how this is used.
 template <class X, class Y>
 class PreconditionerHolder
 {
 public:
    /**
     * @brief getUnderlyingPreconditioner gets the underlying preconditioner (preconditioner being held)
     */
    virtual std::shared_ptr<Dune::PreconditionerWithUpdate<X, Y>> getUnderlyingPreconditioner() = 0;
 };
 } // end namespace Opm::cuistl
 #endif
--- a/opm/simulators/linalg/cuistl/SolverAdapter.hpp
+++ b/opm/simulators/linalg/cuistl/SolverAdapter.hpp
@ -0,0 +1,214 @@
 /*
  Copyright 2022-2023 SINTEF AS
  This file is part of the Open Porous Media project (OPM).
  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
 #ifndef OPM_SOLVERADAPTER_HPP
 #define OPM_SOLVERADAPTER_HPP
 #include <memory>
 #include <dune/istl/bcrsmatrix.hh>
 #include <dune/istl/operators.hh>
 #include <dune/istl/schwarz.hh>
 #include <dune/istl/solver.hh>
 #include <opm/common/ErrorMacros.hpp>
 #include <opm/simulators/linalg/cuistl/CuBlockPreconditioner.hpp>
 #include <opm/simulators/linalg/cuistl/CuOwnerOverlapCopy.hpp>
 #include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
 #include <opm/simulators/linalg/cuistl/CuVector.hpp>
 #include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
 #include <opm/simulators/linalg/cuistl/detail/has_function.hpp>
 namespace Opm::cuistl
 {
 //! @brief Wraps a CUDA solver to work with CPU data.
 //!
 //! @tparam Operator the Dune::LinearOperator to use
 //! @tparam UnderlyingSolver a Dune solver like class, eg Dune::BiCGSTABSolver
 //! @tparam X the outer type to use (eg. Dune::BlockVector<Dune::FieldVector<...>>)
 template <class Operator, template <class> class UnderlyingSolver, class X>
 class SolverAdapter : public Dune::IterativeSolver<X, X>
 {
 public:
    using typename Dune::IterativeSolver<X, X>::domain_type;
    using typename Dune::IterativeSolver<X, X>::range_type;
    using typename Dune::IterativeSolver<X, X>::field_type;
    using typename Dune::IterativeSolver<X, X>::real_type;
    using typename Dune::IterativeSolver<X, X>::scalar_real_type;
    static constexpr auto block_size = domain_type::block_type::dimension;
    using XGPU = Opm::cuistl::CuVector<real_type>;
    // TODO: Use a std::forward
    SolverAdapter(Operator& op,
                  Dune::ScalarProduct<X>& sp,
                  std::shared_ptr<Dune::Preconditioner<X, X>> prec,
                  scalar_real_type reduction,
                  int maxit,
                  int verbose)
        : Dune::IterativeSolver<X, X>(op, sp, *prec, reduction, maxit, verbose)
        , m_opOnCPUWithMatrix(op)
        , m_matrix(CuSparseMatrix<real_type>::fromMatrix(op.getmat()))
        , m_underlyingSolver(constructSolver(prec, reduction, maxit, verbose))
    {
    }
    virtual void apply(X& x, X& b, double reduction, Dune::InverseOperatorResult& res) override
    {
        // TODO: Can we do this without reimplementing the other function?
        // TODO: [perf] Do we need to update the matrix every time? Probably yes
        m_matrix.updateNonzeroValues(m_opOnCPUWithMatrix.getmat());
        if (!m_inputBuffer) {
            m_inputBuffer.reset(new XGPU(b.dim()));
            m_outputBuffer.reset(new XGPU(x.dim()));
        }
        m_inputBuffer->copyFromHost(b);
        // TODO: [perf] do we need to copy x here?
        m_outputBuffer->copyFromHost(x);
        m_underlyingSolver.apply(*m_outputBuffer, *m_inputBuffer, reduction, res);
        // TODO: [perf] do we need to copy b here?
        m_inputBuffer->copyToHost(b);
        m_outputBuffer->copyToHost(x);
    }
    virtual void apply(X& x, X& b, Dune::InverseOperatorResult& res) override
    {
        // TODO: [perf] Do we need to update the matrix every time? Probably yes
        m_matrix.updateNonzeroValues(m_opOnCPUWithMatrix.getmat());
        if (!m_inputBuffer) {
            m_inputBuffer.reset(new XGPU(b.dim()));
            m_outputBuffer.reset(new XGPU(x.dim()));
        }
        m_inputBuffer->copyFromHost(b);
        // TODO: [perf] do we need to copy x here?
        m_outputBuffer->copyFromHost(x);
        m_underlyingSolver.apply(*m_outputBuffer, *m_inputBuffer, res);
        // TODO: [perf] do we need to copy b here?
        m_inputBuffer->copyToHost(b);
        m_outputBuffer->copyToHost(x);
    }
 private:
    Operator& m_opOnCPUWithMatrix;
    CuSparseMatrix<real_type> m_matrix;
    UnderlyingSolver<XGPU> m_underlyingSolver;
    // TODO: Use a std::forward
    UnderlyingSolver<XGPU> constructSolver(std::shared_ptr<Dune::Preconditioner<X, X>> prec,
                                           scalar_real_type reduction,
                                           int maxit,
                                           int verbose)
    {
        OPM_ERROR_IF(
            detail::is_a_well_operator<Operator>::value,
            "Currently we only support operators of type MatrixAdapter in the CUDA solver. "
            "Use --matrix-add-well-contributions=true. "
            "Using WellModelMatrixAdapter with SolverAdapter is not well-defined so it will not work well, or at all.");
        if constexpr (detail::has_communication<Operator>::value) {
            // TODO: See the below TODO over the definition of precHolder in the other branch
            // TODO: We are currently double wrapping preconditioners in the preconditioner factory to be extra
            // compatible
            //       with CPU. Probably a cleaner way eventually would be to do more modifications to the flexible
            //       solver to accomodate the pure GPU better.
            auto precAsHolder = std::dynamic_pointer_cast<PreconditionerHolder<X, X>>(prec);
            if (!precAsHolder) {
                OPM_THROW(std::invalid_argument,
                          "The preconditioner needs to be a CUDA preconditioner (eg. CuILU0) wrapped in a "
                          "Opm::cuistl::PreconditionerAdapter wrapped in a "
                          "Opm::cuistl::CuBlockPreconditioner. If you are unsure what this means, set "
                          "preconditioner to 'CUILU0'"); // TODO: Suggest a better preconditioner
            }
            auto preconditionerAdapter = precAsHolder->getUnderlyingPreconditioner();
            auto preconditionerAdapterAsHolder
                = std::dynamic_pointer_cast<PreconditionerHolder<XGPU, XGPU>>(preconditionerAdapter);
            if (!preconditionerAdapterAsHolder) {
                OPM_THROW(std::invalid_argument,
                          "The preconditioner needs to be a CUDA preconditioner (eg. CuILU0) wrapped in a "
                          "Opm::cuistl::PreconditionerAdapter wrapped in a "
                          "Opm::cuistl::CuBlockPreconditioner. If you are unsure what this means, set "
                          "preconditioner to 'CUILU0'"); // TODO: Suggest a better preconditioner
            }
            // We need to get the underlying preconditioner:
            auto preconditionerReallyOnGPU = preconditionerAdapterAsHolder->getUnderlyingPreconditioner();
            const auto& communication = m_opOnCPUWithMatrix.getCommunication();
            using CudaCommunication = CuOwnerOverlapCopy<real_type, block_size, typename Operator::communication_type>;
            using SchwarzOperator
                = Dune::OverlappingSchwarzOperator<CuSparseMatrix<real_type>, XGPU, XGPU, CudaCommunication>;
            auto cudaCommunication = std::make_shared<CudaCommunication>(communication);
            auto mpiPreconditioner = std::make_shared<CuBlockPreconditioner<XGPU, XGPU, CudaCommunication>>(
                preconditionerReallyOnGPU, cudaCommunication);
            auto scalarProduct = std::make_shared<Dune::ParallelScalarProduct<XGPU, CudaCommunication>>(
                cudaCommunication, m_opOnCPUWithMatrix.category());
            // NOTE: Ownsership of cudaCommunication is handled by mpiPreconditioner. However, just to make sure we
            // remember
            //       this, we add this check. So remember that we hold one count in this scope, and one in the
            //       CuBlockPreconditioner instance. We accomedate for the fact that it could be passed around in
            //       CuBlockPreconditioner, hence we do not test for != 2
            OPM_ERROR_IF(cudaCommunication.use_count() < 2, "Internal error. Shared pointer not owned properly.");
            auto overlappingCudaOperator = std::make_shared<SchwarzOperator>(m_matrix, *cudaCommunication);
            return UnderlyingSolver<XGPU>(
                overlappingCudaOperator, scalarProduct, mpiPreconditioner, reduction, maxit, verbose);
        } else {
            // TODO: Fix the reliance on casting here. This is a code smell to a certain degree, and assumes
            //       a certain setup beforehand. The only reason we do it this way is to minimize edits to the
            //       flexible solver. We could design it differently, but keep this for the time being until
            //       we figure out how we want to GPU-ify the rest of the system.
            auto precAsHolder = std::dynamic_pointer_cast<PreconditionerHolder<XGPU, XGPU>>(prec);
            if (!precAsHolder) {
                OPM_THROW(std::invalid_argument,
                          "The preconditioner needs to be a CUDA preconditioner wrapped in a "
                          "Opm::cuistl::PreconditionerHolder (eg. CuILU0).");
            }
            auto preconditionerOnGPU = precAsHolder->getUnderlyingPreconditioner();
            auto matrixOperator
                = std::make_shared<Dune::MatrixAdapter<CuSparseMatrix<real_type>, XGPU, XGPU>>(m_matrix);
            auto scalarProduct = std::make_shared<Dune::SeqScalarProduct<XGPU>>();
            return UnderlyingSolver<XGPU>(
                matrixOperator, scalarProduct, preconditionerOnGPU, reduction, maxit, verbose);
        }
    }
    std::unique_ptr<XGPU> m_inputBuffer;
    std::unique_ptr<XGPU> m_outputBuffer;
 };
 } // namespace Opm::cuistl
 #endif
--- a/opm/simulators/linalg/cuistl/detail/has_function.hpp
+++ b/opm/simulators/linalg/cuistl/detail/has_function.hpp
@ -84,5 +84,42 @@ public:
    static constexpr bool value = std::is_same_v<decltype(test<T>(0)), std::true_type>;
 };
 /**
 * @brief The has_communication class checks if the type has the member function getCommunication.
 *
 * This is used in the SolverAdapter to check if the operator has a communication, and it will then select a different
 * path accordingly.
 */
 template <typename T>
 class has_communication
 {
    template <typename U>
    static std::true_type test(decltype(&U::getCommunication));
    template <typename U>
    static std::false_type test(...);
 public:
    static constexpr bool value = std::is_same_v<decltype(test<T>(0)), std::true_type>;
 };
 /**
 * @brief The is_a_well_operator class tries to guess if the operator is a well type operator
 *
 * @note This is mainly done in the solver adapter to detect incompatible runtime arguments. When the GPU linear solve
 * paths supports wells, this class can be removed.
 */
 template <typename T>
 class is_a_well_operator
 {
    template <typename U>
    static std::true_type test(decltype(&U::addWellPressureEquations));
    template <typename U>
    static std::false_type test(...);
 public:
    static constexpr bool value = std::is_same_v<decltype(test<T>(0)), std::true_type>;
 };
 } // namespace Opm::cuistl::detail
 #endif
--- a/tests/cuistl/test_cuowneroverlapcopy.cpp
+++ b/tests/cuistl/test_cuowneroverlapcopy.cpp
@ -0,0 +1,104 @@
 /*
  Copyright 2023 SINTEF AS
  This file is part of the Open Porous Media project (OPM).
  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include <config.h>
 #define BOOST_TEST_MODULE TestCuOwnerOverlapCopy
 #define BOOST_TEST_NO_MAIN
 #include <boost/test/unit_test.hpp>
 #include <dune/common/parallel/mpihelper.hh>
 #include <dune/istl/bcrsmatrix.hh>
 #include <dune/istl/owneroverlapcopy.hh>
 #include <memory>
 #include <opm/simulators/linalg/cuistl/CuOwnerOverlapCopy.hpp>
 #include <opm/simulators/linalg/cuistl/CuVector.hpp>
 #include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
 #include <random>
 bool
 init_unit_test_func()
 {
    return true;
 }
 int
 main(int argc, char** argv)
 {
    [[maybe_unused]] const auto& helper = Dune::MPIHelper::instance(argc, argv);
    boost::unit_test::unit_test_main(&init_unit_test_func, argc, argv);
 }
 BOOST_AUTO_TEST_CASE(TestProject)
 {
    // We're going to have three points: Centered is owned, left and right is copied. We assume a periodic domain
    // ([0,1]/0~1)
    auto indexInfo = Dune::IndexInfoFromGrid<int, int>();
    indexInfo.addLocalIndex(std::make_tuple(0, 0, Dune::OwnerOverlapCopyAttributeSet::copy));
    indexInfo.addLocalIndex(std::make_tuple(1, 1, Dune::OwnerOverlapCopyAttributeSet::owner));
    indexInfo.addLocalIndex(std::make_tuple(2, 2, Dune::OwnerOverlapCopyAttributeSet::copy));
    auto ownerOverlapCopy = Dune::OwnerOverlapCopyCommunication<int>(indexInfo, MPI_COMM_WORLD);
    auto xCPU = std::vector<double> {{1.0, 2.0, 3.0}};
    auto xGPU = Opm::cuistl::CuVector<double>(xCPU);
    auto cuOwnerOverlapCopy
        = Opm::cuistl::CuOwnerOverlapCopy<double, 1, Dune::OwnerOverlapCopyCommunication<int>>(ownerOverlapCopy);
    cuOwnerOverlapCopy.project(xGPU);
    auto resultOfProject = xGPU.asStdVector();
    BOOST_CHECK_EQUAL(0.0, resultOfProject[0]);
    BOOST_CHECK_EQUAL(2.0, resultOfProject[1]);
    BOOST_CHECK_EQUAL(0.0, resultOfProject[2]);
 }
 BOOST_AUTO_TEST_CASE(TestDot)
 {
    // We're going to have three points: Centered is owned, left and right is copied. We assume a periodic domain
    // ([0,1]/0~1)
    auto indexInfo = Dune::IndexInfoFromGrid<int, int>();
    indexInfo.addLocalIndex(std::make_tuple(0, 0, Dune::OwnerOverlapCopyAttributeSet::copy));
    indexInfo.addLocalIndex(std::make_tuple(1, 1, Dune::OwnerOverlapCopyAttributeSet::owner));
    indexInfo.addLocalIndex(std::make_tuple(2, 2, Dune::OwnerOverlapCopyAttributeSet::copy));
    indexInfo.addRemoteIndex(std::make_tuple(0, 0, Dune::OwnerOverlapCopyAttributeSet::copy));
    indexInfo.addRemoteIndex(std::make_tuple(0, 1, Dune::OwnerOverlapCopyAttributeSet::owner));
    indexInfo.addRemoteIndex(std::make_tuple(0, 2, Dune::OwnerOverlapCopyAttributeSet::copy));
    auto ownerOverlapCopy = Dune::OwnerOverlapCopyCommunication<int>(indexInfo, MPI_COMM_WORLD);
    auto xCPU = std::vector<double> {{1.0, 2.0, 3.0}};
    auto xGPU = Opm::cuistl::CuVector<double>(xCPU);
    auto cuOwnerOverlapCopy
        = Opm::cuistl::CuOwnerOverlapCopy<double, 1, Dune::OwnerOverlapCopyCommunication<int>>(ownerOverlapCopy);
    double outputDune = -1.0;
    auto xDune = xGPU.asDuneBlockVector<1>();
    ownerOverlapCopy.dot(xDune, xDune, outputDune);
    double output = -1.0;
    cuOwnerOverlapCopy.dot(xGPU, xGPU, output);
    BOOST_CHECK_EQUAL(outputDune, output);
    BOOST_CHECK_EQUAL(4.0, output);
 }
--- a/tests/cuistl/test_solver_adapter.cpp
+++ b/tests/cuistl/test_solver_adapter.cpp
@ -0,0 +1,117 @@
 /*
  Copyright 2022-2023 SINTEF AS
  This file is part of the Open Porous Media project (OPM).
  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include <config.h>
 #define BOOST_TEST_MODULE TestSolverAdapter
 #include <boost/test/unit_test.hpp>
 #include <dune/istl/solvers.hh>
 #include <opm/simulators/linalg/PreconditionerFactory.hpp>
 #include <opm/simulators/linalg/PropertyTree.hpp>
 #include <opm/simulators/linalg/cuistl/SolverAdapter.hpp>
 static const constexpr int dim = 3;
 using Matrix = Dune::BCRSMatrix<Dune::FieldMatrix<double, dim, dim>>;
 using Vector = Dune::BlockVector<Dune::FieldVector<double, dim>>;
 using Moperator = Dune::MatrixAdapter<Matrix, Vector, Vector>;
 using PrecondFactory = Opm::PreconditionerFactory<Moperator, Dune::Amg::SequentialInformation>;
 using SolverAdapter = Opm::cuistl::SolverAdapter<Moperator, Dune::BiCGSTABSolver, Vector>;
 namespace
 {
 auto
 createSolverAdapterWithMatrix(const size_t N = 10)
 {
    const int nonZeroes = N * 3 - 2;
    // We need to hold the matrix in memory somehow, but we don't want to deference
    // a pointer all the time (quality of life...):
    auto matrixPtr = std::make_shared<Matrix>(N, N, nonZeroes, Matrix::row_wise);
    auto& matrix = *matrixPtr;
    for (auto row = matrix.createbegin(); row != matrix.createend(); ++row) {
        // Add nonzeros for left neighbour, diagonal and right neighbour
        if (row.index() > 0) {
            row.insert(row.index() - 1);
        }
        row.insert(row.index());
        if (row.index() < matrix.N() - 1) {
            row.insert(row.index() + 1);
        }
    }
    // This might not be the most elegant way of filling in a Dune sparse matrix, but it works.
    for (size_t i = 0; i < N; ++i) {
        for (int k = 0; k < dim; ++k) {
            matrix[i][i][k][k] = -2;
        }
        if (i < N - 1) {
            for (int k = 0; k < dim; ++k) {
                matrix[i][i + 1][k][k] = 1;
            }
        }
        if (i > 0) {
            for (int k = 0; k < dim; ++k) {
                matrix[i][i - 1][k][k] = 1;
            }
        }
    }
    auto op = std::make_shared<Moperator>(matrix);
    auto sp = std::make_shared<Dune::ScalarProduct<Vector>>();
    auto prm = Opm::PropertyTree();
    prm.put<double>("relaxation", 1.0);
    prm.put<std::string>("type", "CUILU0");
    auto prec = PrecondFactory::create(*op, prm);
    auto solverAdapter = std::make_shared<SolverAdapter>(*op, *sp, prec, 1.0, 10, 0);
    return std::make_tuple(matrixPtr, solverAdapter, op, sp);
 }
 } // namespace
 BOOST_AUTO_TEST_CASE(TestCreation)
 {
    BOOST_CHECK_NO_THROW(createSolverAdapterWithMatrix(););
 }
 BOOST_AUTO_TEST_CASE(TestSolve)
 {
    const size_t N = 10;
    auto [matrix, solverAdapter, op, sp] = createSolverAdapterWithMatrix(N);
    Vector xActual(N), xInitial(N), b(N);
    for (size_t i = 0; i < N; ++i) {
        for (size_t j = 0; j < dim; ++j) {
            xActual[i][j] = 1.0;
            xInitial[i][j] = 0.1 * i;
        }
    }
    matrix->mv(xActual, b);
    Dune::InverseOperatorResult res;
    solverAdapter->apply(xInitial, b, res);
    for (size_t i = 0; i < N; ++i) {
        for (size_t j = 0; j < dim; ++j) {
            // This should actually be up to rounding exact since ILU is just the inverse
            // for this matrix.
            BOOST_CHECK_CLOSE(xActual[i][j], xInitial[i][j], 1e-13);
        }
    }
 }