refactor cuvector

Tobias Meyer Andersen 2024-08-22 15:20:20 +02:00
parent 3aa1767548
commit fba1858f42
28 changed files with 234 additions and 234 deletions

View File

@ -553,7 +553,7 @@ if(CUDA_FOUND)
cusparse_handle
cuSparse_matrix_operations
cuVector_operations
cuvector
GpuVector
GpuSparseMatrix
GpuSeqILU0
GpuOwnerOverlapCopy

View File

@ -213,7 +213,7 @@ if (HAVE_CUDA)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/DILUKernels.cu)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/ILU0Kernels.cu)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/JacKernels.cu)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuVector.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg GpuVector.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuView.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/vector_operations.cu)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg GpuSparseMatrix.cpp)
@ -240,7 +240,7 @@ if (HAVE_CUDA)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg GpuDILU.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg OpmCuILU0.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg GpuJac.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuVector.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg GpuVector.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuView.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg GpuSparseMatrix.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/CuMatrixDescription.hpp)
@ -401,7 +401,7 @@ if (HAVE_CUDA)
ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_cusparse_handle.cpp)
ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_cuSparse_matrix_operations.cpp)
ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_GpuSparseMatrix.cpp)
ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_cuvector.cpp)
ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_GpuVector.cpp)
ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_cuVector_operations.cpp)
ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_safe_conversion.cpp)
ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_solver_adapter.cpp)

View File

@ -326,7 +326,7 @@ struct StandardPreconditioners {
const double w = prm.get<double>("relaxation", 1.0);
using field_type = typename V::field_type;
using GpuILU0 = typename gpuistl::
GpuSeqILU0<M, gpuistl::CuVector<field_type>, gpuistl::CuVector<field_type>>;
GpuSeqILU0<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
auto gpuILU0 = std::make_shared<GpuILU0>(op.getmat(), w);
auto adapted = std::make_shared<gpuistl::PreconditionerAdapter<V, V, GpuILU0>>(gpuILU0);
@ -338,7 +338,7 @@ struct StandardPreconditioners {
const double w = prm.get<double>("relaxation", 1.0);
using field_type = typename V::field_type;
using GpuJac =
typename gpuistl::GpuJac<M, gpuistl::CuVector<field_type>, gpuistl::CuVector<field_type>>;
typename gpuistl::GpuJac<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
auto gpuJac = std::make_shared<GpuJac>(op.getmat(), w);
auto adapted = std::make_shared<gpuistl::PreconditionerAdapter<V, V, GpuJac>>(gpuJac);
@ -350,7 +350,7 @@ struct StandardPreconditioners {
const bool split_matrix = prm.get<bool>("split_matrix", true);
const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
using field_type = typename V::field_type;
using GpuDILU = typename gpuistl::GpuDILU<M, gpuistl::CuVector<field_type>, gpuistl::CuVector<field_type>>;
using GpuDILU = typename gpuistl::GpuDILU<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
auto gpuDILU = std::make_shared<GpuDILU>(op.getmat(), split_matrix, tune_gpu_kernels);
auto adapted = std::make_shared<gpuistl::PreconditionerAdapter<V, V, GpuDILU>>(gpuDILU);
@ -362,7 +362,7 @@ struct StandardPreconditioners {
const bool split_matrix = prm.get<bool>("split_matrix", true);
const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
using field_type = typename V::field_type;
using OpmCuILU0 = typename gpuistl::OpmCuILU0<M, gpuistl::CuVector<field_type>, gpuistl::CuVector<field_type>>;
using OpmCuILU0 = typename gpuistl::OpmCuILU0<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
auto cuilu0 = std::make_shared<OpmCuILU0>(op.getmat(), split_matrix, tune_gpu_kernels);
auto adapted = std::make_shared<gpuistl::PreconditionerAdapter<V, V, OpmCuILU0>>(cuilu0);
@ -586,7 +586,7 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
const double w = prm.get<double>("relaxation", 1.0);
using field_type = typename V::field_type;
using GpuuILU0 = typename gpuistl::
GpuSeqILU0<M, gpuistl::CuVector<field_type>, gpuistl::CuVector<field_type>>;
GpuSeqILU0<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
return std::make_shared<gpuistl::PreconditionerAdapter<V, V, GpuuILU0>>(
std::make_shared<GpuuILU0>(op.getmat(), w));
});
@ -598,7 +598,7 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
using matrix_type_to =
typename Dune::BCRSMatrix<Dune::FieldMatrix<float, block_type::dimension, block_type::dimension>>;
using GpuuILU0 = typename gpuistl::
GpuSeqILU0<matrix_type_to, gpuistl::CuVector<float>, gpuistl::CuVector<float>>;
GpuSeqILU0<matrix_type_to, gpuistl::GpuVector<float>, gpuistl::GpuVector<float>>;
using Adapter = typename gpuistl::PreconditionerAdapter<VTo, VTo, GpuuILU0>;
using Converter = typename gpuistl::PreconditionerConvertFieldTypeAdapter<Adapter, M, V, V>;
auto converted = std::make_shared<Converter>(op.getmat());
@ -611,7 +611,7 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
const double w = prm.get<double>("relaxation", 1.0);
using field_type = typename V::field_type;
using GPUJac =
typename gpuistl::GpuJac<M, gpuistl::CuVector<field_type>, gpuistl::CuVector<field_type>>;
typename gpuistl::GpuJac<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
return std::make_shared<gpuistl::PreconditionerAdapter<V, V, GPUJac>>(
std::make_shared<GPUJac>(op.getmat(), w));
});
@ -620,7 +620,7 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
const bool split_matrix = prm.get<bool>("split_matrix", true);
const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
using field_type = typename V::field_type;
using CUILU0 = typename gpuistl::OpmCuILU0<M, gpuistl::CuVector<field_type>, gpuistl::CuVector<field_type>>;
using CUILU0 = typename gpuistl::OpmCuILU0<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
return std::make_shared<gpuistl::PreconditionerAdapter<V, V, CUILU0>>(std::make_shared<CUILU0>(op.getmat(), split_matrix, tune_gpu_kernels));
});
@ -629,7 +629,7 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
const bool split_matrix = prm.get<bool>("split_matrix", true);
const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
using field_type = typename V::field_type;
using GPUDILU = typename gpuistl::GpuDILU<M, gpuistl::CuVector<field_type>, gpuistl::CuVector<field_type>>;
using GPUDILU = typename gpuistl::GpuDILU<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
return std::make_shared<gpuistl::PreconditionerAdapter<V, V, GPUDILU>>(std::make_shared<GPUDILU>(op.getmat(), split_matrix, tune_gpu_kernels));
});
@ -639,7 +639,7 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
using block_type = typename V::block_type;
using VTo = Dune::BlockVector<Dune::FieldVector<float, block_type::dimension>>;
using matrix_type_to = typename Dune::BCRSMatrix<Dune::FieldMatrix<float, block_type::dimension, block_type::dimension>>;
using GpuDILU = typename gpuistl::GpuDILU<matrix_type_to, gpuistl::CuVector<float>, gpuistl::CuVector<float>>;
using GpuDILU = typename gpuistl::GpuDILU<matrix_type_to, gpuistl::GpuVector<float>, gpuistl::GpuVector<float>>;
using Adapter = typename gpuistl::PreconditionerAdapter<VTo, VTo, GpuDILU>;
using Converter = typename gpuistl::PreconditionerConvertFieldTypeAdapter<Adapter, M, V, V>;
auto converted = std::make_shared<Converter>(op.getmat());
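Every hunk in this file makes the same substitution: gpuistl::CuVector becomes gpuistl::GpuVector in the preconditioner template arguments. A condensed sketch of the resulting factory pattern, assuming M, V, op, and prm as in the surrounding code:

    const bool split_matrix = prm.get<bool>("split_matrix", true);
    const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
    using field_type = typename V::field_type;
    // Only the vector template arguments change; the construction logic stays identical.
    using GpuDILU = typename gpuistl::GpuDILU<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
    auto gpuDILU = std::make_shared<GpuDILU>(op.getmat(), split_matrix, tune_gpu_kernels);
    return std::make_shared<gpuistl::PreconditionerAdapter<V, V, GpuDILU>>(gpuDILU);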

View File

@ -28,7 +28,7 @@
#include <opm/simulators/linalg/cuistl/detail/autotuner.hpp>
#include <opm/simulators/linalg/cuistl/GpuDILU.hpp>
#include <opm/simulators/linalg/cuistl/GpuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/GpuVector.hpp>
#include <opm/simulators/linalg/cuistl/detail/coloringAndReorderingUtils.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/DILUKernels.hpp>
@ -71,7 +71,7 @@ GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels)
m_gpuMatrix.nonzeroes(),
A.nonzeroes()));
if (m_splitMatrix) {
m_gpuMatrixReorderedDiag = std::make_unique<CuVector<field_type>>(blocksize_ * blocksize_ * m_cpuMatrix.N());
m_gpuMatrixReorderedDiag = std::make_unique<GpuVector<field_type>>(blocksize_ * blocksize_ * m_cpuMatrix.N());
std::tie(m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper)
= detail::extractLowerAndUpperMatrices<M, field_type, GpuSparseMatrix<field_type>>(m_cpuMatrix,
m_reorderedToNatural);
@ -278,8 +278,8 @@ GpuDILU<M, X, Y, l>::tuneThreadBlockSizes()
m_DILUFactorizationThreadBlockSize = detail::tuneThreadBlockSize(tuneFactorizationThreadBlockSizeInUpdate, "Kernel computing DILU factorization");
// tune the thread-block size of the apply
CuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
CuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
GpuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
GpuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
tmpD = 1;
auto tuneLowerSolveThreadBlockSizeInApply = [this, &tmpV, &tmpD](int lowerSolveThreadBlockSize){
@ -296,11 +296,11 @@ GpuDILU<M, X, Y, l>::tuneThreadBlockSizes()
} // namespace Opm::gpuistl
#define INSTANTIATE_CUDILU_DUNE(realtype, blockdim) \
template class ::Opm::gpuistl::GpuDILU<Dune::BCRSMatrix<Dune::FieldMatrix<realtype, blockdim, blockdim>>, \
::Opm::gpuistl::CuVector<realtype>, \
::Opm::gpuistl::CuVector<realtype>>; \
::Opm::gpuistl::GpuVector<realtype>, \
::Opm::gpuistl::GpuVector<realtype>>; \
template class ::Opm::gpuistl::GpuDILU<Dune::BCRSMatrix<Opm::MatrixBlock<realtype, blockdim, blockdim>>, \
::Opm::gpuistl::CuVector<realtype>, \
::Opm::gpuistl::CuVector<realtype>>
::Opm::gpuistl::GpuVector<realtype>, \
::Opm::gpuistl::GpuVector<realtype>>
INSTANTIATE_CUDILU_DUNE(double, 1);
INSTANTIATE_CUDILU_DUNE(double, 2);

View File

@ -37,7 +37,7 @@ namespace Opm::gpuistl
//! \tparam l Ignored. Just there to have the same number of template arguments
//! as other preconditioners.
//!
//! \note We assume X and Y are both CuVector<real_type>, but we leave them as template
//! \note We assume X and Y are both GpuVector<real_type>, but we leave them as template
//! arguments in case of future additions.
template <class M, class X, class Y, int l = 1>
class GpuDILU : public Dune::PreconditionerWithUpdate<X, Y>
@ -126,13 +126,13 @@ private:
std::unique_ptr<CuMat> m_gpuMatrixReorderedLower;
std::unique_ptr<CuMat> m_gpuMatrixReorderedUpper;
//! \brief If matrix splitting is enabled, we also store the diagonal separately
std::unique_ptr<CuVector<field_type>> m_gpuMatrixReorderedDiag;
std::unique_ptr<GpuVector<field_type>> m_gpuMatrixReorderedDiag;
//! row conversion from natural to reordered matrix indices stored on the GPU
CuVector<int> m_gpuNaturalToReorder;
GpuVector<int> m_gpuNaturalToReorder;
//! row conversion from reordered to natural matrix indices stored on the GPU
CuVector<int> m_gpuReorderToNatural;
GpuVector<int> m_gpuReorderToNatural;
//! \brief Stores the inverted diagonal that we use in DILU
CuVector<field_type> m_gpuDInv;
GpuVector<field_type> m_gpuDInv;
//! \brief Bool storing whether or not we should store matrices in a split format
bool m_splitMatrix;
//! \brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards

View File

@ -21,7 +21,7 @@
#include <fmt/core.h>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/cuistl/GpuJac.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/GpuVector.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/JacKernels.hpp>
#include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
#include <opm/simulators/linalg/matrixblock.hh>
@ -111,11 +111,11 @@ GpuJac<M, X, Y, l>::invertDiagonalAndFlatten()
} // namespace Opm::gpuistl
#define INSTANTIATE_CUJAC_DUNE(realtype, blockdim) \
template class ::Opm::gpuistl::GpuJac<Dune::BCRSMatrix<Dune::FieldMatrix<realtype, blockdim, blockdim>>, \
::Opm::gpuistl::CuVector<realtype>, \
::Opm::gpuistl::CuVector<realtype>>; \
::Opm::gpuistl::GpuVector<realtype>, \
::Opm::gpuistl::GpuVector<realtype>>; \
template class ::Opm::gpuistl::GpuJac<Dune::BCRSMatrix<Opm::MatrixBlock<realtype, blockdim, blockdim>>, \
::Opm::gpuistl::CuVector<realtype>, \
::Opm::gpuistl::CuVector<realtype>>
::Opm::gpuistl::GpuVector<realtype>, \
::Opm::gpuistl::GpuVector<realtype>>
INSTANTIATE_CUJAC_DUNE(double, 1);
INSTANTIATE_CUJAC_DUNE(double, 2);

View File

@ -40,7 +40,7 @@ namespace Opm::gpuistl
//! \tparam l Ignored. Just there to have the same number of template arguments
//! as other preconditioners.
//!
//! \note We assume X and Y are both CuVector<real_type>, but we leave them as template
//! \note We assume X and Y are both GpuVector<real_type>, but we leave them as template
//! arguments in case of future additions.
template <class M, class X, class Y, int l = 1>
class GpuJac : public Dune::PreconditionerWithUpdate<X, Y>
@ -106,7 +106,7 @@ private:
//! \brief The A matrix stored on the gpu
GpuSparseMatrix<field_type> m_gpuMatrix;
//! \brief the diagonal of cuMatrix inverted, and then flattened to fit in a vector
CuVector<field_type> m_diagInvFlattened;
GpuVector<field_type> m_diagInvFlattened;
void invertDiagonalAndFlatten();
};

View File

@ -21,7 +21,7 @@
#include <dune/istl/owneroverlapcopy.hh>
#include <memory>
#include <mutex>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/GpuVector.hpp>
#include <vector>
namespace Opm::gpuistl
@ -36,7 +36,7 @@ namespace Opm::gpuistl
template<class field_type, class OwnerOverlapCopyCommunicationType>
class GPUSender {
public:
using X = CuVector<field_type>;
using X = GpuVector<field_type>;
GPUSender(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy) : m_cpuOwnerOverlapCopy(cpuOwnerOverlapCopy){}
@ -97,8 +97,8 @@ protected:
// premature optimization, in the sense that we could just initialize these indices
// always, but they are not always used.
mutable std::once_flag m_initializedIndices;
mutable std::unique_ptr<CuVector<int>> m_indicesOwner;
mutable std::unique_ptr<CuVector<int>> m_indicesCopy;
mutable std::unique_ptr<GpuVector<int>> m_indicesOwner;
mutable std::unique_ptr<GpuVector<int>> m_indicesCopy;
const OwnerOverlapCopyCommunicationType& m_cpuOwnerOverlapCopy;
};
@ -113,7 +113,7 @@ template <class field_type, int block_size, class OwnerOverlapCopyCommunicationT
class GPUObliviousMPISender : public GPUSender<field_type, OwnerOverlapCopyCommunicationType>
{
public:
using X = CuVector<field_type>;
using X = GpuVector<field_type>;
GPUObliviousMPISender(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy)
: GPUSender<field_type, OwnerOverlapCopyCommunicationType>(cpuOwnerOverlapCopy)
@ -151,8 +151,8 @@ private:
}
}
this->m_indicesCopy = std::make_unique<CuVector<int>>(indicesCopyOnCPU);
this->m_indicesOwner = std::make_unique<CuVector<int>>(indicesOwnerCPU);
this->m_indicesCopy = std::make_unique<GpuVector<int>>(indicesCopyOnCPU);
this->m_indicesOwner = std::make_unique<GpuVector<int>>(indicesOwnerCPU);
}
};
@ -168,7 +168,7 @@ template <class field_type, int block_size, class OwnerOverlapCopyCommunicationT
class GPUAwareMPISender : public GPUSender<field_type, OwnerOverlapCopyCommunicationType>
{
public:
using X = CuVector<field_type>;
using X = GpuVector<field_type>;
GPUAwareMPISender(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy)
: GPUSender<field_type, OwnerOverlapCopyCommunicationType>(cpuOwnerOverlapCopy)
@ -178,7 +178,7 @@ public:
void copyOwnerToAll(const X& source, X& dest) const override
{
OPM_ERROR_IF(&source != &dest, "The provided CuVectors' address did not match"); // In this context, source == dest!!!
OPM_ERROR_IF(&source != &dest, "The provided GpuVectors' address did not match"); // In this context, source == dest!!!
std::call_once(this->m_initializedIndices, [&]() { initIndexSet(); });
int rank = this->m_cpuOwnerOverlapCopy.communicator().rank();
@ -251,10 +251,10 @@ public:
}
private:
mutable std::unique_ptr<CuVector<int>> m_commpairIndicesCopy;
mutable std::unique_ptr<CuVector<int>> m_commpairIndicesOwner;
mutable std::unique_ptr<CuVector<field_type>> m_GPUSendBuf;
mutable std::unique_ptr<CuVector<field_type>> m_GPURecvBuf;
mutable std::unique_ptr<GpuVector<int>> m_commpairIndicesCopy;
mutable std::unique_ptr<GpuVector<int>> m_commpairIndicesOwner;
mutable std::unique_ptr<GpuVector<field_type>> m_GPUSendBuf;
mutable std::unique_ptr<GpuVector<field_type>> m_GPURecvBuf;
struct MessageInformation
{
@ -332,11 +332,11 @@ private:
}
}
m_commpairIndicesCopy = std::make_unique<CuVector<int>>(commpairIndicesCopyOnCPU);
m_commpairIndicesOwner = std::make_unique<CuVector<int>>(commpairIndicesOwnerCPU);
m_commpairIndicesCopy = std::make_unique<GpuVector<int>>(commpairIndicesCopyOnCPU);
m_commpairIndicesOwner = std::make_unique<GpuVector<int>>(commpairIndicesOwnerCPU);
m_GPUSendBuf = std::make_unique<CuVector<field_type>>(sendBufIdx * block_size);
m_GPURecvBuf = std::make_unique<CuVector<field_type>>(recvBufIdx * block_size);
m_GPUSendBuf = std::make_unique<GpuVector<field_type>>(sendBufIdx * block_size);
m_GPURecvBuf = std::make_unique<GpuVector<field_type>>(recvBufIdx * block_size);
}
void initIndexSet() const override
@ -360,8 +360,8 @@ private:
}
}
this->m_indicesCopy = std::make_unique<CuVector<int>>(indicesCopyOnCPU);
this->m_indicesOwner = std::make_unique<CuVector<int>>(indicesOwnerCPU);
this->m_indicesCopy = std::make_unique<GpuVector<int>>(indicesCopyOnCPU);
this->m_indicesOwner = std::make_unique<GpuVector<int>>(indicesOwnerCPU);
buildCommPairIdxs();
}
@ -371,11 +371,11 @@ private:
* @brief CUDA compatible variant of Dune::OwnerOverlapCopyCommunication
*
* This class can essentially be seen as an adapter around Dune::OwnerOverlapCopyCommunication, and should work as
* a Dune::OwnerOverlapCopyCommunication on CuVectors
* a Dune::OwnerOverlapCopyCommunication on GpuVectors
*
* @note This currently only has the functionality to parallelize the linear solve.
*
* @tparam field_type should be a field_type supported by CuVector (double, float)
* @tparam field_type should be a field_type supported by GpuVector (double, float)
* @tparam block_size the block size used (this is relevant for say figuring out the correct indices)
* @tparam OwnerOverlapCopyCommunicationType should mimic Dune::OwnerOverlapCopyCommunication.
*/
@ -383,7 +383,7 @@ template <class field_type, int block_size, class OwnerOverlapCopyCommunicationT
class GpuOwnerOverlapCopy
{
public:
using X = CuVector<field_type>;
using X = GpuVector<field_type>;
GpuOwnerOverlapCopy(std::shared_ptr<GPUSender<field_type, OwnerOverlapCopyCommunicationType>> sender) : m_sender(sender){}
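For orientation, a minimal sketch of how the classes in this file compose, mirroring the unit test further down; ownerOverlapCopy is an existing Dune::OwnerOverlapCopyCommunication<int>, and we assume GpuOwnerOverlapCopy forwards copyOwnerToAll to its sender:

    using Comm = Dune::OwnerOverlapCopyCommunication<int>;
    // GPUObliviousMPISender stages data through the host; GPUAwareMPISender
    // hands device pointers directly to MPI instead.
    auto sender = std::make_shared<Opm::gpuistl::GPUObliviousMPISender<double, 1, Comm>>(ownerOverlapCopy);
    auto gpuComm = Opm::gpuistl::GpuOwnerOverlapCopy<double, 1, Comm>(sender);

    auto x = Opm::gpuistl::GpuVector<double>(std::vector<double>{1.0, 2.0, 3.0});
    gpuComm.copyOwnerToAll(x, x); // halo exchange performed in place on the GPU vector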

View File

@ -332,7 +332,7 @@ GpuSeqILU0<M, X, Y, l>::updateILUConfiguration()
{
auto bufferSize = findBufferSize();
if (!m_buffer || m_buffer->dim() < bufferSize) {
m_buffer.reset(new CuVector<field_type>((bufferSize + sizeof(field_type) - 1) / sizeof(field_type)));
m_buffer.reset(new GpuVector<field_type>((bufferSize + sizeof(field_type) - 1) / sizeof(field_type)));
}
analyzeMatrix();
createILU();
@ -340,11 +340,11 @@ GpuSeqILU0<M, X, Y, l>::updateILUConfiguration()
} // namespace Opm::gpuistl
#define INSTANTIATE_GPUSEQILU0_DUNE(realtype, blockdim) \
template class ::Opm::gpuistl::GpuSeqILU0<Dune::BCRSMatrix<Dune::FieldMatrix<realtype, blockdim, blockdim>>, \
::Opm::gpuistl::CuVector<realtype>, \
::Opm::gpuistl::CuVector<realtype>>; \
::Opm::gpuistl::GpuVector<realtype>, \
::Opm::gpuistl::GpuVector<realtype>>; \
template class ::Opm::gpuistl::GpuSeqILU0<Dune::BCRSMatrix<Opm::MatrixBlock<realtype, blockdim, blockdim>>, \
::Opm::gpuistl::CuVector<realtype>, \
::Opm::gpuistl::CuVector<realtype>>
::Opm::gpuistl::GpuVector<realtype>, \
::Opm::gpuistl::GpuVector<realtype>>
INSTANTIATE_GPUSEQILU0_DUNE(double, 1);

View File

@ -43,7 +43,7 @@ namespace Opm::gpuistl
//! \tparam l Ignored. Just there to have the same number of template arguments
//! as other preconditioners.
//!
//! \note We assume X and Y are both CuVector<real_type>, but we leave them as template
//! \note We assume X and Y are both GpuVector<real_type>, but we leave them as template
//! arguments in case of future additions.
template <class M, class X, class Y, int l = 1>
class GpuSeqILU0 : public Dune::PreconditionerWithUpdate<X, Y>
@ -112,7 +112,7 @@ private:
//! modified in the constructor to be the proper LU decomposition.
GpuSparseMatrix<field_type> m_LU;
CuVector<field_type> m_temporaryStorage;
GpuVector<field_type> m_temporaryStorage;
detail::GpuSparseMatrixDescriptionPtr m_descriptionL;
@ -121,7 +121,7 @@ private:
detail::CuSparseResource<bsrsv2Info_t> m_infoU;
detail::CuSparseResource<bsrilu02Info_t> m_infoM;
std::unique_ptr<CuVector<field_type>> m_buffer;
std::unique_ptr<GpuVector<field_type>> m_buffer;
detail::CuSparseHandle& m_cuSparseHandle;
bool m_analysisDone = false;

View File

@ -198,7 +198,7 @@ GpuSparseMatrix<T>::setNonUnitDiagonal()
template <typename T>
void
GpuSparseMatrix<T>::mv(const CuVector<T>& x, CuVector<T>& y) const
GpuSparseMatrix<T>::mv(const GpuVector<T>& x, GpuVector<T>& y) const
{
assertSameSize(x);
assertSameSize(y);
@ -232,7 +232,7 @@ GpuSparseMatrix<T>::mv(const CuVector<T>& x, CuVector<T>& y) const
template <typename T>
void
GpuSparseMatrix<T>::umv(const CuVector<T>& x, CuVector<T>& y) const
GpuSparseMatrix<T>::umv(const GpuVector<T>& x, GpuVector<T>& y) const
{
assertSameSize(x);
assertSameSize(y);
@ -267,7 +267,7 @@ GpuSparseMatrix<T>::umv(const CuVector<T>& x, CuVector<T>& y) const
template <typename T>
void
GpuSparseMatrix<T>::usmv(T alpha, const CuVector<T>& x, CuVector<T>& y) const
GpuSparseMatrix<T>::usmv(T alpha, const GpuVector<T>& x, GpuVector<T>& y) const
{
assertSameSize(x);
assertSameSize(y);

View File

@ -22,7 +22,7 @@
#include <iostream>
#include <memory>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/GpuVector.hpp>
#include <opm/simulators/linalg/cuistl/detail/CuMatrixDescription.hpp>
#include <opm/simulators/linalg/cuistl/detail/CuSparseHandle.hpp>
#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
@ -144,7 +144,7 @@ public:
*
* @note Read the CuSPARSE documentation on Block Compressed Sparse Row Format (BSR) for the exact ordering.
*/
CuVector<T>& getNonZeroValues()
GpuVector<T>& getNonZeroValues()
{
return m_nonZeroElements;
}
@ -154,7 +154,7 @@ public:
*
* @note Read the CuSPARSE documentation on Block Compressed Sparse Row Format (BSR) for the exact ordering.
*/
const CuVector<T>& getNonZeroValues() const
const GpuVector<T>& getNonZeroValues() const
{
return m_nonZeroElements;
}
@ -164,7 +164,7 @@ public:
*
* @note Read the CuSPARSE documentation on Block Compressed Sparse Row Format (BSR) for the exact ordering.
*/
CuVector<int>& getRowIndices()
GpuVector<int>& getRowIndices()
{
return m_rowIndices;
}
@ -174,7 +174,7 @@ public:
*
* @note Read the CuSPARSE documentation on Block Compressed Sparse Row Format (BSR) for the exact ordering.
*/
const CuVector<int>& getRowIndices() const
const GpuVector<int>& getRowIndices() const
{
return m_rowIndices;
}
@ -184,7 +184,7 @@ public:
*
* @return Read the CuSPARSE documentation on Block Compressed Sparse Row Format (BSR) for the exact ordering.
*/
CuVector<int>& getColumnIndices()
GpuVector<int>& getColumnIndices()
{
return m_columnIndices;
}
@ -194,7 +194,7 @@ public:
*
* @return Read the CuSPARSE documentation on Block Compressed Sparse Row Format (BSR) for the exact ordering.
*/
const CuVector<int>& getColumnIndices() const
const GpuVector<int>& getColumnIndices() const
{
return m_columnIndices;
}
@ -245,7 +245,7 @@ public:
*
* @note Due to limitations of CuSparse, this is only supported for block sizes greater than 1.
*/
virtual void mv(const CuVector<T>& x, CuVector<T>& y) const;
virtual void mv(const GpuVector<T>& x, GpuVector<T>& y) const;
/**
* @brief umv computes y=Ax+y
@ -254,7 +254,7 @@ public:
*
* @note Due to limitations of CuSparse, this is only supported for block sizes greater than 1.
*/
virtual void umv(const CuVector<T>& x, CuVector<T>& y) const;
virtual void umv(const GpuVector<T>& x, GpuVector<T>& y) const;
/**
@ -264,7 +264,7 @@ public:
*
* @note Due to limitations of CuSparse, this is only supported for block sizes greater than 1.
*/
virtual void usmv(T alpha, const CuVector<T>& x, CuVector<T>& y) const;
virtual void usmv(T alpha, const GpuVector<T>& x, GpuVector<T>& y) const;
/**
* @brief updateNonzeroValues updates the non-zero values by using the non-zero values of the supplied matrix
@ -280,9 +280,9 @@ public:
void updateNonzeroValues(const MatrixType& matrix, bool copyNonZeroElementsDirectly = false);
private:
CuVector<T> m_nonZeroElements;
CuVector<int> m_columnIndices;
CuVector<int> m_rowIndices;
GpuVector<T> m_nonZeroElements;
GpuVector<int> m_columnIndices;
GpuVector<int> m_rowIndices;
// Notice that we store these three as int to make sure we are cusparse compatible.
//
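A small sketch of the renamed SpMV interface, assuming A is an existing GpuSparseMatrix<double> with block size greater than 1 (the notes above state that CuSparse only supports mv/umv/usmv in that case):

    // Vectors sized to the matrix: N() block rows of blockSize() entries each.
    auto x = Opm::gpuistl::GpuVector<double>(A.N() * A.blockSize());
    auto y = Opm::gpuistl::GpuVector<double>(A.N() * A.blockSize());
    x = 1.0;            // operator=(T scalar) sets every element
    A.mv(x, y);         // y  = Ax
    A.umv(x, y);        // y += Ax
    A.usmv(0.5, x, y);  // y += 0.5 * Ax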

View File

@ -20,7 +20,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <fmt/core.h>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/GpuVector.hpp>
#include <opm/simulators/linalg/cuistl/detail/cublas_safe_call.hpp>
#include <opm/simulators/linalg/cuistl/detail/cublas_wrapper.hpp>
#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
@ -30,13 +30,13 @@ namespace Opm::gpuistl
{
template <class T>
CuVector<T>::CuVector(const std::vector<T>& data)
: CuVector(data.data(), detail::to_int(data.size()))
GpuVector<T>::GpuVector(const std::vector<T>& data)
: GpuVector(data.data(), detail::to_int(data.size()))
{
}
template <class T>
CuVector<T>::CuVector(const size_t numberOfElements)
GpuVector<T>::GpuVector(const size_t numberOfElements)
: m_numberOfElements(detail::to_int(numberOfElements))
, m_cuBlasHandle(detail::CuBlasHandle::getInstance())
{
@ -44,8 +44,8 @@ CuVector<T>::CuVector(const size_t numberOfElements)
}
template <class T>
CuVector<T>::CuVector(const T* dataOnHost, const size_t numberOfElements)
: CuVector(numberOfElements)
GpuVector<T>::GpuVector(const T* dataOnHost, const size_t numberOfElements)
: GpuVector(numberOfElements)
{
OPM_CUDA_SAFE_CALL(cudaMemcpy(
@ -53,8 +53,8 @@ CuVector<T>::CuVector(const T* dataOnHost, const size_t numberOfElements)
}
template <class T>
CuVector<T>&
CuVector<T>::operator=(T scalar)
GpuVector<T>&
GpuVector<T>::operator=(T scalar)
{
assertHasElements();
detail::setVectorValue(data(), detail::to_size_t(m_numberOfElements), scalar);
@ -62,8 +62,8 @@ CuVector<T>::operator=(T scalar)
}
template <class T>
CuVector<T>&
CuVector<T>::operator=(const CuVector<T>& other)
GpuVector<T>&
GpuVector<T>::operator=(const GpuVector<T>& other)
{
assertHasElements();
assertSameSize(other);
@ -76,8 +76,8 @@ CuVector<T>::operator=(const CuVector<T>& other)
}
template <class T>
CuVector<T>::CuVector(const CuVector<T>& other)
: CuVector(other.m_numberOfElements)
GpuVector<T>::GpuVector(const GpuVector<T>& other)
: GpuVector(other.m_numberOfElements)
{
assertHasElements();
assertSameSize(other);
@ -88,21 +88,21 @@ CuVector<T>::CuVector(const CuVector<T>& other)
}
template <class T>
CuVector<T>::~CuVector()
GpuVector<T>::~GpuVector()
{
OPM_CUDA_WARN_IF_ERROR(cudaFree(m_dataOnDevice));
}
template <typename T>
const T*
CuVector<T>::data() const
GpuVector<T>::data() const
{
return m_dataOnDevice;
}
template <typename T>
typename CuVector<T>::size_type
CuVector<T>::dim() const
typename GpuVector<T>::size_type
GpuVector<T>::dim() const
{
// Note that there is no way for m_numberOfElements to be non-positive,
// but for sanity we still use the safe conversion function here.
@ -114,7 +114,7 @@ CuVector<T>::dim() const
template <typename T>
std::vector<T>
CuVector<T>::asStdVector() const
GpuVector<T>::asStdVector() const
{
std::vector<T> temporary(detail::to_size_t(m_numberOfElements));
copyToHost(temporary);
@ -123,21 +123,21 @@ CuVector<T>::asStdVector() const
template <typename T>
void
CuVector<T>::setZeroAtIndexSet(const CuVector<int>& indexSet)
GpuVector<T>::setZeroAtIndexSet(const GpuVector<int>& indexSet)
{
detail::setZeroAtIndexSet(m_dataOnDevice, indexSet.dim(), indexSet.data());
}
template <typename T>
void
CuVector<T>::assertSameSize(const CuVector<T>& x) const
GpuVector<T>::assertSameSize(const GpuVector<T>& x) const
{
assertSameSize(x.m_numberOfElements);
}
template <typename T>
void
CuVector<T>::assertSameSize(int size) const
GpuVector<T>::assertSameSize(int size) const
{
if (size != m_numberOfElements) {
OPM_THROW(std::invalid_argument,
@ -147,7 +147,7 @@ CuVector<T>::assertSameSize(int size) const
template <typename T>
void
CuVector<T>::assertHasElements() const
GpuVector<T>::assertHasElements() const
{
if (m_numberOfElements <= 0) {
OPM_THROW(std::invalid_argument, "We have 0 elements");
@ -156,14 +156,14 @@ CuVector<T>::assertHasElements() const
template <typename T>
T*
CuVector<T>::data()
GpuVector<T>::data()
{
return m_dataOnDevice;
}
template <class T>
CuVector<T>&
CuVector<T>::operator*=(const T& scalar)
GpuVector<T>&
GpuVector<T>::operator*=(const T& scalar)
{
assertHasElements();
OPM_CUBLAS_SAFE_CALL(detail::cublasScal(m_cuBlasHandle.get(), m_numberOfElements, &scalar, data(), 1));
@ -171,8 +171,8 @@ CuVector<T>::operator*=(const T& scalar)
}
template <class T>
CuVector<T>&
CuVector<T>::axpy(T alpha, const CuVector<T>& y)
GpuVector<T>&
GpuVector<T>::axpy(T alpha, const GpuVector<T>& y)
{
assertHasElements();
assertSameSize(y);
@ -182,7 +182,7 @@ CuVector<T>::axpy(T alpha, const CuVector<T>& y)
template <class T>
T
CuVector<T>::dot(const CuVector<T>& other) const
GpuVector<T>::dot(const GpuVector<T>& other) const
{
assertHasElements();
assertSameSize(other);
@ -193,7 +193,7 @@ CuVector<T>::dot(const CuVector<T>& other) const
}
template <class T>
T
CuVector<T>::two_norm() const
GpuVector<T>::two_norm() const
{
assertHasElements();
T result = T(0);
@ -203,14 +203,14 @@ CuVector<T>::two_norm() const
template <typename T>
T
CuVector<T>::dot(const CuVector<T>& other, const CuVector<int>& indexSet, CuVector<T>& buffer) const
GpuVector<T>::dot(const GpuVector<T>& other, const GpuVector<int>& indexSet, GpuVector<T>& buffer) const
{
return detail::innerProductAtIndices(m_cuBlasHandle.get(), m_dataOnDevice, other.data(), buffer.data(), indexSet.dim(), indexSet.data());
}
template <typename T>
T
CuVector<T>::two_norm(const CuVector<int>& indexSet, CuVector<T>& buffer) const
GpuVector<T>::two_norm(const GpuVector<int>& indexSet, GpuVector<T>& buffer) const
{
// TODO: [perf] Optimize this to a single call
return std::sqrt(this->dot(*this, indexSet, buffer));
@ -218,23 +218,23 @@ CuVector<T>::two_norm(const CuVector<int>& indexSet, CuVector<T>& buffer) const
template <typename T>
T
CuVector<T>::dot(const CuVector<T>& other, const CuVector<int>& indexSet) const
GpuVector<T>::dot(const GpuVector<T>& other, const GpuVector<int>& indexSet) const
{
CuVector<T> buffer(indexSet.dim());
GpuVector<T> buffer(indexSet.dim());
return detail::innerProductAtIndices(m_cuBlasHandle.get(), m_dataOnDevice, other.data(), buffer.data(), indexSet.dim(), indexSet.data());
}
template <typename T>
T
CuVector<T>::two_norm(const CuVector<int>& indexSet) const
GpuVector<T>::two_norm(const GpuVector<int>& indexSet) const
{
CuVector<T> buffer(indexSet.dim());
GpuVector<T> buffer(indexSet.dim());
// TODO: [perf] Optimize this to a single call
return std::sqrt(this->dot(*this, indexSet, buffer));
}
template <class T>
CuVector<T>&
CuVector<T>::operator+=(const CuVector<T>& other)
GpuVector<T>&
GpuVector<T>::operator+=(const GpuVector<T>& other)
{
assertHasElements();
assertSameSize(other);
@ -243,8 +243,8 @@ CuVector<T>::operator+=(const CuVector<T>& other)
}
template <class T>
CuVector<T>&
CuVector<T>::operator-=(const CuVector<T>& other)
GpuVector<T>&
GpuVector<T>::operator-=(const GpuVector<T>& other)
{
assertHasElements();
assertSameSize(other);
@ -255,7 +255,7 @@ CuVector<T>::operator-=(const CuVector<T>& other)
template <class T>
void
CuVector<T>::copyFromHost(const T* dataPointer, size_t numberOfElements)
GpuVector<T>::copyFromHost(const T* dataPointer, size_t numberOfElements)
{
if (numberOfElements > dim()) {
OPM_THROW(std::runtime_error,
@ -268,7 +268,7 @@ CuVector<T>::copyFromHost(const T* dataPointer, size_t numberOfElements)
template <class T>
void
CuVector<T>::copyToHost(T* dataPointer, size_t numberOfElements) const
GpuVector<T>::copyToHost(T* dataPointer, size_t numberOfElements) const
{
assertSameSize(detail::to_int(numberOfElements));
OPM_CUDA_SAFE_CALL(cudaMemcpy(dataPointer, data(), numberOfElements * sizeof(T), cudaMemcpyDeviceToHost));
@ -276,32 +276,32 @@ CuVector<T>::copyToHost(T* dataPointer, size_t numberOfElements) const
template <class T>
void
CuVector<T>::copyFromHost(const std::vector<T>& data)
GpuVector<T>::copyFromHost(const std::vector<T>& data)
{
copyFromHost(data.data(), data.size());
}
template <class T>
void
CuVector<T>::copyToHost(std::vector<T>& data) const
GpuVector<T>::copyToHost(std::vector<T>& data) const
{
copyToHost(data.data(), data.size());
}
template <typename T>
void
CuVector<T>::prepareSendBuf(CuVector<T>& buffer, const CuVector<int>& indexSet) const
GpuVector<T>::prepareSendBuf(GpuVector<T>& buffer, const GpuVector<int>& indexSet) const
{
return detail::prepareSendBuf(m_dataOnDevice, buffer.data(), indexSet.dim(), indexSet.data());
}
template <typename T>
void
CuVector<T>::syncFromRecvBuf(CuVector<T>& buffer, const CuVector<int>& indexSet) const
GpuVector<T>::syncFromRecvBuf(GpuVector<T>& buffer, const GpuVector<int>& indexSet) const
{
return detail::syncFromRecvBuf(m_dataOnDevice, buffer.data(), indexSet.dim(), indexSet.data());
}
template class CuVector<double>;
template class CuVector<float>;
template class CuVector<int>;
template class GpuVector<double>;
template class GpuVector<float>;
template class GpuVector<int>;
} // namespace Opm::gpuistl

View File

@ -16,8 +16,8 @@
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_CUVECTOR_HEADER_HPP
#define OPM_CUVECTOR_HEADER_HPP
#ifndef OPM_GPUVECTOR_HEADER_HPP
#define OPM_GPUVECTOR_HEADER_HPP
#include <dune/common/fvector.hh>
#include <dune/istl/bvector.hh>
#include <exception>
@ -33,7 +33,7 @@ namespace Opm::gpuistl
{
/**
* @brief The CuVector class is a simple (arithmetic) vector class for the GPU.
* @brief The GpuVector class is a simple (arithmetic) vector class for the GPU.
*
* @note we currently only support simple raw primitives for T (double, float and int)
*
@ -45,12 +45,12 @@ namespace Opm::gpuistl
* Example usage:
*
* @code{.cpp}
* #include <opm/simulators/linalg/cuistl/CuVector.hpp>
* #include <opm/simulators/linalg/cuistl/GpuVector.hpp>
*
* void someFunction() {
* auto someDataOnCPU = std::vector<double>({1.0, 2.0, 42.0, 59.9451743, 10.7132692});
*
* auto dataOnGPU = CuVector<double>(someDataOnCPU);
* auto dataOnGPU = GpuVector<double>(someDataOnCPU);
*
* // Multiply by 4.0:
* dataOnGPU *= 4.0;
@ -62,7 +62,7 @@ namespace Opm::gpuistl
* @tparam T the type to store. Can be either float, double or int.
*/
template <typename T>
class CuVector
class GpuVector
{
public:
using field_type = T;
@ -70,17 +70,17 @@ public:
/**
* @brief CuVector allocates new GPU memory of the same size as other and copies the content of the other vector to
* @brief GpuVector allocates new GPU memory of the same size as other and copies the content of the other vector to
* this newly allocated memory.
*
* @note This does synchronous transfer.
*
* @param other the vector to copy from
*/
CuVector(const CuVector<T>& other);
GpuVector(const GpuVector<T>& other);
/**
* @brief CuVector allocates new GPU memory of the same size as data and copies the content of the data vector to
* @brief GpuVector allocates new GPU memory of the same size as data and copies the content of the data vector to
* this newly allocated memory.
*
* @note This does CPU to GPU transfer.
@ -90,7 +90,7 @@ public:
*
* @param data the vector to copy from
*/
explicit CuVector(const std::vector<T>& data);
explicit GpuVector(const std::vector<T>& data);
/**
* @brief operator= copies the content of the data vector to the memory of this vector.
@ -100,7 +100,7 @@ public:
*
* @param other the vector to copy from
*/
CuVector& operator=(const CuVector<T>& other);
GpuVector& operator=(const GpuVector<T>& other);
/**
* @brief operator= sets the whole vector equal to the scalar value.
@ -109,20 +109,20 @@ public:
*
* @param scalar the value all elements will be set to.
*/
CuVector& operator=(T scalar);
GpuVector& operator=(T scalar);
/**
* @brief CuVector allocates new GPU memory of size numberOfElements * sizeof(T)
* @brief GpuVector allocates new GPU memory of size numberOfElements * sizeof(T)
*
* @note For now numberOfElements needs to be within the limits of int due to restrictions in cublas
*
* @param numberOfElements number of T elements to allocate
*/
explicit CuVector(const size_t numberOfElements);
explicit GpuVector(const size_t numberOfElements);
/**
* @brief CuVector allocates new GPU memory of size numberOfElements * sizeof(T) and copies numberOfElements from
* @brief GpuVector allocates new GPU memory of size numberOfElements * sizeof(T) and copies numberOfElements from
* data
*
* @note This assumes the data is on the CPU.
@ -132,12 +132,12 @@ public:
*
* @note For now numberOfElements needs to be within the limits of int due to restrictions in cublas
*/
CuVector(const T* dataOnHost, const size_t numberOfElements);
GpuVector(const T* dataOnHost, const size_t numberOfElements);
/**
* @brief ~CuVector calls cudaFree
* @brief ~GpuVector calls cudaFree
*/
virtual ~CuVector();
virtual ~GpuVector();
/**
* @return the raw pointer to the GPU data
@ -162,7 +162,7 @@ public:
// TODO: [perf] vector.dim() can be replaced by bvector.N() * BlockDimension
if (detail::to_size_t(m_numberOfElements) != bvector.dim()) {
OPM_THROW(std::runtime_error,
fmt::format("Given incompatible vector size. CuVector has size {}, \n"
fmt::format("Given incompatible vector size. GpuVector has size {}, \n"
"however, BlockVector has N() = {}, and dim = {}.",
m_numberOfElements,
bvector.N(),
@ -185,7 +185,7 @@ public:
// TODO: [perf] vector.dim() can be replaced by bvector.N() * BlockDimension
if (detail::to_size_t(m_numberOfElements) != bvector.dim()) {
OPM_THROW(std::runtime_error,
fmt::format("Given incompatible vector size. CuVector has size {},\n however, the BlockVector "
fmt::format("Given incompatible vector size. GpuVector has size {},\n however, the BlockVector "
"has has N() = {}, and dim() = {}.",
m_numberOfElements,
bvector.N(),
@ -231,8 +231,8 @@ public:
*/
void copyToHost(std::vector<T>& data) const;
void prepareSendBuf(CuVector<T>& buffer, const CuVector<int>& indexSet) const;
void syncFromRecvBuf(CuVector<T>& buffer, const CuVector<int>& indexSet) const;
void prepareSendBuf(GpuVector<T>& buffer, const GpuVector<int>& indexSet) const;
void syncFromRecvBuf(GpuVector<T>& buffer, const GpuVector<int>& indexSet) const;
/**
* @brief operator *= multiplies every element by scalar
@ -242,7 +242,7 @@ public:
*
* @note int is not supported
*/
CuVector<T>& operator*=(const T& scalar);
GpuVector<T>& operator*=(const T& scalar);
/**
* @brief axpy sets this vector equal to this + alpha * y
@ -252,7 +252,7 @@ public:
* @note this will call CuBlas in the background
* @note int is not supported
*/
CuVector<T>& axpy(T alpha, const CuVector<T>& y);
GpuVector<T>& axpy(T alpha, const GpuVector<T>& y);
/**
* @brief operator+= adds the other vector to this vector
@ -260,7 +260,7 @@ public:
* @note this will call CuBlas in the background
* @note int is not supported
*/
CuVector<T>& operator+=(const CuVector<T>& other);
GpuVector<T>& operator+=(const GpuVector<T>& other);
/**
* @brief operator-= subtracts the other vector from this vector
@ -268,7 +268,7 @@ public:
* @note this will call CuBlas in the background
* @note int is not supported
*/
CuVector<T>& operator-=(const CuVector<T>& other);
GpuVector<T>& operator-=(const GpuVector<T>& other);
/**
* @brief dot computes the dot product (standard inner product) against the other vector
@ -278,7 +278,7 @@ public:
*
* @return the result on the inner product
*/
T dot(const CuVector<T>& other) const;
T dot(const GpuVector<T>& other) const;
/**
* @brief returns the l2 norm of the vector
@ -294,14 +294,14 @@ public:
*
* @note int is not supported
*/
T dot(const CuVector<T>& other, const CuVector<int>& indexSet, CuVector<T>& buffer) const;
T dot(const GpuVector<T>& other, const GpuVector<int>& indexSet, GpuVector<T>& buffer) const;
/**
* Computes the norm sqrt(sum_i this[indexSet[i]] * this[indexSet[i]])
*
* @note int is not supported
*/
T two_norm(const CuVector<int>& indexSet, CuVector<T>& buffer) const;
T two_norm(const GpuVector<int>& indexSet, GpuVector<T>& buffer) const;
/**
@ -309,14 +309,14 @@ public:
*
* @note int is not supported
*/
T dot(const CuVector<T>& other, const CuVector<int>& indexSet) const;
T dot(const GpuVector<T>& other, const GpuVector<int>& indexSet) const;
/**
* Computes the norm sqrt(sum_i this[indexSet[i]] * this[indexSet[i]])
*
* @note int is not supported
*/
T two_norm(const CuVector<int>& indexSet) const;
T two_norm(const GpuVector<int>& indexSet) const;
/**
@ -363,9 +363,9 @@ public:
* }
* @endcode
*/
void setZeroAtIndexSet(const CuVector<int>& indexSet);
void setZeroAtIndexSet(const GpuVector<int>& indexSet);
// Slow method that creates a string representation of a CuVector for debug purposes
// Slow method that creates a string representation of a GpuVector for debug purposes
std::string toDebugString()
{
std::vector<T> v = asStdVector();
@ -385,7 +385,7 @@ private:
const int m_numberOfElements;
detail::CuBlasHandle& m_cuBlasHandle;
void assertSameSize(const CuVector<T>& other) const;
void assertSameSize(const GpuVector<T>& other) const;
void assertSameSize(int size) const;
void assertHasElements() const;
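The indexed operations above have no example in the header; a minimal sketch, assuming the index sets hold plain element indices as they do in GpuOwnerOverlapCopy:

    auto v = Opm::gpuistl::GpuVector<double>(std::vector<double>{1.0, 2.0, 3.0, 4.0});
    auto w = Opm::gpuistl::GpuVector<double>(std::vector<double>{2.0, 2.0, 2.0, 2.0});
    auto indexSet = Opm::gpuistl::GpuVector<int>(std::vector<int>{0, 2});

    double d = v.dot(w, indexSet);   // 1*2 + 3*2 = 8
    double n = v.two_norm(indexSet); // sqrt(1*1 + 3*3)
    v.setZeroAtIndexSet(indexSet);   // v is now {0, 2, 0, 4}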

View File

@ -27,7 +27,7 @@
#include <opm/common/TimingMacros.hpp>
#include <opm/simulators/linalg/GraphColoring.hpp>
#include <opm/simulators/linalg/cuistl/GpuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/GpuVector.hpp>
#include <opm/simulators/linalg/cuistl/OpmCuILU0.hpp>
#include <opm/simulators/linalg/cuistl/detail/autotuner.hpp>
#include <opm/simulators/linalg/cuistl/detail/coloringAndReorderingUtils.hpp>
@ -72,7 +72,7 @@ OpmCuILU0<M, X, Y, l>::OpmCuILU0(const M& A, bool splitMatrix, bool tuneKernels)
m_gpuMatrix.nonzeroes(),
A.nonzeroes()));
if (m_splitMatrix) {
m_gpuMatrixReorderedDiag.emplace(CuVector<field_type>(blocksize_ * blocksize_ * m_cpuMatrix.N()));
m_gpuMatrixReorderedDiag.emplace(GpuVector<field_type>(blocksize_ * blocksize_ * m_cpuMatrix.N()));
std::tie(m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper)
= detail::extractLowerAndUpperMatrices<M, field_type, GpuSparseMatrix<field_type>>(m_cpuMatrix,
m_reorderedToNatural);
@ -272,8 +272,8 @@ OpmCuILU0<M, X, Y, l>::tuneThreadBlockSizes()
= detail::tuneThreadBlockSize(tuneFactorizationThreadBlockSizeInUpdate, "Kernel computing ILU0 factorization");
// tune the thread-block size of the apply
CuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
CuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
GpuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
GpuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
tmpD = 1;
auto tuneLowerSolveThreadBlockSizeInApply = [this, &tmpV, &tmpD](int lowerSolveThreadBlockSize) {
@ -292,11 +292,11 @@ OpmCuILU0<M, X, Y, l>::tuneThreadBlockSizes()
} // namespace Opm::gpuistl
#define INSTANTIATE_CUDILU_DUNE(realtype, blockdim) \
template class ::Opm::gpuistl::OpmCuILU0<Dune::BCRSMatrix<Dune::FieldMatrix<realtype, blockdim, blockdim>>, \
::Opm::gpuistl::CuVector<realtype>, \
::Opm::gpuistl::CuVector<realtype>>; \
::Opm::gpuistl::GpuVector<realtype>, \
::Opm::gpuistl::GpuVector<realtype>>; \
template class ::Opm::gpuistl::OpmCuILU0<Dune::BCRSMatrix<Opm::MatrixBlock<realtype, blockdim, blockdim>>, \
::Opm::gpuistl::CuVector<realtype>, \
::Opm::gpuistl::CuVector<realtype>>
::Opm::gpuistl::GpuVector<realtype>, \
::Opm::gpuistl::GpuVector<realtype>>
INSTANTIATE_CUDILU_DUNE(double, 1);
INSTANTIATE_CUDILU_DUNE(double, 2);

View File

@ -23,7 +23,7 @@
#include <opm/grid/utility/SparseTable.hpp>
#include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
#include <opm/simulators/linalg/cuistl/GpuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/GpuVector.hpp>
#include <optional>
#include <type_traits>
#include <vector>
@ -39,7 +39,7 @@ namespace Opm::gpuistl
//! \tparam l Ignored. Just there to have the same number of template arguments
//! as other preconditioners.
//!
//! \note We assume X and Y are both CuVector<real_type>, but we leave them as template
//! \note We assume X and Y are both GpuVector<real_type>, but we leave them as template
//! arguments in case of future additions.
template <class M, class X, class Y, int l = 1>
class OpmCuILU0 : public Dune::PreconditionerWithUpdate<X, Y>
@ -126,13 +126,13 @@ private:
std::unique_ptr<CuMat> m_gpuMatrixReorderedLower;
std::unique_ptr<CuMat> m_gpuMatrixReorderedUpper;
//! \brief If matrix splitting is enabled, we also store the diagonal separately
std::optional<CuVector<field_type>> m_gpuMatrixReorderedDiag;
std::optional<GpuVector<field_type>> m_gpuMatrixReorderedDiag;
//! row conversion from natural to reordered matrix indices stored on the GPU
CuVector<int> m_gpuNaturalToReorder;
GpuVector<int> m_gpuNaturalToReorder;
//! row conversion from reordered to natural matrix indices stored on the GPU
CuVector<int> m_gpuReorderToNatural;
GpuVector<int> m_gpuReorderToNatural;
//! \brief Stores the inverted diagonal that we use in ILU0
CuVector<field_type> m_gpuDInv;
GpuVector<field_type> m_gpuDInv;
//! \brief Bool storing whether or not we should store matrices in a split format
bool m_splitMatrix;
//! \brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards

View File

@ -21,7 +21,7 @@
#include <cusparse.h>
#include <dune/istl/preconditioner.hh>
#include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/GpuVector.hpp>
#include <opm/simulators/linalg/cuistl/PreconditionerHolder.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditioner_should_call_post_pre.hpp>
@ -35,11 +35,11 @@ namespace Opm::gpuistl
//!
//! \tparam X the domain type (should be on the CPU). Typically a Dune::BlockVector
//! \tparam Y the range type (should be on the CPU). Typically a Dune::BlockVector
//! \tparam CudaPreconditionerType the preconditioner taking CuVector<real_type> as arguments to apply
//! \tparam CudaPreconditionerType the preconditioner taking GpuVector<real_type> as arguments to apply
template <class X, class Y, class CudaPreconditionerType>
class PreconditionerAdapter
: public Dune::PreconditionerWithUpdate<X, Y>,
public PreconditionerHolder<CuVector<typename X::field_type>, CuVector<typename Y::field_type>>
public PreconditionerHolder<GpuVector<typename X::field_type>, GpuVector<typename Y::field_type>>
{
public:
//! \brief The domain type of the preconditioner.
@ -77,8 +77,8 @@ public:
virtual void apply(X& v, const Y& d) override
{
if (!m_inputBuffer) {
m_inputBuffer.reset(new CuVector<field_type>(v.dim()));
m_outputBuffer.reset(new CuVector<field_type>(v.dim()));
m_inputBuffer.reset(new GpuVector<field_type>(v.dim()));
m_outputBuffer.reset(new GpuVector<field_type>(v.dim()));
}
m_inputBuffer->copyFromHost(d);
m_underlyingPreconditioner->apply(*m_outputBuffer, *m_inputBuffer);
@ -117,7 +117,7 @@ public:
return detail::shouldCallPreconditionerPre<CudaPreconditionerType>();
}
virtual std::shared_ptr<Dune::PreconditionerWithUpdate<CuVector<field_type>, CuVector<field_type>>>
virtual std::shared_ptr<Dune::PreconditionerWithUpdate<GpuVector<field_type>, GpuVector<field_type>>>
getUnderlyingPreconditioner() override
{
return m_underlyingPreconditioner;
@ -131,8 +131,8 @@ private:
//! \brief the underlying preconditioner to use
std::shared_ptr<CudaPreconditionerType> m_underlyingPreconditioner;
std::unique_ptr<CuVector<field_type>> m_inputBuffer;
std::unique_ptr<CuVector<field_type>> m_outputBuffer;
std::unique_ptr<GpuVector<field_type>> m_inputBuffer;
std::unique_ptr<GpuVector<field_type>> m_outputBuffer;
};
} // end namespace Opm::gpuistl
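A usage sketch of the adapter, with SpMatrix and matrix standing in for a concrete Dune::BCRSMatrix type and instance as in the tests below:

    using X = Dune::BlockVector<Dune::FieldVector<double, 2>>;
    using GpuVec = Opm::gpuistl::GpuVector<double>;
    using GpuJacT = Opm::gpuistl::GpuJac<SpMatrix, GpuVec, GpuVec>;

    auto gpuJac = std::make_shared<GpuJacT>(matrix, 0.5); // matrix and relaxation factor
    // apply() lazily allocates the GPU buffers, copies d to the device, runs the
    // underlying preconditioner there, and copies the result back into v.
    Opm::gpuistl::PreconditionerAdapter<X, X, GpuJacT> adapted(gpuJac);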

View File

@ -29,7 +29,7 @@
#include <opm/simulators/linalg/cuistl/GpuBlockPreconditioner.hpp>
#include <opm/simulators/linalg/cuistl/GpuOwnerOverlapCopy.hpp>
#include <opm/simulators/linalg/cuistl/GpuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/GpuVector.hpp>
#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
#include <opm/simulators/linalg/cuistl/detail/has_function.hpp>
@ -56,7 +56,7 @@ public:
using typename Dune::IterativeSolver<X, X>::real_type;
using typename Dune::IterativeSolver<X, X>::scalar_real_type;
static constexpr auto block_size = domain_type::block_type::dimension;
using XGPU = Opm::gpuistl::CuVector<real_type>;
using XGPU = Opm::gpuistl::GpuVector<real_type>;
// TODO: Use a std::forward
SolverAdapter(Operator& op,

View File

@ -18,7 +18,7 @@
*/
#include <config.h>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/GpuVector.hpp>
#include <opm/simulators/linalg/cuistl/detail/cublas_safe_call.hpp>
#include <opm/simulators/linalg/cuistl/detail/cublas_wrapper.hpp>
#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
@ -143,7 +143,7 @@ innerProductAtIndices(cublasHandle_t cublasHandle,
elementWiseMultiplyKernel<<<nThreadBlocks, threadBlockSize>>>(deviceA, deviceB, buffer, numberOfElements, indices);
// TODO: [perf] Get rid of the allocation here.
CuVector<T> oneVector(numberOfElements);
GpuVector<T> oneVector(numberOfElements);
oneVector = 1.0;
T result = 0.0;
OPM_CUBLAS_SAFE_CALL(cublasDot(cublasHandle, numberOfElements, oneVector.data(), 1, buffer, 1, &result));

View File

@ -65,11 +65,11 @@ void syncFromRecvBuf(T* deviceA, T* buffer, size_t numberOfElements, const int*
/**
* @brief Compute the weighted matrix-vector product where the matrix is diagonal and the diagonal is stored as a vector, meaning we
* compute the Hadamard product.
* @param squareBlockVector A CuVector whose elements are NxN matrix blocks
* @param squareBlockVector A GpuVector whose elements are NxN matrix blocks
* @param numberOfRows The number of rows in the vector
* @param blocksize The side length of the square block elements in the vector
* @param src_vec A pointer to the data of the CuVector we multiply the blockvector with
* @param[out] dst_vec A pointer to the data of the CuVector we store the result in
* @param src_vec A pointer to the data of the GpuVector we multiply the blockvector with
* @param[out] dst_vec A pointer to the data of the GpuVector we store the result in
*
* @note This is implemented as a faster way to multiply a diagonal matrix with a blockvector. We need only store the
* diagonal of the matrix and use this product.
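As a plain-CPU reference for what this kernel computes (the function name is hypothetical; the real implementation runs on the device):

    // dst[row] = diagBlock(row) * src[row], block by block.
    void blockDiagonalMv(const double* squareBlockVector, size_t numberOfRows,
                         size_t blocksize, const double* srcVec, double* dstVec)
    {
        for (size_t row = 0; row < numberOfRows; ++row) {
            const double* block = squareBlockVector + row * blocksize * blocksize;
            for (size_t i = 0; i < blocksize; ++i) {
                double sum = 0.0;
                for (size_t j = 0; j < blocksize; ++j) {
                    sum += block[i * blocksize + j] * srcVec[row * blocksize + j];
                }
                dstVec[row * blocksize + i] = sum;
            }
        }
    }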

View File

@ -27,7 +27,7 @@
#include <opm/simulators/linalg/DILU.hpp>
#include <opm/simulators/linalg/cuistl/GpuDILU.hpp>
#include <opm/simulators/linalg/cuistl/GpuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/GpuVector.hpp>
#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <random>
@ -42,8 +42,8 @@ using B2x2Vec = Dune::BlockVector<Dune::FieldVector<double, 2>>;
using Sp1x1BlockMatrix = Dune::BCRSMatrix<FM1x1>;
using Sp2x2BlockMatrix = Dune::BCRSMatrix<FM2x2>;
using CuMatrix = Opm::gpuistl::GpuSparseMatrix<T>;
using CuIntVec = Opm::gpuistl::CuVector<int>;
using CuFloatingPointVec = Opm::gpuistl::CuVector<T>;
using CuIntVec = Opm::gpuistl::GpuVector<int>;
using CuFloatingPointVec = Opm::gpuistl::GpuVector<T>;
using GpuDilu1x1 = Opm::gpuistl::GpuDILU<Sp1x1BlockMatrix, CuFloatingPointVec, CuFloatingPointVec>;
using GpuDilu2x2 = Opm::gpuistl::GpuDILU<Sp2x2BlockMatrix, CuFloatingPointVec, CuFloatingPointVec>;

View File

@ -25,7 +25,7 @@
#include <dune/istl/bcrsmatrix.hh>
#include <opm/simulators/linalg/cuistl/GpuJac.hpp>
#include <opm/simulators/linalg/cuistl/GpuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/GpuVector.hpp>
#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/fix_zero_diagonal.hpp>
@ -49,7 +49,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(GPUJACApplyBlocksize2, T, NumericTypes)
using M = Dune::FieldMatrix<T, blocksize, blocksize>;
using SpMatrix = Dune::BCRSMatrix<M>;
using Vector = Dune::BlockVector<Dune::FieldVector<T, blocksize>>;
using GpuJac = Opm::gpuistl::GpuJac<SpMatrix, Opm::gpuistl::CuVector<T>, Opm::gpuistl::CuVector<T>>;
using GpuJac = Opm::gpuistl::GpuJac<SpMatrix, Opm::gpuistl::GpuVector<T>, Opm::gpuistl::GpuVector<T>>;
SpMatrix B(N, N, nonZeroes, SpMatrix::row_wise);
for (auto row = B.createbegin(); row != B.createend(); ++row) {
@ -103,7 +103,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(GPUJACApplyBlocksize1, T, NumericTypes)
using M = Dune::FieldMatrix<T, blocksize, blocksize>;
using SpMatrix = Dune::BCRSMatrix<M>;
using Vector = Dune::BlockVector<Dune::FieldVector<T, blocksize>>;
using GpuJac = Opm::gpuistl::GpuJac<SpMatrix, Opm::gpuistl::CuVector<T>, Opm::gpuistl::CuVector<T>>;
using GpuJac = Opm::gpuistl::GpuJac<SpMatrix, Opm::gpuistl::GpuVector<T>, Opm::gpuistl::GpuVector<T>>;
SpMatrix B(N, N, nonZeroes, SpMatrix::row_wise);
for (auto row = B.createbegin(); row != B.createend(); ++row) {

View File

@ -27,7 +27,7 @@
#include <dune/istl/owneroverlapcopy.hh>
#include <memory>
#include <opm/simulators/linalg/cuistl/GpuOwnerOverlapCopy.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/GpuVector.hpp>
#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
#include <opm/simulators/linalg/cuistl/set_device.hpp>
#include <random>
@ -62,7 +62,7 @@ BOOST_AUTO_TEST_CASE(TestProject)
auto ownerOverlapCopy = Dune::OwnerOverlapCopyCommunication<int>(indexInfo, MPI_COMM_WORLD);
auto xCPU = std::vector<double> {{1.0, 2.0, 3.0}};
auto xGPU = Opm::gpuistl::CuVector<double>(xCPU);
auto xGPU = Opm::gpuistl::GpuVector<double>(xCPU);
auto gpuComm = std::make_shared<Opm::gpuistl::GPUObliviousMPISender<double, 1, Dune::OwnerOverlapCopyCommunication<int>>>(ownerOverlapCopy);
@ -94,7 +94,7 @@ BOOST_AUTO_TEST_CASE(TestDot)
indexInfo.addRemoteIndex(std::make_tuple(0, 2, Dune::OwnerOverlapCopyAttributeSet::copy));
auto ownerOverlapCopy = Dune::OwnerOverlapCopyCommunication<int>(indexInfo, MPI_COMM_WORLD);
auto xCPU = std::vector<double> {{1.0, 2.0, 3.0}};
auto xGPU = Opm::gpuistl::CuVector<double>(xCPU);
auto xGPU = Opm::gpuistl::GpuVector<double>(xCPU);
auto gpuComm = std::make_shared<Opm::gpuistl::GPUObliviousMPISender<double, 1, Dune::OwnerOverlapCopyCommunication<int>>>(ownerOverlapCopy);

View File

@ -28,7 +28,7 @@
#include <dune/istl/bcrsmatrix.hh>
#include <dune/istl/preconditioners.hh>
#include <opm/simulators/linalg/cuistl/GpuSeqILU0.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/GpuVector.hpp>
#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
@ -63,7 +63,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(TestFiniteDifference1D, T, NumericTypes)
using M = Dune::FieldMatrix<T, 1, 1>;
using SpMatrix = Dune::BCRSMatrix<M>;
using Vector = Dune::BlockVector<Dune::FieldVector<T, 1>>;
using GpuILU0 = Opm::gpuistl::GpuSeqILU0<SpMatrix, Opm::gpuistl::CuVector<T>, Opm::gpuistl::CuVector<T>>;
using GpuILU0 = Opm::gpuistl::GpuSeqILU0<SpMatrix, Opm::gpuistl::GpuVector<T>, Opm::gpuistl::GpuVector<T>>;
SpMatrix B(N, N, nonZeroes, SpMatrix::row_wise);
for (auto row = B.createbegin(); row != B.createend(); ++row) {
@@ -158,7 +158,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(TestFiniteDifferenceBlock2, T, NumericTypes)
using M = Dune::FieldMatrix<T, 2, 2>;
using SpMatrix = Dune::BCRSMatrix<M>;
using Vector = Dune::BlockVector<Dune::FieldVector<T, 2>>;
using GpuILU0 = Opm::gpuistl::GpuSeqILU0<SpMatrix, Opm::gpuistl::CuVector<T>, Opm::gpuistl::CuVector<T>>;
using GpuILU0 = Opm::gpuistl::GpuSeqILU0<SpMatrix, Opm::gpuistl::GpuVector<T>, Opm::gpuistl::GpuVector<T>>;
SpMatrix B(N, N, nonZeroes, SpMatrix::row_wise);
for (auto row = B.createbegin(); row != B.createend(); ++row) {
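
After the rename, the ILU0 preconditioner type used by these finite-difference tests reads as below. A sketch only: the construction from an assembled matrix B and a relaxation weight is an assumption based on how this commit instantiates the class in the factory code.

using MIlu = Dune::FieldMatrix<double, 1, 1>;
using SpMatrixIlu = Dune::BCRSMatrix<MIlu>;
using GpuILU0OfDouble =
    Opm::gpuistl::GpuSeqILU0<SpMatrixIlu, Opm::gpuistl::GpuVector<double>, Opm::gpuistl::GpuVector<double>>;
// auto ilu = std::make_shared<GpuILU0OfDouble>(B, 1.0); // B assembled as in the hunks above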

View File

@@ -24,7 +24,7 @@
#include <dune/istl/bcrsmatrix.hh>
#include <memory>
#include <opm/simulators/linalg/cuistl/GpuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/GpuVector.hpp>
#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
#include <random>
@@ -149,8 +149,8 @@ BOOST_AUTO_TEST_CASE(RandomSparsityMatrix)
std::vector<double> inputDataX(N * dim, 0.0);
inputDataX[component] = 1.0;
std::vector<double> inputDataY(N * dim, .25);
auto inputVectorX = Opm::gpuistl::CuVector<double>(inputDataX.data(), inputDataX.size());
auto inputVectorY = Opm::gpuistl::CuVector<double>(inputDataY.data(), inputDataY.size());
auto inputVectorX = Opm::gpuistl::GpuVector<double>(inputDataX.data(), inputDataX.size());
auto inputVectorY = Opm::gpuistl::GpuVector<double>(inputDataY.data(), inputDataY.size());
Vector xHost(N), yHost(N);
yHost = inputDataY[0];
inputVectorX.copyToHost(xHost);
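
In isolation, the pattern this test exercises looks roughly like the sketch below (includes as in the file above). Note that mv() is an assumption here: the hunks only show the vector transfers, and the product itself is implied by the yHost comparison in the test.

const std::size_t N = 10;  // number of block rows, stands in for the test's N
const std::size_t dim = 3; // block dimension, stands in for the test's dim
std::vector<double> inputDataX(N * dim, 0.0);
std::vector<double> inputDataY(N * dim, 0.25);
auto inputVectorX = Opm::gpuistl::GpuVector<double>(inputDataX.data(), inputDataX.size());
auto inputVectorY = Opm::gpuistl::GpuVector<double>(inputDataY.data(), inputDataY.size());
// Given a Dune::BCRSMatrix B assembled as in the hunks above:
//   auto m = Opm::gpuistl::GpuSparseMatrix<double>::fromMatrix(B);
//   m.mv(inputVectorX, inputVectorY);  // y = B * x on the device (mv() assumed)
//   inputVectorY.copyToHost(inputDataY.data(), inputDataY.size());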

View File

@@ -18,13 +18,13 @@
*/
#include <config.h>
#define BOOST_TEST_MODULE TestCuVector
#define BOOST_TEST_MODULE TestGpuVector
#include <boost/test/unit_test.hpp>
#include <cuda_runtime.h>
#include <dune/common/fvector.hh>
#include <dune/istl/bvector.hh>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/GpuVector.hpp>
#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
#include <random>
@@ -32,7 +32,7 @@ BOOST_AUTO_TEST_CASE(TestDocumentedUsage)
{
auto someDataOnCPU = std::vector<double>({1.0, 2.0, 42.0, 59.9451743, 10.7132692});
auto dataOnGPU = ::Opm::gpuistl::CuVector<double>(someDataOnCPU);
auto dataOnGPU = ::Opm::gpuistl::GpuVector<double>(someDataOnCPU);
// Multiply by 4.0:
dataOnGPU *= 4.0;
@@ -50,14 +50,14 @@ BOOST_AUTO_TEST_CASE(TestDocumentedUsage)
BOOST_AUTO_TEST_CASE(TestConstructionSize)
{
const int numberOfElements = 1234;
auto vectorOnGPU = Opm::gpuistl::CuVector<double>(numberOfElements);
auto vectorOnGPU = Opm::gpuistl::GpuVector<double>(numberOfElements);
BOOST_CHECK_EQUAL(numberOfElements, vectorOnGPU.dim());
}
BOOST_AUTO_TEST_CASE(TestCopyFromHostConstructor)
{
std::vector<double> data {{1, 2, 3, 4, 5, 6, 7}};
auto vectorOnGPU = Opm::gpuistl::CuVector<double>(data.data(), data.size());
auto vectorOnGPU = Opm::gpuistl::GpuVector<double>(data.data(), data.size());
BOOST_CHECK_EQUAL(data.size(), vectorOnGPU.dim());
std::vector<double> buffer(data.size(), 0.0);
vectorOnGPU.copyToHost(buffer.data(), buffer.size());
@@ -68,7 +68,7 @@ BOOST_AUTO_TEST_CASE(TestCopyFromHostConstructor)
BOOST_AUTO_TEST_CASE(TestCopyFromHostFunction)
{
std::vector<double> data {{1, 2, 3, 4, 5, 6, 7}};
auto vectorOnGPU = Opm::gpuistl::CuVector<double>(data.size());
auto vectorOnGPU = Opm::gpuistl::GpuVector<double>(data.size());
BOOST_CHECK_EQUAL(data.size(), vectorOnGPU.dim());
vectorOnGPU.copyFromHost(data.data(), data.size());
std::vector<double> buffer(data.size(), 0.0);
@@ -80,7 +80,7 @@ BOOST_AUTO_TEST_CASE(TestCopyFromHostFunction)
BOOST_AUTO_TEST_CASE(TestCopyFromBvector)
{
auto blockVector = Dune::BlockVector<Dune::FieldVector<double, 2>> {{{42, 43}, {44, 45}, {46, 47}}};
auto vectorOnGPU = Opm::gpuistl::CuVector<double>(blockVector.dim());
auto vectorOnGPU = Opm::gpuistl::GpuVector<double>(blockVector.dim());
vectorOnGPU.copyFromHost(blockVector);
std::vector<double> buffer(vectorOnGPU.dim());
vectorOnGPU.copyToHost(buffer.data(), buffer.size());
@@ -93,7 +93,7 @@ BOOST_AUTO_TEST_CASE(TestCopyToBvector)
{
std::vector<double> data {{1, 2, 3, 4, 5, 6, 7, 8, 9}};
auto blockVector = Dune::BlockVector<Dune::FieldVector<double, 3>>(3);
auto vectorOnGPU = Opm::gpuistl::CuVector<double>(data.data(), data.size());
auto vectorOnGPU = Opm::gpuistl::GpuVector<double>(data.data(), data.size());
vectorOnGPU.copyToHost(blockVector);
@@ -103,7 +103,7 @@ BOOST_AUTO_TEST_CASE(TestCopyToBvector)
BOOST_AUTO_TEST_CASE(TestDataPointer)
{
std::vector<double> data {{1, 2, 3, 4, 5, 6, 7, 8, 9}};
auto vectorOnGPU = Opm::gpuistl::CuVector<double>(data.data(), data.size());
auto vectorOnGPU = Opm::gpuistl::GpuVector<double>(data.data(), data.size());
std::vector<double> buffer(data.size(), 0.0);
OPM_CUDA_SAFE_CALL(cudaMemcpy(buffer.data(), vectorOnGPU.data(), sizeof(double) * data.size(), cudaMemcpyDeviceToHost));
@@ -113,7 +113,7 @@ BOOST_AUTO_TEST_CASE(TestDataPointer)
BOOST_AUTO_TEST_CASE(TestCopyScalarMultiply)
{
std::vector<double> data {{1, 2, 3, 4, 5, 6, 7}};
auto vectorOnGPU = Opm::gpuistl::CuVector<double>(data.data(), data.size());
auto vectorOnGPU = Opm::gpuistl::GpuVector<double>(data.data(), data.size());
BOOST_CHECK_EQUAL(data.size(), vectorOnGPU.dim());
const double scalar = 42.25;
vectorOnGPU *= scalar;
@@ -128,7 +128,7 @@ BOOST_AUTO_TEST_CASE(TestCopyScalarMultiply)
BOOST_AUTO_TEST_CASE(TestTwoNorm)
{
std::vector<double> data {{1, 2, 3, 4, 5, 6, 7}};
auto vectorOnGPU = Opm::gpuistl::CuVector<double>(data.data(), data.size());
auto vectorOnGPU = Opm::gpuistl::GpuVector<double>(data.data(), data.size());
auto twoNorm = vectorOnGPU.two_norm();
double correctAnswer = 0.0;
@@ -143,8 +143,8 @@ BOOST_AUTO_TEST_CASE(TestDot)
{
std::vector<double> dataA {{1, 2, 3, 4, 5, 6, 7}};
std::vector<double> dataB {{8, 9, 10, 11, 12, 13, 14}};
auto vectorOnGPUA = Opm::gpuistl::CuVector<double>(dataA.data(), dataA.size());
auto vectorOnGPUB = Opm::gpuistl::CuVector<double>(dataB.data(), dataB.size());
auto vectorOnGPUA = Opm::gpuistl::GpuVector<double>(dataA.data(), dataA.size());
auto vectorOnGPUB = Opm::gpuistl::GpuVector<double>(dataB.data(), dataB.size());
auto dot = vectorOnGPUA.dot(vectorOnGPUB);
double correctAnswer = 0.0;
@@ -158,7 +158,7 @@ BOOST_AUTO_TEST_CASE(TestDot)
BOOST_AUTO_TEST_CASE(Assigment)
{
std::vector<double> data {{1, 2, 3, 4, 5, 6, 7}};
auto vectorOnGPU = Opm::gpuistl::CuVector<double>(data.data(), data.size());
auto vectorOnGPU = Opm::gpuistl::GpuVector<double>(data.data(), data.size());
vectorOnGPU = 10.0;
vectorOnGPU.copyToHost(data.data(), data.size());
@@ -171,9 +171,9 @@ BOOST_AUTO_TEST_CASE(Assigment)
BOOST_AUTO_TEST_CASE(CopyAssignment)
{
std::vector<double> data {{1, 2, 3, 4, 5, 6, 7}};
auto vectorOnGPU = Opm::gpuistl::CuVector<double>(data.data(), data.size());
auto vectorOnGPU = Opm::gpuistl::GpuVector<double>(data.data(), data.size());
vectorOnGPU.copyToHost(data.data(), data.size());
auto vectorOnGPUB = Opm::gpuistl::CuVector<double>(data.size());
auto vectorOnGPUB = Opm::gpuistl::GpuVector<double>(data.size());
vectorOnGPUB = 4.0;
vectorOnGPUB = vectorOnGPU;
@@ -185,7 +185,7 @@ BOOST_AUTO_TEST_CASE(CopyAssignment)
BOOST_AUTO_TEST_CASE(RandomVectors)
{
using GVector = Opm::gpuistl::CuVector<double>;
using GVector = Opm::gpuistl::GpuVector<double>;
std::srand(0);
std::mt19937 generator;
std::uniform_real_distribution<double> distribution(-100.0, 100.0);
@@ -268,7 +268,7 @@ BOOST_AUTO_TEST_CASE(RandomVectors)
indexSet.push_back(i);
}
}
auto indexSetGPU = Opm::gpuistl::CuVector<int>(indexSet);
auto indexSetGPU = Opm::gpuistl::GpuVector<int>(indexSet);
aGPU.setZeroAtIndexSet(indexSetGPU);
auto projectedA = aGPU.asStdVector();
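
Taken together, this test file walks the entire renamed GpuVector surface. As a compact reference, every call in the sketch below appears in the hunks above:

#include <opm/simulators/linalg/cuistl/GpuVector.hpp>
#include <vector>

inline void gpuVectorTour()
{
    auto host = std::vector<double> {1.0, 2.0, 42.0};
    auto v = Opm::gpuistl::GpuVector<double>(host);                     // construct from a std::vector
    auto w = Opm::gpuistl::GpuVector<double>(host.data(), host.size()); // construct from pointer + size
    auto z = Opm::gpuistl::GpuVector<double>(host.size());              // construct by size, then fill:
    z.copyFromHost(host.data(), host.size());

    v *= 4.0;               // scalar multiplication on the device
    v = 10.0;               // scalar assignment
    z = v;                  // device-to-device copy assignment
    auto n2 = v.two_norm(); // Euclidean norm
    auto d = v.dot(w);      // inner product
    (void)n2;
    (void)d;

    auto indexSet = Opm::gpuistl::GpuVector<int>(std::vector<int> {0, 2});
    v.setZeroAtIndexSet(indexSet); // zero the listed components
    auto back = v.asStdVector();   // copy the result back to the host
    (void)back;
}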

View File

@@ -24,7 +24,7 @@
#include <cuda_runtime.h>
#include <dune/istl/bcrsmatrix.hh>
#include <opm/simulators/linalg/cuistl/GpuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/GpuVector.hpp>
#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/fix_zero_diagonal.hpp>
@@ -86,7 +86,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith3By3Blocks, T, NumericTypes)
B[1][1][2][2] = -1.0;
Opm::gpuistl::GpuSparseMatrix<T> m = Opm::gpuistl::GpuSparseMatrix<T>::fromMatrix(B);
Opm::gpuistl::CuVector<T> dInvDiag(blocksize * blocksize * N);
Opm::gpuistl::GpuVector<T> dInvDiag(blocksize * blocksize * N);
Opm::gpuistl::detail::JAC::invertDiagonalAndFlatten<T, 3>(
m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, dInvDiag.data());
@@ -160,7 +160,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith2By2Blocks, T, NumericTypes)
B[1][1][1][1] = -1.0;
Opm::gpuistl::GpuSparseMatrix<T> m = Opm::gpuistl::GpuSparseMatrix<T>::fromMatrix(B);
Opm::gpuistl::CuVector<T> dInvDiag(blocksize * blocksize * N);
Opm::gpuistl::GpuVector<T> dInvDiag(blocksize * blocksize * N);
Opm::gpuistl::detail::JAC::invertDiagonalAndFlatten<T, 2>(
m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, dInvDiag.data());
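
For orientation: the kernel takes the matrix's CSR-style arrays plus an output buffer of blocksize * blocksize * N entries, one inverted diagonal block per block row. In the sketch below, T, m and N are as in the test above; the per-argument notes are inferred from the accessor names, and the flattening order inside each block is not visible in this diff.

constexpr int blocksizeSketch = 3;
Opm::gpuistl::GpuVector<T> dInvDiagSketch(blocksizeSketch * blocksizeSketch * N);
Opm::gpuistl::detail::JAC::invertDiagonalAndFlatten<T, blocksizeSketch>(
    m.getNonZeroValues().data(),  // nonzero block values (device)
    m.getRowIndices().data(),     // row offsets into the nonzeros (device)
    m.getColumnIndices().data(),  // column index per nonzero block (device)
    N,                            // number of block rows
    dInvDiagSketch.data());       // out: N inverted diagonal blocks, flattened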

View File

@@ -18,13 +18,13 @@
*/
#include <config.h>
#define BOOST_TEST_MODULE TestCuVectorOperations
#define BOOST_TEST_MODULE TestGpuVectorOperations
#include <boost/mpl/list.hpp>
#include <boost/test/unit_test.hpp>
#include <cuda_runtime.h>
#include <dune/istl/bcrsmatrix.hh>
#include <opm/simulators/linalg/cuistl/GpuJac.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/GpuVector.hpp>
#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
@@ -47,9 +47,9 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(ElementWiseMultiplicationOf3By3BlockVectorAndVecto
std::vector<T> hostBlockVector({1.0, 2.0, 3.0, 5.0, 2.0, 3.0, 2.0, 1.0, 2.0});
std::vector<T> hostVecVector({3.0, 2.0, 1.0});
std::vector<T> hostDstVector({0, 0, 0});
Opm::gpuistl::CuVector<T> deviceBlockVector(hostBlockVector);
Opm::gpuistl::CuVector<T> deviceVecVector(hostVecVector);
Opm::gpuistl::CuVector<T> deviceDstVector(hostDstVector);
Opm::gpuistl::GpuVector<T> deviceBlockVector(hostBlockVector);
Opm::gpuistl::GpuVector<T> deviceVecVector(hostVecVector);
Opm::gpuistl::GpuVector<T> deviceDstVector(hostDstVector);
Opm::gpuistl::detail::weightedDiagMV(
deviceBlockVector.data(), N, blocksize, weight, deviceVecVector.data(), deviceDstVector.data());
@@ -81,9 +81,9 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(ElementWiseMultiplicationOf2By2BlockVectorAndVecto
std::vector<T> hostBlockVector({1.0, 2.0, 3.0, 4.0, 4.0, 3.0, 2.0, 1.0});
std::vector<T> hostVecVector({1.0, 3.0, 2.0, 4.0});
std::vector<T> hostDstVector({0, 0, 0, 0});
Opm::gpuistl::CuVector<T> deviceBlockVector(hostBlockVector);
Opm::gpuistl::CuVector<T> deviceVecVector(hostVecVector);
Opm::gpuistl::CuVector<T> deviceDstVector(hostDstVector);
Opm::gpuistl::GpuVector<T> deviceBlockVector(hostBlockVector);
Opm::gpuistl::GpuVector<T> deviceVecVector(hostVecVector);
Opm::gpuistl::GpuVector<T> deviceDstVector(hostDstVector);
Opm::gpuistl::detail::weightedDiagMV(
deviceBlockVector.data(), N, blocksize, weight, deviceVecVector.data(), deviceDstVector.data());
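
The dimensions in these two cases pin down what weightedDiagMV computes: with N diagonal blocks of size b, the block data holds N*b*b scalars and both vectors hold N*b, so the kernel forms dst = weight * D * v for the block-diagonal matrix D assembled from the flattened blocks. A standalone restatement of the 2x2 case, with row-major block storage assumed:

#include <opm/simulators/linalg/cuistl/GpuVector.hpp>
#include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
#include <vector>

inline void weightedDiagMVSketch()
{
    const double weight = 0.5;
    std::vector<double> blocks {1.0, 2.0, 3.0, 4.0,  // block 0 (row-major assumed)
                                4.0, 3.0, 2.0, 1.0}; // block 1
    Opm::gpuistl::GpuVector<double> dBlocks(blocks);
    Opm::gpuistl::GpuVector<double> dV(std::vector<double> {1.0, 3.0, 2.0, 4.0});
    Opm::gpuistl::GpuVector<double> dDst(std::vector<double>(4, 0.0));
    Opm::gpuistl::detail::weightedDiagMV(dBlocks.data(), // N*b*b flattened block entries
                                         2,              // N: number of diagonal blocks
                                         2,              // b: block dimension
                                         weight,         // scalar weight
                                         dV.data(),      // input vector, length N*b
                                         dDst.data());   // output: weight * D * v
}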