Merge pull request #5554 from multitalentloes/refactor_cuistl

refactor cuistl to gpuistl
2025-02-25 18:55:30 -06:00 · 2024-08-26 09:55:13 +02:00 · 2024-08-26 09:55:13 +02:00 · f97389d1b5
commit f97389d1b5
parent 2e15a04db5 d14fed904a
82 changed files with 1035 additions and 1035 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -530,7 +530,7 @@ if(CUDA_FOUND)
  if (NOT USE_HIP)
    target_link_libraries( opmsimulators PUBLIC ${CUDA_cusparse_LIBRARY} )
    target_link_libraries( opmsimulators PUBLIC ${CUDA_cublas_LIBRARY} )
-    foreach(tgt test_cuda_safe_call test_cuda_check_last_error test_cuvector)
+    foreach(tgt test_gpu_safe_call test_cuda_check_last_error test_GpuVector)
      target_link_libraries(${tgt} CUDA::cudart)
    endforeach()
  endif()
@ -545,21 +545,21 @@ if(CUDA_FOUND)
  endif()
  set_tests_properties(cusparse_safe_call
                       cublas_safe_call
-                       cuda_safe_call
+                       gpu_safe_call
                       cuda_check_last_error
                       cublas_handle
-                       cujac
-                       cudilu
+                       GpuJac
+                       GpuDILU
                       cusparse_handle
                       cuSparse_matrix_operations
                       cuVector_operations
-                       cuvector
-                       cusparsematrix
-                       cuseqilu0
-                       cuowneroverlapcopy
+                       GpuVector
+                       GpuSparseMatrix
+                       GpuSeqILU0
+                       GpuOwnerOverlapCopy
                       solver_adapter
-                       cubuffer
-                       cuview
+                       GpuBuffer
+                       GpuView
                       PROPERTIES LABELS ${gpu_label})
 endif()

--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@ -28,15 +28,15 @@
 # hipification, we a dependency that will trigger when the cuda source code is
 # changed.
 macro (ADD_CUDA_OR_HIP_FILE LIST DIR FILE)
-  set (cuda_file_path "${PROJECT_SOURCE_DIR}/${DIR}/cuistl/${FILE}")
+  set (cuda_file_path "${PROJECT_SOURCE_DIR}/${DIR}/gpuistl/${FILE}")

  if(CUDA_FOUND AND NOT CONVERT_CUDA_TO_HIP)
-    list (APPEND ${LIST} "${DIR}/cuistl/${FILE}")
+    list (APPEND ${LIST} "${DIR}/gpuistl/${FILE}")
  else()
    # we must hipify the code
    # and include the correct path which is in the build/binary dir
    string(REPLACE ".cu" ".hip" HIP_SOURCE_FILE ${FILE})
-    set (hip_file_path "${PROJECT_BINARY_DIR}/${DIR}/hipistl/${HIP_SOURCE_FILE}")
+    set (hip_file_path "${PROJECT_BINARY_DIR}/${DIR}/gpuistl_hip/${HIP_SOURCE_FILE}")
    file(RELATIVE_PATH relpath ${PROJECT_SOURCE_DIR} ${hip_file_path})

    # add a custom command that will hipify
@ -207,42 +207,42 @@ endif()
 # add these files if we should compile the hip code
 if (HAVE_CUDA)
  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/CuBlasHandle.cpp)
-  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/cusparse_matrix_operations.cu)
+  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/gpusparse_matrix_operations.cu)
  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/CuSparseHandle.cpp)
-  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuBuffer.cpp)
+  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg GpuBuffer.cpp)
  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/DILUKernels.cu)
  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/ILU0Kernels.cu)
  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/JacKernels.cu)
-  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuVector.cpp)
-  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuView.cpp)
+  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg GpuVector.cpp)
+  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg GpuView.cpp)
  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/vector_operations.cu)
-  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuSparseMatrix.cpp)
-  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuDILU.cpp)
+  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg GpuSparseMatrix.cpp)
+  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg GpuDILU.cpp)
  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg OpmCuILU0.cpp)
-  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuJac.cpp)
-  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuSeqILU0.cpp)
+  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg GpuJac.cpp)
+  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg GpuSeqILU0.cpp)
  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg set_device.cpp)

  # HEADERS
  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/autotuner.hpp)
  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/coloringAndReorderingUtils.hpp)
-  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cuda_safe_call.hpp)
-  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cusparse_matrix_operations.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/gpu_safe_call.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/gpusparse_matrix_operations.hpp)
  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cusparse_safe_call.hpp)
  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cublas_safe_call.hpp)
  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cuda_check_last_error.hpp)
  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/CuBlasHandle.hpp)
  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/CuSparseHandle.hpp)
-  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuBuffer.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg GpuBuffer.hpp)
  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/preconditionerKernels/DILUKernels.hpp)
  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/preconditionerKernels/ILU0Kernels.hpp)
  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/preconditionerKernels/JacKernels.hpp)
-  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuDILU.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg GpuDILU.hpp)
  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg OpmCuILU0.hpp)
-  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuJac.hpp)
-  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuVector.hpp)
-  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuView.hpp)
-  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuSparseMatrix.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg GpuJac.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg GpuVector.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg GpuView.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg GpuSparseMatrix.hpp)
  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/CuMatrixDescription.hpp)
  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/CuSparseResource.hpp)
  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/CuSparseResource_impl.hpp)
@ -256,12 +256,12 @@ if (HAVE_CUDA)
  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/deviceBlockOperations.hpp)
  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/gpuThreadUtils.hpp)
  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg PreconditionerAdapter.hpp)
-  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuSeqILU0.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg GpuSeqILU0.hpp)
  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/fix_zero_diagonal.hpp)
  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg PreconditionerConvertFieldTypeAdapter.hpp)
-  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuOwnerOverlapCopy.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg GpuOwnerOverlapCopy.hpp)
  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg SolverAdapter.hpp)
-  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuBlockPreconditioner.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg GpuBlockPreconditioner.hpp)
  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg PreconditionerHolder.hpp)
  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg set_device.hpp)
 endif()
@ -389,19 +389,19 @@ if (HAVE_CUDA)
  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_converttofloatadapter.cpp)
  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_cublas_handle.cpp)
  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_cublas_safe_call.cpp)
-  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_cubuffer.cu)
-  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_cuview.cu)
+  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_GpuBuffer.cu)
+  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_GpuView.cu)
  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_cusparse_safe_call.cpp)
-  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_cuda_safe_call.cpp)
+  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_gpu_safe_call.cpp)
  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_cuda_check_last_error.cpp)
-  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_cudilu.cpp)
-  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_cujac.cpp)
-  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_cuowneroverlapcopy.cpp)
-  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_cuseqilu0.cpp)
+  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_GpuDILU.cpp)
+  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_GpuJac.cpp)
+  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_GpuOwnerOverlapCopy.cpp)
+  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_GpuSeqILU0.cpp)
  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_cusparse_handle.cpp)
  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_cuSparse_matrix_operations.cpp)
-  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_cusparsematrix.cpp)
-  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_cuvector.cpp)
+  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_GpuSparseMatrix.cpp)
+  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_GpuVector.cpp)
  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_cuVector_operations.cpp)
  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_safe_conversion.cpp)
  ADD_CUDA_OR_HIP_FILE(TEST_SOURCE_FILES tests test_solver_adapter.cpp)
--- a/bin/hipify_file.sh
+++ b/bin/hipify_file.sh
@ -15,6 +15,6 @@ hipify-perl $input_file > $output_file
 sed -i 's/^#include <hipblas\.h>/#include <hipblas\/hipblas.h>/g' $output_file
 sed -i 's/^#include <hipsparse\.h>/#include <hipsparse\/hipsparse.h>/g' $output_file
 # make sure includes refer to hipistl/ files (the ones that are also hipified)
-sed -i 's/cuistl\//hipistl\//g' $output_file
+sed -i 's/gpuistl\//gpuistl_hip\//g' $output_file

 echo "$output_file hipified"
--- a/opm/simulators/flow/Main.cpp
+++ b/opm/simulators/flow/Main.cpp
@ -36,7 +36,7 @@
 #endif

 #if HAVE_CUDA
-#include <opm/simulators/linalg/cuistl/set_device.hpp>
+#include <opm/simulators/linalg/gpuistl/set_device.hpp>
 #endif

 namespace Opm {
@ -163,7 +163,7 @@ void Main::initMPI()
    }

 #if HAVE_CUDA
-    Opm::cuistl::setDevice(FlowGenericVanguard::comm().rank(), FlowGenericVanguard::comm().size());
+    Opm::gpuistl::setDevice(FlowGenericVanguard::comm().rank(), FlowGenericVanguard::comm().size());
 #endif

 #endif // HAVE_MPI
--- a/opm/simulators/linalg/FlexibleSolver_impl.hpp
+++ b/opm/simulators/linalg/FlexibleSolver_impl.hpp
@ -39,9 +39,9 @@

 #if HAVE_CUDA
 #if USE_HIP
-#include <opm/simulators/linalg/hipistl/SolverAdapter.hpp>
+#include <opm/simulators/linalg/gpuistl_hip/SolverAdapter.hpp>
 #else
-#include <opm/simulators/linalg/cuistl/SolverAdapter.hpp>
+#include <opm/simulators/linalg/gpuistl/SolverAdapter.hpp>
 #endif
 #endif

@ -205,7 +205,7 @@ namespace Dune
 #endif
 #if HAVE_CUDA
        } else if (solver_type == "cubicgstab") {
-            linsolver_.reset(new Opm::cuistl::SolverAdapter<Operator, Dune::BiCGSTABSolver, VectorType>(
+            linsolver_.reset(new Opm::gpuistl::SolverAdapter<Operator, Dune::BiCGSTABSolver, VectorType>(
                *linearoperator_for_solver_,
                *scalarproduct_,
                preconditioner_,
--- a/opm/simulators/linalg/PreconditionerFactoryGPUIncludeWrapper.hpp
+++ b/opm/simulators/linalg/PreconditionerFactoryGPUIncludeWrapper.hpp
@ -22,22 +22,22 @@
 // both with the normal cuistl path, and the hipistl path
 #if HAVE_CUDA
 #if USE_HIP
-#include <opm/simulators/linalg/hipistl/CuBlockPreconditioner.hpp>
-#include <opm/simulators/linalg/hipistl/CuDILU.hpp>
-#include <opm/simulators/linalg/hipistl/OpmCuILU0.hpp>
-#include <opm/simulators/linalg/hipistl/CuJac.hpp>
-#include <opm/simulators/linalg/hipistl/CuSeqILU0.hpp>
-#include <opm/simulators/linalg/hipistl/PreconditionerAdapter.hpp>
-#include <opm/simulators/linalg/hipistl/PreconditionerConvertFieldTypeAdapter.hpp>
-#include <opm/simulators/linalg/hipistl/detail/cuda_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl_hip/GpuBlockPreconditioner.hpp>
+#include <opm/simulators/linalg/gpuistl_hip/GpuDILU.hpp>
+#include <opm/simulators/linalg/gpuistl_hip/OpmCuILU0.hpp>
+#include <opm/simulators/linalg/gpuistl_hip/GpuJac.hpp>
+#include <opm/simulators/linalg/gpuistl_hip/GpuSeqILU0.hpp>
+#include <opm/simulators/linalg/gpuistl_hip/PreconditionerAdapter.hpp>
+#include <opm/simulators/linalg/gpuistl_hip/PreconditionerConvertFieldTypeAdapter.hpp>
+#include <opm/simulators/linalg/gpuistl_hip/detail/gpu_safe_call.hpp>
 #else
-#include <opm/simulators/linalg/cuistl/CuBlockPreconditioner.hpp>
-#include <opm/simulators/linalg/cuistl/CuDILU.hpp>
-#include <opm/simulators/linalg/cuistl/OpmCuILU0.hpp>
-#include <opm/simulators/linalg/cuistl/CuJac.hpp>
-#include <opm/simulators/linalg/cuistl/CuSeqILU0.hpp>
-#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
-#include <opm/simulators/linalg/cuistl/PreconditionerConvertFieldTypeAdapter.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuBlockPreconditioner.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuDILU.hpp>
+#include <opm/simulators/linalg/gpuistl/OpmCuILU0.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuJac.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuSeqILU0.hpp>
+#include <opm/simulators/linalg/gpuistl/PreconditionerAdapter.hpp>
+#include <opm/simulators/linalg/gpuistl/PreconditionerConvertFieldTypeAdapter.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
 #endif
 #endif
--- a/opm/simulators/linalg/PreconditionerFactory_impl.hpp
+++ b/opm/simulators/linalg/PreconditionerFactory_impl.hpp
@ -322,39 +322,39 @@ struct StandardPreconditioners {
        }

 #if HAVE_CUDA
-        F::addCreator("CUILU0", [](const O& op, const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
+        F::addCreator("GPUILU0", [](const O& op, const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
            const double w = prm.get<double>("relaxation", 1.0);
            using field_type = typename V::field_type;
-            using CuILU0 = typename cuistl::
-                CuSeqILU0<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
-            auto cuILU0 = std::make_shared<CuILU0>(op.getmat(), w);
+            using GpuILU0 = typename gpuistl::
+                GpuSeqILU0<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
+            auto gpuILU0 = std::make_shared<GpuILU0>(op.getmat(), w);

-            auto adapted = std::make_shared<cuistl::PreconditionerAdapter<V, V, CuILU0>>(cuILU0);
-            auto wrapped = std::make_shared<cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
+            auto adapted = std::make_shared<gpuistl::PreconditionerAdapter<V, V, GpuILU0>>(gpuILU0);
+            auto wrapped = std::make_shared<gpuistl::GpuBlockPreconditioner<V, V, Comm>>(adapted, comm);
            return wrapped;
        });

-        F::addCreator("CUJac", [](const O& op, const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
+        F::addCreator("GPUJAC", [](const O& op, const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
            const double w = prm.get<double>("relaxation", 1.0);
            using field_type = typename V::field_type;
-            using CuJac =
-                typename cuistl::CuJac<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
-            auto cuJac = std::make_shared<CuJac>(op.getmat(), w);
+            using GpuJac =
+                typename gpuistl::GpuJac<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
+            auto gpuJac = std::make_shared<GpuJac>(op.getmat(), w);

-            auto adapted = std::make_shared<cuistl::PreconditionerAdapter<V, V, CuJac>>(cuJac);
-            auto wrapped = std::make_shared<cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
+            auto adapted = std::make_shared<gpuistl::PreconditionerAdapter<V, V, GpuJac>>(gpuJac);
+            auto wrapped = std::make_shared<gpuistl::GpuBlockPreconditioner<V, V, Comm>>(adapted, comm);
            return wrapped;
        });

-        F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
+        F::addCreator("GPUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
            const bool split_matrix = prm.get<bool>("split_matrix", true);
            const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
            using field_type = typename V::field_type;
-            using CuDILU = typename cuistl::CuDILU<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
-            auto cuDILU = std::make_shared<CuDILU>(op.getmat(), split_matrix, tune_gpu_kernels);
+            using GpuDILU = typename gpuistl::GpuDILU<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
+            auto gpuDILU = std::make_shared<GpuDILU>(op.getmat(), split_matrix, tune_gpu_kernels);

-            auto adapted = std::make_shared<cuistl::PreconditionerAdapter<V, V, CuDILU>>(cuDILU);
-            auto wrapped = std::make_shared<cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
+            auto adapted = std::make_shared<gpuistl::PreconditionerAdapter<V, V, GpuDILU>>(gpuDILU);
+            auto wrapped = std::make_shared<gpuistl::GpuBlockPreconditioner<V, V, Comm>>(adapted, comm);
            return wrapped;
        });

@ -362,11 +362,11 @@ struct StandardPreconditioners {
            const bool split_matrix = prm.get<bool>("split_matrix", true);
            const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
            using field_type = typename V::field_type;
-            using OpmCuILU0 = typename cuistl::OpmCuILU0<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
+            using OpmCuILU0 = typename gpuistl::OpmCuILU0<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
            auto cuilu0 = std::make_shared<OpmCuILU0>(op.getmat(), split_matrix, tune_gpu_kernels);

-            auto adapted = std::make_shared<cuistl::PreconditionerAdapter<V, V, OpmCuILU0>>(cuilu0);
-            auto wrapped = std::make_shared<cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
+            auto adapted = std::make_shared<gpuistl::PreconditionerAdapter<V, V, OpmCuILU0>>(cuilu0);
+            auto wrapped = std::make_shared<gpuistl::GpuBlockPreconditioner<V, V, Comm>>(adapted, comm);
            return wrapped;
        });
 #endif
@ -582,68 +582,68 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
            });

 #if HAVE_CUDA
-        F::addCreator("CUILU0", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
+        F::addCreator("GPUILU0", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
            const double w = prm.get<double>("relaxation", 1.0);
            using field_type = typename V::field_type;
-            using CuILU0 = typename cuistl::
-                CuSeqILU0<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
-            return std::make_shared<cuistl::PreconditionerAdapter<V, V, CuILU0>>(
-                std::make_shared<CuILU0>(op.getmat(), w));
+            using GpuILU0 = typename gpuistl::
+                GpuSeqILU0<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
+            return std::make_shared<gpuistl::PreconditionerAdapter<V, V, GpuILU0>>(
+                std::make_shared<GpuILU0>(op.getmat(), w));
        });

-        F::addCreator("CUILU0Float", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
+        F::addCreator("GPUILU0Float", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
            const double w = prm.get<double>("relaxation", 1.0);
            using block_type = typename V::block_type;
            using VTo = Dune::BlockVector<Dune::FieldVector<float, block_type::dimension>>;
            using matrix_type_to =
                typename Dune::BCRSMatrix<Dune::FieldMatrix<float, block_type::dimension, block_type::dimension>>;
-            using CuILU0 = typename cuistl::
-                CuSeqILU0<matrix_type_to, cuistl::CuVector<float>, cuistl::CuVector<float>>;
-            using Adapter = typename cuistl::PreconditionerAdapter<VTo, VTo, CuILU0>;
-            using Converter = typename cuistl::PreconditionerConvertFieldTypeAdapter<Adapter, M, V, V>;
+            using GpuILU0 = typename gpuistl::
+                GpuSeqILU0<matrix_type_to, gpuistl::GpuVector<float>, gpuistl::GpuVector<float>>;
+            using Adapter = typename gpuistl::PreconditionerAdapter<VTo, VTo, GpuILU0>;
+            using Converter = typename gpuistl::PreconditionerConvertFieldTypeAdapter<Adapter, M, V, V>;
            auto converted = std::make_shared<Converter>(op.getmat());
-            auto adapted = std::make_shared<Adapter>(std::make_shared<CuILU0>(converted->getConvertedMatrix(), w));
+            auto adapted = std::make_shared<Adapter>(std::make_shared<GpuILU0>(converted->getConvertedMatrix(), w));
            converted->setUnderlyingPreconditioner(adapted);
            return converted;
        });

-        F::addCreator("CUJac", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
+        F::addCreator("GPUJAC", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
            const double w = prm.get<double>("relaxation", 1.0);
            using field_type = typename V::field_type;
-            using CUJac =
-                typename cuistl::CuJac<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
-            return std::make_shared<cuistl::PreconditionerAdapter<V, V, CUJac>>(
-                std::make_shared<CUJac>(op.getmat(), w));
+            using GPUJac =
+                typename gpuistl::GpuJac<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
+            return std::make_shared<gpuistl::PreconditionerAdapter<V, V, GPUJac>>(
+                std::make_shared<GPUJac>(op.getmat(), w));
        });

        F::addCreator("OPMCUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
            const bool split_matrix = prm.get<bool>("split_matrix", true);
            const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
            using field_type = typename V::field_type;
-            using CUILU0 = typename cuistl::OpmCuILU0<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
+            using CUILU0 = typename gpuistl::OpmCuILU0<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;

-            return std::make_shared<cuistl::PreconditionerAdapter<V, V, CUILU0>>(std::make_shared<CUILU0>(op.getmat(), split_matrix, tune_gpu_kernels));
+            return std::make_shared<gpuistl::PreconditionerAdapter<V, V, CUILU0>>(std::make_shared<CUILU0>(op.getmat(), split_matrix, tune_gpu_kernels));
        });

-        F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
+        F::addCreator("GPUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
            const bool split_matrix = prm.get<bool>("split_matrix", true);
            const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
            using field_type = typename V::field_type;
-            using CUDILU = typename cuistl::CuDILU<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
-            return std::make_shared<cuistl::PreconditionerAdapter<V, V, CUDILU>>(std::make_shared<CUDILU>(op.getmat(), split_matrix, tune_gpu_kernels));
+            using GPUDILU = typename gpuistl::GpuDILU<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
+            return std::make_shared<gpuistl::PreconditionerAdapter<V, V, GPUDILU>>(std::make_shared<GPUDILU>(op.getmat(), split_matrix, tune_gpu_kernels));
        });

-        F::addCreator("CUDILUFloat", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
+        F::addCreator("GPUDILUFloat", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
            const bool split_matrix = prm.get<bool>("split_matrix", true);
            const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
            using block_type = typename V::block_type;
            using VTo = Dune::BlockVector<Dune::FieldVector<float, block_type::dimension>>;
            using matrix_type_to = typename Dune::BCRSMatrix<Dune::FieldMatrix<float, block_type::dimension, block_type::dimension>>;
-            using CuDILU = typename cuistl::CuDILU<matrix_type_to, cuistl::CuVector<float>, cuistl::CuVector<float>>;
-            using Adapter = typename cuistl::PreconditionerAdapter<VTo, VTo, CuDILU>;
-            using Converter = typename cuistl::PreconditionerConvertFieldTypeAdapter<Adapter, M, V, V>;
+            using GpuDILU = typename gpuistl::GpuDILU<matrix_type_to, gpuistl::GpuVector<float>, gpuistl::GpuVector<float>>;
+            using Adapter = typename gpuistl::PreconditionerAdapter<VTo, VTo, GpuDILU>;
+            using Converter = typename gpuistl::PreconditionerConvertFieldTypeAdapter<Adapter, M, V, V>;
            auto converted = std::make_shared<Converter>(op.getmat());
-            auto adapted = std::make_shared<Adapter>(std::make_shared<CuDILU>(converted->getConvertedMatrix(), split_matrix, tune_gpu_kernels));
+            auto adapted = std::make_shared<Adapter>(std::make_shared<GpuDILU>(converted->getConvertedMatrix(), split_matrix, tune_gpu_kernels));
            converted->setUnderlyingPreconditioner(adapted);
            return converted;
        });
--- a/opm/simulators/linalg/gpuistl/GpuBlockPreconditioner.hpp
+++ b/opm/simulators/linalg/gpuistl/GpuBlockPreconditioner.hpp
@ -16,22 +16,22 @@
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
-#ifndef OPM_CUISTL_CUBLOCKPRECONDITIONER_HPP
-#define OPM_CUISTL_CUBLOCKPRECONDITIONER_HPP
+#ifndef OPM_CUISTL_GPUBLOCKPRECONDITIONER_HPP
+#define OPM_CUISTL_GPUBLOCKPRECONDITIONER_HPP

 #include <dune/common/shared_ptr.hh>
 #include <memory>
 #include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
-#include <opm/simulators/linalg/cuistl/PreconditionerHolder.hpp>
-#include <opm/simulators/linalg/cuistl/detail/preconditioner_should_call_post_pre.hpp>
+#include <opm/simulators/linalg/gpuistl/PreconditionerHolder.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/preconditioner_should_call_post_pre.hpp>

-namespace Opm::cuistl
+namespace Opm::gpuistl
 {
 //! @brief Is an adaptation of Dune::BlockPreconditioner that works within the CuISTL framework.
 //!
 //! @note We aim to intgrate this into OwningBlockPreconditioner (or a relative thereof).
 template <class X, class Y, class C, class P = Dune::PreconditionerWithUpdate<X, Y>>
-class CuBlockPreconditioner : public Dune::PreconditionerWithUpdate<X, Y>, public PreconditionerHolder<X, Y>
+class GpuBlockPreconditioner : public Dune::PreconditionerWithUpdate<X, Y>, public PreconditionerHolder<X, Y>
 {
 public:
    using domain_type = X;
@ -47,13 +47,13 @@ public:
    //! @param c The communication object for syncing overlap and copy
    //! data points. (E.~g. OwnerOverlapCopyCommunication )
    //!
-    CuBlockPreconditioner(const std::shared_ptr<P>& p, const std::shared_ptr<const communication_type>& c)
+    GpuBlockPreconditioner(const std::shared_ptr<P>& p, const std::shared_ptr<const communication_type>& c)
        : m_preconditioner(p)
        , m_communication(c)
    {
    }

-    CuBlockPreconditioner(const std::shared_ptr<P>& p, const communication_type& c)
+    GpuBlockPreconditioner(const std::shared_ptr<P>& p, const communication_type& c)
        : m_preconditioner(p)
        , m_communication(Dune::stackobject_to_shared_ptr(c))
    {
@ -125,5 +125,5 @@ private:
    //! \brief the communication object
    std::shared_ptr<const communication_type> m_communication;
 };
-} // namespace Opm::cuistl
+} // namespace Opm::gpuistl
 #endif
--- a/opm/simulators/linalg/gpuistl/GpuBuffer.cpp
+++ b/opm/simulators/linalg/gpuistl/GpuBuffer.cpp
@ -20,83 +20,83 @@
 #include <cuda_runtime.h>
 #include <algorithm>
 #include <fmt/core.h>
-#include <opm/simulators/linalg/cuistl/CuBuffer.hpp>
-#include <opm/simulators/linalg/cuistl/CuView.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuBuffer.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuView.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>

-namespace Opm::cuistl
+namespace Opm::gpuistl
 {

 template <class T>
-CuBuffer<T>::CuBuffer(const std::vector<T>& data)
-    : CuBuffer(data.data(), data.size())
+GpuBuffer<T>::GpuBuffer(const std::vector<T>& data)
+    : GpuBuffer(data.data(), data.size())
 {
 }

 template <class T>
-CuBuffer<T>::CuBuffer(const size_t numberOfElements)
+GpuBuffer<T>::GpuBuffer(const size_t numberOfElements)
    : m_numberOfElements(numberOfElements)
 {
    if (numberOfElements < 1) {
-        OPM_THROW(std::invalid_argument, "Setting a CuBuffer size to a non-positive number is not allowed");
+        OPM_THROW(std::invalid_argument, "Setting a GpuBuffer size to a non-positive number is not allowed");
    }
-    OPM_CUDA_SAFE_CALL(cudaMalloc(&m_dataOnDevice, sizeof(T) * m_numberOfElements));
+    OPM_GPU_SAFE_CALL(cudaMalloc(&m_dataOnDevice, sizeof(T) * m_numberOfElements));
 }

 template <class T>
-CuBuffer<T>::CuBuffer(const T* dataOnHost, const size_t numberOfElements)
-    : CuBuffer(numberOfElements)
+GpuBuffer<T>::GpuBuffer(const T* dataOnHost, const size_t numberOfElements)
+    : GpuBuffer(numberOfElements)
 {

-    OPM_CUDA_SAFE_CALL(cudaMemcpy(
+    OPM_GPU_SAFE_CALL(cudaMemcpy(
        m_dataOnDevice, dataOnHost, m_numberOfElements * sizeof(T), cudaMemcpyHostToDevice));
 }

 template <class T>
-CuBuffer<T>::CuBuffer(const CuBuffer<T>& other)
-    : CuBuffer(other.m_numberOfElements)
+GpuBuffer<T>::GpuBuffer(const GpuBuffer<T>& other)
+    : GpuBuffer(other.m_numberOfElements)
 {
    assertHasElements();
    assertSameSize(other);
-    OPM_CUDA_SAFE_CALL(cudaMemcpy(m_dataOnDevice,
+    OPM_GPU_SAFE_CALL(cudaMemcpy(m_dataOnDevice,
                                  other.m_dataOnDevice,
                                  m_numberOfElements * sizeof(T),
                                  cudaMemcpyDeviceToDevice));
 }

 template <class T>
-CuBuffer<T>::~CuBuffer()
+GpuBuffer<T>::~GpuBuffer()
 {
-    OPM_CUDA_WARN_IF_ERROR(cudaFree(m_dataOnDevice));
+    OPM_GPU_WARN_IF_ERROR(cudaFree(m_dataOnDevice));
 }

 template <typename T>
-typename CuBuffer<T>::size_type
-CuBuffer<T>::size() const
+typename GpuBuffer<T>::size_type
+GpuBuffer<T>::size() const
 {
    return m_numberOfElements;
 }

 template <typename T>
 void
-CuBuffer<T>::resize(size_t newSize)
+GpuBuffer<T>::resize(size_t newSize)
 {
    if (newSize < 1) {
-        OPM_THROW(std::invalid_argument, "Setting a CuBuffer size to a non-positive number is not allowed");
+        OPM_THROW(std::invalid_argument, "Setting a GpuBuffer size to a non-positive number is not allowed");
    }
    // Allocate memory for the new buffer
    T* tmpBuffer = nullptr;
-    OPM_CUDA_SAFE_CALL(cudaMalloc(&tmpBuffer, sizeof(T) * newSize));
+    OPM_GPU_SAFE_CALL(cudaMalloc(&tmpBuffer, sizeof(T) * newSize));

    // Move the data from the old to the new buffer with truncation
    size_t sizeOfMove = std::min({m_numberOfElements, newSize});
-    OPM_CUDA_SAFE_CALL(cudaMemcpy(tmpBuffer,
+    OPM_GPU_SAFE_CALL(cudaMemcpy(tmpBuffer,
                                  m_dataOnDevice,
                                  sizeOfMove * sizeof(T),
                                  cudaMemcpyDeviceToDevice));

    // free the old buffer
-    OPM_CUDA_SAFE_CALL(cudaFree(m_dataOnDevice));
+    OPM_GPU_SAFE_CALL(cudaFree(m_dataOnDevice));

    // swap the buffers
    m_dataOnDevice = tmpBuffer;
@ -107,7 +107,7 @@ CuBuffer<T>::resize(size_t newSize)

 template <typename T>
 std::vector<T>
-CuBuffer<T>::asStdVector() const
+GpuBuffer<T>::asStdVector() const
 {
    std::vector<T> temporary(m_numberOfElements);
    copyToHost(temporary);
@ -116,14 +116,14 @@ CuBuffer<T>::asStdVector() const

 template <typename T>
 void
-CuBuffer<T>::assertSameSize(const CuBuffer<T>& x) const
+GpuBuffer<T>::assertSameSize(const GpuBuffer<T>& x) const
 {
    assertSameSize(x.m_numberOfElements);
 }

 template <typename T>
 void
-CuBuffer<T>::assertSameSize(size_t size) const
+GpuBuffer<T>::assertSameSize(size_t size) const
 {
    if (size != m_numberOfElements) {
        OPM_THROW(std::invalid_argument,
@ -133,7 +133,7 @@ CuBuffer<T>::assertSameSize(size_t size) const

 template <typename T>
 void
-CuBuffer<T>::assertHasElements() const
+GpuBuffer<T>::assertHasElements() const
 {
    if (m_numberOfElements <= 0) {
        OPM_THROW(std::invalid_argument, "We have 0 elements");
@ -142,21 +142,21 @@ CuBuffer<T>::assertHasElements() const

 template <typename T>
 T*
-CuBuffer<T>::data()
+GpuBuffer<T>::data()
 {
    return m_dataOnDevice;
 }

 template <typename T>
 const T*
-CuBuffer<T>::data() const
+GpuBuffer<T>::data() const
 {
    return m_dataOnDevice;
 }

 template <class T>
 void
-CuBuffer<T>::copyFromHost(const T* dataPointer, size_t numberOfElements)
+GpuBuffer<T>::copyFromHost(const T* dataPointer, size_t numberOfElements)
 {
    if (numberOfElements > size()) {
        OPM_THROW(std::runtime_error,
@ -164,41 +164,41 @@ CuBuffer<T>::copyFromHost(const T* dataPointer, size_t numberOfElements)
                              size(),
                              numberOfElements));
    }
-    OPM_CUDA_SAFE_CALL(cudaMemcpy(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice));
+    OPM_GPU_SAFE_CALL(cudaMemcpy(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice));
 }

 template <class T>
 void
-CuBuffer<T>::copyToHost(T* dataPointer, size_t numberOfElements) const
+GpuBuffer<T>::copyToHost(T* dataPointer, size_t numberOfElements) const
 {
    assertSameSize(numberOfElements);
-    OPM_CUDA_SAFE_CALL(cudaMemcpy(dataPointer, data(), numberOfElements * sizeof(T), cudaMemcpyDeviceToHost));
+    OPM_GPU_SAFE_CALL(cudaMemcpy(dataPointer, data(), numberOfElements * sizeof(T), cudaMemcpyDeviceToHost));
 }

 template <class T>
 void
-CuBuffer<T>::copyFromHost(const std::vector<T>& data)
+GpuBuffer<T>::copyFromHost(const std::vector<T>& data)
 {
    copyFromHost(data.data(), data.size());
 }
 template <class T>
 void
-CuBuffer<T>::copyToHost(std::vector<T>& data) const
+GpuBuffer<T>::copyToHost(std::vector<T>& data) const
 {
    copyToHost(data.data(), data.size());
 }

-template class CuBuffer<double>;
-template class CuBuffer<float>;
-template class CuBuffer<int>;
+template class GpuBuffer<double>;
+template class GpuBuffer<float>;
+template class GpuBuffer<int>;

 template <class T>
-CuView<const T> make_view(const CuBuffer<T>& buf) {
-    return CuView<const T>(buf.data(), buf.size());
+GpuView<const T> make_view(const GpuBuffer<T>& buf) {
+    return GpuView<const T>(buf.data(), buf.size());
 }

-template CuView<const double> make_view<double>(const CuBuffer<double>&);
-template CuView<const float> make_view<float>(const CuBuffer<float>&);
-template CuView<const int> make_view<int>(const CuBuffer<int>&);
+template GpuView<const double> make_view<double>(const GpuBuffer<double>&);
+template GpuView<const float> make_view<float>(const GpuBuffer<float>&);
+template GpuView<const int> make_view<int>(const GpuBuffer<int>&);

-} // namespace Opm::cuistl
+} // namespace Opm::gpuistl
--- a/opm/simulators/linalg/gpuistl/GpuBuffer.hpp
+++ b/opm/simulators/linalg/gpuistl/GpuBuffer.hpp
@ -16,35 +16,35 @@
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
-#ifndef OPM_CUBUFFER_HEADER_HPP
-#define OPM_CUBUFFER_HEADER_HPP
+#ifndef OPM_GPUBUFFER_HEADER_HPP
+#define OPM_GPUBUFFER_HEADER_HPP
 #include <dune/common/fvector.hh>
 #include <dune/istl/bvector.hh>
 #include <exception>
 #include <fmt/core.h>
 #include <opm/common/ErrorMacros.hpp>
-#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
-#include <opm/simulators/linalg/cuistl/CuView.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/safe_conversion.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuView.hpp>
 #include <vector>
 #include <string>


-namespace Opm::cuistl
+namespace Opm::gpuistl
 {

 /**
- * @brief The CuBuffer class is a simple container class for the GPU.
+ * @brief The GpuBuffer class is a simple container class for the GPU.
 *
 *
 * Example usage:
 *
 * @code{.cpp}
- * #include <opm/simulators/linalg/cuistl/CuBuffer.hpp>
+ * #include <opm/simulators/linalg/gpuistl/GpuBuffer.hpp>
 *
 * void someFunction() {
 *     auto someDataOnCPU = std::vector<double>({1.0, 2.0, 42.0, 59.9451743, 10.7132692});
 *
- *     auto dataOnGPU = CuBuffer<double>(someDataOnCPU);
+ *     auto dataOnGPU = GpuBuffer<double>(someDataOnCPU);
 *
 *     auto stdVectorOnCPU = dataOnGPU.asStdVector();
 * }
@ -52,7 +52,7 @@ namespace Opm::cuistl
 * @tparam T the type to store. Can be either float, double or int.
 */
 template <typename T>
-class CuBuffer
+class GpuBuffer
 {
 public:
    using field_type = T;
@ -60,17 +60,17 @@ public:
    using value_type = T;

    /**
-     * @brief CuBuffer allocates new GPU memory of the same size as other and copies the content of the other buffer to
+     * @brief GpuBuffer allocates new GPU memory of the same size as other and copies the content of the other buffer to
     * this newly allocated memory.
     *
     * @note This does synchronous transfer.
     *
     * @param other the buffer to copy from
     */
-    CuBuffer(const CuBuffer<T>& other);
+    GpuBuffer(const GpuBuffer<T>& other);

    /**
-     * @brief CuBuffer allocates new GPU memory of the same size as data and copies the content of the data vector to
+     * @brief GpuBuffer allocates new GPU memory of the same size as data and copies the content of the data vector to
     * this newly allocated memory.
     *
     * @note This does CPU to GPU transfer.
@ -78,23 +78,23 @@ public:
     *
     * @param data the vector to copy from
     */
-    explicit CuBuffer(const std::vector<T>& data);
+    explicit GpuBuffer(const std::vector<T>& data);

    /**
     * @brief Default constructor that will initialize cublas and allocate 0 bytes of memory
     */
-    explicit CuBuffer();
+    explicit GpuBuffer();

    /**
-     * @brief CuBuffer allocates new GPU memory of size numberOfElements * sizeof(T)
+     * @brief GpuBuffer allocates new GPU memory of size numberOfElements * sizeof(T)
     *
     * @param numberOfElements number of T elements to allocate
     */
-    explicit CuBuffer(const size_t numberOfElements);
+    explicit GpuBuffer(const size_t numberOfElements);


    /**
-     * @brief CuBuffer allocates new GPU memory of size numberOfElements * sizeof(T) and copies numberOfElements from
+     * @brief GpuBuffer allocates new GPU memory of size numberOfElements * sizeof(T) and copies numberOfElements from
     * data
     *
     * @note This assumes the data is on the CPU.
@ -102,12 +102,12 @@ public:
     * @param numberOfElements number of T elements to allocate
     * @param dataOnHost data on host/CPU
     */
-    CuBuffer(const T* dataOnHost, const size_t numberOfElements);
+    GpuBuffer(const T* dataOnHost, const size_t numberOfElements);

    /**
-     * @brief ~CuBuffer calls cudaFree
+     * @brief ~GpuBuffer calls cudaFree
     */
-    virtual ~CuBuffer();
+    virtual ~GpuBuffer();

    /**
     * @return the raw pointer to the GPU data
@ -120,7 +120,7 @@ public:
    const T* data() const;

    /**
-     * @return fetch the first element in a CuBuffer
+     * @return fetch the first element in a GpuBuffer
     */
    __host__ __device__ T& front()
    {
@ -131,7 +131,7 @@ public:
    }

    /**
-     * @return fetch the last element in a CuBuffer
+     * @return fetch the last element in a GpuBuffer
     */
    __host__ __device__ T& back()
    {
@ -142,7 +142,7 @@ public:
    }

    /**
-     * @return fetch the first element in a CuBuffer
+     * @return fetch the first element in a GpuBuffer
     */
    __host__ __device__ T front() const
    {
@ -153,7 +153,7 @@ public:
    }

    /**
-     * @return fetch the last element in a CuBuffer
+     * @return fetch the last element in a GpuBuffer
     */
    __host__ __device__ T back() const
    {
@ -176,7 +176,7 @@ public:
        // TODO: [perf] vector.size() can be replaced by bvector.N() * BlockDimension
        if (m_numberOfElements != bvector.size()) {
            OPM_THROW(std::runtime_error,
-                      fmt::format("Given incompatible vector size. CuBuffer has size {}, \n"
+                      fmt::format("Given incompatible vector size. GpuBuffer has size {}, \n"
                                  "however, BlockVector has N() = {}, and size = {}.",
                                  m_numberOfElements,
                                  bvector.N(),
@ -199,7 +199,7 @@ public:
        // TODO: [perf] vector.size() can be replaced by bvector.N() * BlockDimension
        if (m_numberOfElements != bvector.size()) {
            OPM_THROW(std::runtime_error,
-                      fmt::format("Given incompatible vector size. CuBuffer has size {},\n however, the BlockVector "
+                      fmt::format("Given incompatible vector size. GpuBuffer has size {},\n however, the BlockVector "
                                  "has has N() = {}, and size() = {}.",
                                  m_numberOfElements,
                                  bvector.N(),
@ -267,14 +267,14 @@ private:
    T* m_dataOnDevice = nullptr;
    size_t m_numberOfElements;

-    void assertSameSize(const CuBuffer<T>& other) const;
+    void assertSameSize(const GpuBuffer<T>& other) const;
    void assertSameSize(size_t size) const;

    void assertHasElements() const;
 };

 template <class T>
-CuView<const T> make_view(const CuBuffer<T>&);
+GpuView<const T> make_view(const GpuBuffer<T>&);

-} // namespace Opm::cuistl
+} // namespace Opm::gpuistl
 #endif
--- a/opm/simulators/linalg/gpuistl/GpuDILU.cpp
+++ b/opm/simulators/linalg/gpuistl/GpuDILU.cpp
@ -25,28 +25,28 @@
 #include <opm/common/ErrorMacros.hpp>
 #include <opm/common/TimingMacros.hpp>
 #include <opm/simulators/linalg/GraphColoring.hpp>
-#include <opm/simulators/linalg/cuistl/detail/autotuner.hpp>
-#include <opm/simulators/linalg/cuistl/CuDILU.hpp>
-#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
-#include <opm/simulators/linalg/cuistl/CuVector.hpp>
-#include <opm/simulators/linalg/cuistl/detail/coloringAndReorderingUtils.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
-#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/DILUKernels.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/autotuner.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuDILU.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/coloringAndReorderingUtils.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp>
 #include <opm/simulators/linalg/matrixblock.hh>
 #include <tuple>
 #include <functional>
 #include <utility>
 #include <string>
-namespace Opm::cuistl
+namespace Opm::gpuistl
 {

 template <class M, class X, class Y, int l>
-CuDILU<M, X, Y, l>::CuDILU(const M& A, bool splitMatrix, bool tuneKernels)
+GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels)
    : m_cpuMatrix(A)
    , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER))
    , m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets))
    , m_naturalToReordered(detail::createNaturalToReordered(m_levelSets))
-    , m_gpuMatrix(CuSparseMatrix<field_type>::fromMatrix(m_cpuMatrix, true))
+    , m_gpuMatrix(GpuSparseMatrix<field_type>::fromMatrix(m_cpuMatrix, true))
    , m_gpuNaturalToReorder(m_naturalToReordered)
    , m_gpuReorderToNatural(m_reorderedToNatural)
    , m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())
@ -71,13 +71,13 @@ CuDILU<M, X, Y, l>::CuDILU(const M& A, bool splitMatrix, bool tuneKernels)
                             m_gpuMatrix.nonzeroes(),
                             A.nonzeroes()));
    if (m_splitMatrix) {
-        m_gpuMatrixReorderedDiag = std::make_unique<CuVector<field_type>>(blocksize_ * blocksize_ * m_cpuMatrix.N());
+        m_gpuMatrixReorderedDiag = std::make_unique<GpuVector<field_type>>(blocksize_ * blocksize_ * m_cpuMatrix.N());
        std::tie(m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper)
-            = detail::extractLowerAndUpperMatrices<M, field_type, CuSparseMatrix<field_type>>(m_cpuMatrix,
+            = detail::extractLowerAndUpperMatrices<M, field_type, GpuSparseMatrix<field_type>>(m_cpuMatrix,
                                                                                              m_reorderedToNatural);
    }
    else {
-        m_gpuMatrixReordered = detail::createReorderedMatrix<M, field_type, CuSparseMatrix<field_type>>(
+        m_gpuMatrixReordered = detail::createReorderedMatrix<M, field_type, GpuSparseMatrix<field_type>>(
            m_cpuMatrix, m_reorderedToNatural);
    }
    computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
@ -89,13 +89,13 @@ CuDILU<M, X, Y, l>::CuDILU(const M& A, bool splitMatrix, bool tuneKernels)

 template <class M, class X, class Y, int l>
 void
-CuDILU<M, X, Y, l>::pre([[maybe_unused]] X& x, [[maybe_unused]] Y& b)
+GpuDILU<M, X, Y, l>::pre([[maybe_unused]] X& x, [[maybe_unused]] Y& b)
 {
 }

 template <class M, class X, class Y, int l>
 void
-CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
+GpuDILU<M, X, Y, l>::apply(X& v, const Y& d)
 {
    OPM_TIMEBLOCK(prec_apply);
    {
@ -105,7 +105,7 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)

 template <class M, class X, class Y, int l>
 void
-CuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int upperSolveThreadBlockSize)
+GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int upperSolveThreadBlockSize)
 {
    int levelStartIdx = 0;
    for (int level = 0; level < m_levelSets.size(); ++level) {
@ -172,20 +172,20 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int u

 template <class M, class X, class Y, int l>
 void
-CuDILU<M, X, Y, l>::post([[maybe_unused]] X& x)
+GpuDILU<M, X, Y, l>::post([[maybe_unused]] X& x)
 {
 }

 template <class M, class X, class Y, int l>
 Dune::SolverCategory::Category
-CuDILU<M, X, Y, l>::category() const
+GpuDILU<M, X, Y, l>::category() const
 {
    return Dune::SolverCategory::sequential;
 }

 template <class M, class X, class Y, int l>
 void
-CuDILU<M, X, Y, l>::update()
+GpuDILU<M, X, Y, l>::update()
 {
    OPM_TIMEBLOCK(prec_update);
    {
@ -195,7 +195,7 @@ CuDILU<M, X, Y, l>::update()

 template <class M, class X, class Y, int l>
 void
-CuDILU<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationBlockSize)
+GpuDILU<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationBlockSize)
 {
    m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
    computeDiagAndMoveReorderedData(moveThreadBlockSize, factorizationBlockSize);
@ -203,7 +203,7 @@ CuDILU<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationBlockSize)

 template <class M, class X, class Y, int l>
 void
-CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationBlockSize)
+GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationBlockSize)
 {
    if (m_splitMatrix) {
        detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
@ -264,7 +264,7 @@ CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, int

 template <class M, class X, class Y, int l>
 void
-CuDILU<M, X, Y, l>::tuneThreadBlockSizes()
+GpuDILU<M, X, Y, l>::tuneThreadBlockSizes()
 {
    // tune the thread-block size of the update function
    auto tuneMoveThreadBlockSizeInUpdate = [this](int moveThreadBlockSize){
@ -278,8 +278,8 @@ CuDILU<M, X, Y, l>::tuneThreadBlockSizes()
    m_DILUFactorizationThreadBlockSize = detail::tuneThreadBlockSize(tuneFactorizationThreadBlockSizeInUpdate, "Kernel computing DILU factorization");

    // tune the thread-block size of the apply
-    CuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
-    CuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
+    GpuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
+    GpuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
    tmpD = 1;

    auto tuneLowerSolveThreadBlockSizeInApply = [this, &tmpV, &tmpD](int lowerSolveThreadBlockSize){
@ -293,14 +293,14 @@ CuDILU<M, X, Y, l>::tuneThreadBlockSizes()
    m_upperSolveThreadBlockSize = detail::tuneThreadBlockSize(tuneUpperSolveThreadBlockSizeInApply, "Kernel computing an upper triangular solve for a level set");
 }

-} // namespace Opm::cuistl
+} // namespace Opm::gpuistl
 #define INSTANTIATE_CUDILU_DUNE(realtype, blockdim)                                                                    \
-    template class ::Opm::cuistl::CuDILU<Dune::BCRSMatrix<Dune::FieldMatrix<realtype, blockdim, blockdim>>,            \
-                                         ::Opm::cuistl::CuVector<realtype>,                                            \
-                                         ::Opm::cuistl::CuVector<realtype>>;                                           \
-    template class ::Opm::cuistl::CuDILU<Dune::BCRSMatrix<Opm::MatrixBlock<realtype, blockdim, blockdim>>,             \
-                                         ::Opm::cuistl::CuVector<realtype>,                                            \
-                                         ::Opm::cuistl::CuVector<realtype>>
+    template class ::Opm::gpuistl::GpuDILU<Dune::BCRSMatrix<Dune::FieldMatrix<realtype, blockdim, blockdim>>,            \
+                                         ::Opm::gpuistl::GpuVector<realtype>,                                            \
+                                         ::Opm::gpuistl::GpuVector<realtype>>;                                           \
+    template class ::Opm::gpuistl::GpuDILU<Dune::BCRSMatrix<Opm::MatrixBlock<realtype, blockdim, blockdim>>,             \
+                                         ::Opm::gpuistl::GpuVector<realtype>,                                            \
+                                         ::Opm::gpuistl::GpuVector<realtype>>

 INSTANTIATE_CUDILU_DUNE(double, 1);
 INSTANTIATE_CUDILU_DUNE(double, 2);
--- a/opm/simulators/linalg/gpuistl/GpuDILU.hpp
+++ b/opm/simulators/linalg/gpuistl/GpuDILU.hpp
@ -16,18 +16,18 @@
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
-#ifndef OPM_CUDILU_HPP
-#define OPM_CUDILU_HPP
+#ifndef OPM_GPUDILU_HPP
+#define OPM_GPUDILU_HPP

 #include <memory>
 #include <opm/grid/utility/SparseTable.hpp>
 #include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
-#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
 #include <vector>



-namespace Opm::cuistl
+namespace Opm::gpuistl
 {
 //! \brief DILU preconditioner on the GPU.
 //!
@ -37,10 +37,10 @@ namespace Opm::cuistl
 //! \tparam l Ignored. Just there to have the same number of template arguments
 //!    as other preconditioners.
 //!
-//! \note We assume X and Y are both CuVector<real_type>, but we leave them as template
+//! \note We assume X and Y are both GpuVector<real_type>, but we leave them as template
 //! arguments in case of future additions.
 template <class M, class X, class Y, int l = 1>
-class CuDILU : public Dune::PreconditionerWithUpdate<X, Y>
+class GpuDILU : public Dune::PreconditionerWithUpdate<X, Y>
 {
 public:
    //! \brief The matrix type the preconditioner is for.
@ -52,7 +52,7 @@ public:
    //! \brief The field type of the preconditioner.
    using field_type = typename X::field_type;
    //! \brief The GPU matrix type
-    using CuMat = CuSparseMatrix<field_type>;
+    using CuMat = GpuSparseMatrix<field_type>;

    //! \brief Constructor.
    //!
@ -60,7 +60,7 @@ public:
    //! \param A The matrix to operate on.
    //! \param w The relaxation factor.
    //!
-    explicit CuDILU(const M& A, bool splitMatrix, bool tuneKernels);
+    explicit GpuDILU(const M& A, bool splitMatrix, bool tuneKernels);

    //! \brief Prepare the preconditioner.
    //! \note Does nothing at the time being.
@ -126,13 +126,13 @@ private:
    std::unique_ptr<CuMat> m_gpuMatrixReorderedLower;
    std::unique_ptr<CuMat> m_gpuMatrixReorderedUpper;
    //! \brief If matrix splitting is enabled, we also store the diagonal separately
-    std::unique_ptr<CuVector<field_type>> m_gpuMatrixReorderedDiag;
+    std::unique_ptr<GpuVector<field_type>> m_gpuMatrixReorderedDiag;
    //! row conversion from natural to reordered matrix indices stored on the GPU
-    CuVector<int> m_gpuNaturalToReorder;
+    GpuVector<int> m_gpuNaturalToReorder;
    //! row conversion from reordered to natural matrix indices stored on the GPU
-    CuVector<int> m_gpuReorderToNatural;
+    GpuVector<int> m_gpuReorderToNatural;
    //! \brief Stores the inverted diagonal that we use in DILU
-    CuVector<field_type> m_gpuDInv;
+    GpuVector<field_type> m_gpuDInv;
    //! \brief Bool storing whether or not we should store matrices in a split format
    bool m_splitMatrix;
    //! \brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards
@ -144,6 +144,6 @@ private:
    int m_moveThreadBlockSize = -1;
    int m_DILUFactorizationThreadBlockSize = -1;
 };
-} // end namespace Opm::cuistl
+} // end namespace Opm::gpuistl

 #endif
--- a/opm/simulators/linalg/gpuistl/GpuJac.cpp
+++ b/opm/simulators/linalg/gpuistl/GpuJac.cpp
@ -20,20 +20,20 @@
 #include <dune/istl/bcrsmatrix.hh>
 #include <fmt/core.h>
 #include <opm/common/ErrorMacros.hpp>
-#include <opm/simulators/linalg/cuistl/CuJac.hpp>
-#include <opm/simulators/linalg/cuistl/CuVector.hpp>
-#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/JacKernels.hpp>
-#include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuJac.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/preconditionerKernels/JacKernels.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/vector_operations.hpp>
 #include <opm/simulators/linalg/matrixblock.hh>

-namespace Opm::cuistl
+namespace Opm::gpuistl
 {

 template <class M, class X, class Y, int l>
-CuJac<M, X, Y, l>::CuJac(const M& A, field_type w)
+GpuJac<M, X, Y, l>::GpuJac(const M& A, field_type w)
    : m_cpuMatrix(A)
    , m_relaxationFactor(w)
-    , m_gpuMatrix(CuSparseMatrix<field_type>::fromMatrix(A))
+    , m_gpuMatrix(GpuSparseMatrix<field_type>::fromMatrix(A))
    , m_diagInvFlattened(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())
 {
    // Some sanity check
@ -58,13 +58,13 @@ CuJac<M, X, Y, l>::CuJac(const M& A, field_type w)

 template <class M, class X, class Y, int l>
 void
-CuJac<M, X, Y, l>::pre([[maybe_unused]] X& x, [[maybe_unused]] Y& b)
+GpuJac<M, X, Y, l>::pre([[maybe_unused]] X& x, [[maybe_unused]] Y& b)
 {
 }

 template <class M, class X, class Y, int l>
 void
-CuJac<M, X, Y, l>::apply(X& v, const Y& d)
+GpuJac<M, X, Y, l>::apply(X& v, const Y& d)
 {
    // Jacobi preconditioner: x_{n+1} = x_n + w * (D^-1 * (b - Ax_n) )
    // Working with defect d and update v it we only need to set v = w*(D^-1)*d
@ -77,20 +77,20 @@ CuJac<M, X, Y, l>::apply(X& v, const Y& d)

 template <class M, class X, class Y, int l>
 void
-CuJac<M, X, Y, l>::post([[maybe_unused]] X& x)
+GpuJac<M, X, Y, l>::post([[maybe_unused]] X& x)
 {
 }

 template <class M, class X, class Y, int l>
 Dune::SolverCategory::Category
-CuJac<M, X, Y, l>::category() const
+GpuJac<M, X, Y, l>::category() const
 {
    return Dune::SolverCategory::sequential;
 }

 template <class M, class X, class Y, int l>
 void
-CuJac<M, X, Y, l>::update()
+GpuJac<M, X, Y, l>::update()
 {
    m_gpuMatrix.updateNonzeroValues(m_cpuMatrix);
    invertDiagonalAndFlatten();
@ -98,7 +98,7 @@ CuJac<M, X, Y, l>::update()

 template <class M, class X, class Y, int l>
 void
-CuJac<M, X, Y, l>::invertDiagonalAndFlatten()
+GpuJac<M, X, Y, l>::invertDiagonalAndFlatten()
 {
    detail::JAC::invertDiagonalAndFlatten<field_type, matrix_type::block_type::cols>(
        m_gpuMatrix.getNonZeroValues().data(),
@ -108,14 +108,14 @@ CuJac<M, X, Y, l>::invertDiagonalAndFlatten()
        m_diagInvFlattened.data());
 }

-} // namespace Opm::cuistl
+} // namespace Opm::gpuistl
 #define INSTANTIATE_CUJAC_DUNE(realtype, blockdim)                                                                     \
-    template class ::Opm::cuistl::CuJac<Dune::BCRSMatrix<Dune::FieldMatrix<realtype, blockdim, blockdim>>,             \
-                                        ::Opm::cuistl::CuVector<realtype>,                                             \
-                                        ::Opm::cuistl::CuVector<realtype>>;                                            \
-    template class ::Opm::cuistl::CuJac<Dune::BCRSMatrix<Opm::MatrixBlock<realtype, blockdim, blockdim>>,              \
-                                        ::Opm::cuistl::CuVector<realtype>,                                             \
-                                        ::Opm::cuistl::CuVector<realtype>>
+    template class ::Opm::gpuistl::GpuJac<Dune::BCRSMatrix<Dune::FieldMatrix<realtype, blockdim, blockdim>>,             \
+                                        ::Opm::gpuistl::GpuVector<realtype>,                                             \
+                                        ::Opm::gpuistl::GpuVector<realtype>>;                                            \
+    template class ::Opm::gpuistl::GpuJac<Dune::BCRSMatrix<Opm::MatrixBlock<realtype, blockdim, blockdim>>,              \
+                                        ::Opm::gpuistl::GpuVector<realtype>,                                             \
+                                        ::Opm::gpuistl::GpuVector<realtype>>

 INSTANTIATE_CUJAC_DUNE(double, 1);
 INSTANTIATE_CUJAC_DUNE(double, 2);
--- a/opm/simulators/linalg/gpuistl/GpuJac.hpp
+++ b/opm/simulators/linalg/gpuistl/GpuJac.hpp
@ -16,19 +16,19 @@
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
-#ifndef OPM_CUJAC_HPP
-#define OPM_CUJAC_HPP
+#ifndef OPM_GPUJAC_HPP
+#define OPM_GPUJAC_HPP

 #include <dune/istl/preconditioner.hh>
 #include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
-#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
-#include <opm/simulators/linalg/cuistl/detail/CuMatrixDescription.hpp>
-#include <opm/simulators/linalg/cuistl/detail/CuSparseHandle.hpp>
-#include <opm/simulators/linalg/cuistl/detail/CuSparseResource.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/CuMatrixDescription.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/CuSparseHandle.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/CuSparseResource.hpp>



-namespace Opm::cuistl
+namespace Opm::gpuistl
 {
 //! \brief Jacobi preconditioner on the GPU.
 //!
@ -40,10 +40,10 @@ namespace Opm::cuistl
 //! \tparam l Ignored. Just there to have the same number of template arguments
 //!    as other preconditioners.
 //!
-//! \note We assume X and Y are both CuVector<real_type>, but we leave them as template
+//! \note We assume X and Y are both GpuVector<real_type>, but we leave them as template
 //! arguments in case of future additions.
 template <class M, class X, class Y, int l = 1>
-class CuJac : public Dune::PreconditionerWithUpdate<X, Y>
+class GpuJac : public Dune::PreconditionerWithUpdate<X, Y>
 {
 public:
    //! \brief The matrix type the preconditioner is for.
@ -61,7 +61,7 @@ public:
    //! \param A The matrix to operate on.
    //! \param w The relaxation factor.
    //!
-    CuJac(const M& A, field_type w);
+    GpuJac(const M& A, field_type w);

    //! \brief Prepare the preconditioner.
    //! \note Does nothing at the time being.
@ -104,12 +104,12 @@ private:
    //! \brief The relaxation factor to use.
    const field_type m_relaxationFactor;
    //! \brief The A matrix stored on the gpu
-    CuSparseMatrix<field_type> m_gpuMatrix;
+    GpuSparseMatrix<field_type> m_gpuMatrix;
    //! \brief the diagonal of cuMatrix inverted, and then flattened to fit in a vector
-    CuVector<field_type> m_diagInvFlattened;
+    GpuVector<field_type> m_diagInvFlattened;

    void invertDiagonalAndFlatten();
 };
-} // end namespace Opm::cuistl
+} // end namespace Opm::gpuistl

 #endif
--- a/opm/simulators/linalg/gpuistl/GpuOwnerOverlapCopy.hpp
+++ b/opm/simulators/linalg/gpuistl/GpuOwnerOverlapCopy.hpp
@ -16,15 +16,15 @@
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
-#ifndef OPM_CUISTL_CUOWNEROVERLAPCOPY_HPP
-#define OPM_CUISTL_CUOWNEROVERLAPCOPY_HPP
+#ifndef OPM_GPUISTL_GPUOWNEROVERLAPCOPY_HPP
+#define OPM_GPUISTL_GPUOWNEROVERLAPCOPY_HPP
 #include <dune/istl/owneroverlapcopy.hh>
 #include <memory>
 #include <mutex>
-#include <opm/simulators/linalg/cuistl/CuVector.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
 #include <vector>

-namespace Opm::cuistl
+namespace Opm::gpuistl
 {
 /**
 * @brief GPUSender is a wrapper class for classes which will implement copOwnerToAll
@ -36,7 +36,7 @@ namespace Opm::cuistl
 template<class field_type, class OwnerOverlapCopyCommunicationType>
 class GPUSender {
 public:
-    using X = CuVector<field_type>;
+    using X = GpuVector<field_type>;

    GPUSender(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy) : m_cpuOwnerOverlapCopy(cpuOwnerOverlapCopy){}

@ -97,8 +97,8 @@ protected:
    // premature optimization, in the sense that we could just initialize these indices
    // always, but they are not always used.
    mutable std::once_flag m_initializedIndices;
-    mutable std::unique_ptr<CuVector<int>> m_indicesOwner;
-    mutable std::unique_ptr<CuVector<int>> m_indicesCopy;
+    mutable std::unique_ptr<GpuVector<int>> m_indicesOwner;
+    mutable std::unique_ptr<GpuVector<int>> m_indicesCopy;
    const OwnerOverlapCopyCommunicationType& m_cpuOwnerOverlapCopy;
 };

@ -113,7 +113,7 @@ template <class field_type, int block_size, class OwnerOverlapCopyCommunicationT
 class GPUObliviousMPISender : public GPUSender<field_type, OwnerOverlapCopyCommunicationType>
 {
 public:
-    using X = CuVector<field_type>;
+    using X = GpuVector<field_type>;

    GPUObliviousMPISender(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy)
        : GPUSender<field_type, OwnerOverlapCopyCommunicationType>(cpuOwnerOverlapCopy)
@ -151,8 +151,8 @@ private:
            }
        }

-        this->m_indicesCopy = std::make_unique<CuVector<int>>(indicesCopyOnCPU);
-        this->m_indicesOwner = std::make_unique<CuVector<int>>(indicesOwnerCPU);
+        this->m_indicesCopy = std::make_unique<GpuVector<int>>(indicesCopyOnCPU);
+        this->m_indicesOwner = std::make_unique<GpuVector<int>>(indicesOwnerCPU);
    }
 };

@ -168,7 +168,7 @@ template <class field_type, int block_size, class OwnerOverlapCopyCommunicationT
 class GPUAwareMPISender : public GPUSender<field_type, OwnerOverlapCopyCommunicationType>
 {
 public:
-    using X = CuVector<field_type>;
+    using X = GpuVector<field_type>;

    GPUAwareMPISender(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy)
        : GPUSender<field_type, OwnerOverlapCopyCommunicationType>(cpuOwnerOverlapCopy)
@ -178,7 +178,7 @@ public:
    void copyOwnerToAll(const X& source, X& dest) const override
    {

-        OPM_ERROR_IF(&source != &dest, "The provided CuVectors' address did not match"); // In this context, source == dest!!!
+        OPM_ERROR_IF(&source != &dest, "The provided GpuVectors' address did not match"); // In this context, source == dest!!!
        std::call_once(this->m_initializedIndices, [&]() { initIndexSet(); });

        int rank = this->m_cpuOwnerOverlapCopy.communicator().rank();
@ -251,10 +251,10 @@ public:
    }

 private:
-    mutable std::unique_ptr<CuVector<int>> m_commpairIndicesCopy;
-    mutable std::unique_ptr<CuVector<int>> m_commpairIndicesOwner;
-    mutable std::unique_ptr<CuVector<field_type>> m_GPUSendBuf;
-    mutable std::unique_ptr<CuVector<field_type>> m_GPURecvBuf;
+    mutable std::unique_ptr<GpuVector<int>> m_commpairIndicesCopy;
+    mutable std::unique_ptr<GpuVector<int>> m_commpairIndicesOwner;
+    mutable std::unique_ptr<GpuVector<field_type>> m_GPUSendBuf;
+    mutable std::unique_ptr<GpuVector<field_type>> m_GPURecvBuf;

    struct MessageInformation
    {
@ -332,11 +332,11 @@ private:
            }
        }

-        m_commpairIndicesCopy = std::make_unique<CuVector<int>>(commpairIndicesCopyOnCPU);
-        m_commpairIndicesOwner = std::make_unique<CuVector<int>>(commpairIndicesOwnerCPU);
+        m_commpairIndicesCopy = std::make_unique<GpuVector<int>>(commpairIndicesCopyOnCPU);
+        m_commpairIndicesOwner = std::make_unique<GpuVector<int>>(commpairIndicesOwnerCPU);

-        m_GPUSendBuf = std::make_unique<CuVector<field_type>>(sendBufIdx * block_size);
-        m_GPURecvBuf = std::make_unique<CuVector<field_type>>(recvBufIdx * block_size);
+        m_GPUSendBuf = std::make_unique<GpuVector<field_type>>(sendBufIdx * block_size);
+        m_GPURecvBuf = std::make_unique<GpuVector<field_type>>(recvBufIdx * block_size);
    }

    void initIndexSet() const override
@ -360,8 +360,8 @@ private:
            }
        }

-        this->m_indicesCopy = std::make_unique<CuVector<int>>(indicesCopyOnCPU);
-        this->m_indicesOwner = std::make_unique<CuVector<int>>(indicesOwnerCPU);
+        this->m_indicesCopy = std::make_unique<GpuVector<int>>(indicesCopyOnCPU);
+        this->m_indicesOwner = std::make_unique<GpuVector<int>>(indicesOwnerCPU);

        buildCommPairIdxs();
    }
@ -371,21 +371,21 @@ private:
 * @brief CUDA compatiable variant of Dune::OwnerOverlapCopyCommunication
 *
 * This class can essentially be seen as an adapter around Dune::OwnerOverlapCopyCommunication, and should work as
- * a Dune::OwnerOverlapCopyCommunication on CuVectors
+ * a Dune::OwnerOverlapCopyCommunication on GpuVectors
 *
 * @note This currently only has the functionality to parallelize the linear solve.
 *
- * @tparam field_type should be a field_type supported by CuVector (double, float)
+ * @tparam field_type should be a field_type supported by GpuVector (double, float)
 * @tparam block_size the block size used (this is relevant for say figuring out the correct indices)
 * @tparam OwnerOverlapCopyCommunicationType should mimic Dune::OwnerOverlapCopyCommunication.
 */
 template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
-class CuOwnerOverlapCopy
+class GpuOwnerOverlapCopy
 {
 public:
-    using X = CuVector<field_type>;
+    using X = GpuVector<field_type>;

-    CuOwnerOverlapCopy(std::shared_ptr<GPUSender<field_type, OwnerOverlapCopyCommunicationType>> sender) : m_sender(sender){}
+    GpuOwnerOverlapCopy(std::shared_ptr<GPUSender<field_type, OwnerOverlapCopyCommunicationType>> sender) : m_sender(sender){}

    void copyOwnerToAll(const X& source, X& dest) const {
        m_sender->copyOwnerToAll(source, dest);
@ -409,5 +409,5 @@ public:
 private:
    std::shared_ptr<GPUSender<field_type, OwnerOverlapCopyCommunicationType>> m_sender;
 };
-} // namespace Opm::cuistl
+} // namespace Opm::gpuistl
 #endif
--- a/opm/simulators/linalg/gpuistl/GpuSeqILU0.cpp
+++ b/opm/simulators/linalg/gpuistl/GpuSeqILU0.cpp
@ -25,26 +25,26 @@
 #include <dune/istl/bvector.hh>
 #include <fmt/core.h>
 #include <opm/common/ErrorMacros.hpp>
-#include <opm/simulators/linalg/cuistl/CuSeqILU0.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_constants.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_safe_call.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_wrapper.hpp>
-#include <opm/simulators/linalg/cuistl/detail/fix_zero_diagonal.hpp>
-#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuSeqILU0.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/cusparse_constants.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/cusparse_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/cusparse_wrapper.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/fix_zero_diagonal.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/safe_conversion.hpp>
 #include <opm/simulators/linalg/matrixblock.hh>

 // This file is based on the guide at https://docs.nvidia.com/cuda/cusparse/index.html#csrilu02_solve ,
 // it highly recommended to read that before proceeding.


-namespace Opm::cuistl
+namespace Opm::gpuistl
 {

 template <class M, class X, class Y, int l>
-CuSeqILU0<M, X, Y, l>::CuSeqILU0(const M& A, field_type w)
+GpuSeqILU0<M, X, Y, l>::GpuSeqILU0(const M& A, field_type w)
    : m_underlyingMatrix(A)
    , m_w(w)
-    , m_LU(CuSparseMatrix<field_type>::fromMatrix(detail::makeMatrixWithNonzeroDiagonal(A)))
+    , m_LU(GpuSparseMatrix<field_type>::fromMatrix(detail::makeMatrixWithNonzeroDiagonal(A)))
    , m_temporaryStorage(m_LU.N() * m_LU.blockSize())
    , m_descriptionL(detail::createLowerDiagonalDescription())
    , m_descriptionU(detail::createUpperDiagonalDescription())
@ -70,13 +70,13 @@ CuSeqILU0<M, X, Y, l>::CuSeqILU0(const M& A, field_type w)

 template <class M, class X, class Y, int l>
 void
-CuSeqILU0<M, X, Y, l>::pre([[maybe_unused]] X& x, [[maybe_unused]] Y& b)
+GpuSeqILU0<M, X, Y, l>::pre([[maybe_unused]] X& x, [[maybe_unused]] Y& b)
 {
 }

 template <class M, class X, class Y, int l>
 void
-CuSeqILU0<M, X, Y, l>::apply(X& v, const Y& d)
+GpuSeqILU0<M, X, Y, l>::apply(X& v, const Y& d)
 {

    // We need to pass the solve routine a scalar to multiply.
@ -133,20 +133,20 @@ CuSeqILU0<M, X, Y, l>::apply(X& v, const Y& d)

 template <class M, class X, class Y, int l>
 void
-CuSeqILU0<M, X, Y, l>::post([[maybe_unused]] X& x)
+GpuSeqILU0<M, X, Y, l>::post([[maybe_unused]] X& x)
 {
 }

 template <class M, class X, class Y, int l>
 Dune::SolverCategory::Category
-CuSeqILU0<M, X, Y, l>::category() const
+GpuSeqILU0<M, X, Y, l>::category() const
 {
    return Dune::SolverCategory::sequential;
 }

 template <class M, class X, class Y, int l>
 void
-CuSeqILU0<M, X, Y, l>::update()
+GpuSeqILU0<M, X, Y, l>::update()
 {
    m_LU.updateNonzeroValues(detail::makeMatrixWithNonzeroDiagonal(m_underlyingMatrix));
    createILU();
@ -154,7 +154,7 @@ CuSeqILU0<M, X, Y, l>::update()

 template <class M, class X, class Y, int l>
 void
-CuSeqILU0<M, X, Y, l>::analyzeMatrix()
+GpuSeqILU0<M, X, Y, l>::analyzeMatrix()
 {

    if (!m_buffer) {
@ -226,7 +226,7 @@ CuSeqILU0<M, X, Y, l>::analyzeMatrix()

 template <class M, class X, class Y, int l>
 size_t
-CuSeqILU0<M, X, Y, l>::findBufferSize()
+GpuSeqILU0<M, X, Y, l>::findBufferSize()
 {
    // We have three calls that need buffers:
    //   1) LU decomposition
@ -290,7 +290,7 @@ CuSeqILU0<M, X, Y, l>::findBufferSize()

 template <class M, class X, class Y, int l>
 void
-CuSeqILU0<M, X, Y, l>::createILU()
+GpuSeqILU0<M, X, Y, l>::createILU()
 {
    OPM_ERROR_IF(!m_buffer, "Buffer not initialized. Call findBufferSize() then initialize with the appropiate size.");
    OPM_ERROR_IF(!m_analysisDone, "Analyzis of matrix not done. Call analyzeMatrix() first.");
@ -328,35 +328,35 @@ CuSeqILU0<M, X, Y, l>::createILU()

 template <class M, class X, class Y, int l>
 void
-CuSeqILU0<M, X, Y, l>::updateILUConfiguration()
+GpuSeqILU0<M, X, Y, l>::updateILUConfiguration()
 {
    auto bufferSize = findBufferSize();
    if (!m_buffer || m_buffer->dim() < bufferSize) {
-        m_buffer.reset(new CuVector<field_type>((bufferSize + sizeof(field_type) - 1) / sizeof(field_type)));
+        m_buffer.reset(new GpuVector<field_type>((bufferSize + sizeof(field_type) - 1) / sizeof(field_type)));
    }
    analyzeMatrix();
    createILU();
 }
-} // namespace Opm::cuistl
-#define INSTANTIATE_CUSEQILU0_DUNE(realtype, blockdim)                                                                 \
-    template class ::Opm::cuistl::CuSeqILU0<Dune::BCRSMatrix<Dune::FieldMatrix<realtype, blockdim, blockdim>>,         \
-                                            ::Opm::cuistl::CuVector<realtype>,                                         \
-                                            ::Opm::cuistl::CuVector<realtype>>;                                        \
-    template class ::Opm::cuistl::CuSeqILU0<Dune::BCRSMatrix<Opm::MatrixBlock<realtype, blockdim, blockdim>>,          \
-                                            ::Opm::cuistl::CuVector<realtype>,                                         \
-                                            ::Opm::cuistl::CuVector<realtype>>
+} // namespace Opm::gpuistl
+#define INSTANTIATE_GPUSEQILU0_DUNE(realtype, blockdim)                                                                 \
+    template class ::Opm::gpuistl::GpuSeqILU0<Dune::BCRSMatrix<Dune::FieldMatrix<realtype, blockdim, blockdim>>,         \
+                                            ::Opm::gpuistl::GpuVector<realtype>,                                         \
+                                            ::Opm::gpuistl::GpuVector<realtype>>;                                        \
+    template class ::Opm::gpuistl::GpuSeqILU0<Dune::BCRSMatrix<Opm::MatrixBlock<realtype, blockdim, blockdim>>,          \
+                                            ::Opm::gpuistl::GpuVector<realtype>,                                         \
+                                            ::Opm::gpuistl::GpuVector<realtype>>


-INSTANTIATE_CUSEQILU0_DUNE(double, 1);
-INSTANTIATE_CUSEQILU0_DUNE(double, 2);
-INSTANTIATE_CUSEQILU0_DUNE(double, 3);
-INSTANTIATE_CUSEQILU0_DUNE(double, 4);
-INSTANTIATE_CUSEQILU0_DUNE(double, 5);
-INSTANTIATE_CUSEQILU0_DUNE(double, 6);
+INSTANTIATE_GPUSEQILU0_DUNE(double, 1);
+INSTANTIATE_GPUSEQILU0_DUNE(double, 2);
+INSTANTIATE_GPUSEQILU0_DUNE(double, 3);
+INSTANTIATE_GPUSEQILU0_DUNE(double, 4);
+INSTANTIATE_GPUSEQILU0_DUNE(double, 5);
+INSTANTIATE_GPUSEQILU0_DUNE(double, 6);

-INSTANTIATE_CUSEQILU0_DUNE(float, 1);
-INSTANTIATE_CUSEQILU0_DUNE(float, 2);
-INSTANTIATE_CUSEQILU0_DUNE(float, 3);
-INSTANTIATE_CUSEQILU0_DUNE(float, 4);
-INSTANTIATE_CUSEQILU0_DUNE(float, 5);
-INSTANTIATE_CUSEQILU0_DUNE(float, 6);
+INSTANTIATE_GPUSEQILU0_DUNE(float, 1);
+INSTANTIATE_GPUSEQILU0_DUNE(float, 2);
+INSTANTIATE_GPUSEQILU0_DUNE(float, 3);
+INSTANTIATE_GPUSEQILU0_DUNE(float, 4);
+INSTANTIATE_GPUSEQILU0_DUNE(float, 5);
+INSTANTIATE_GPUSEQILU0_DUNE(float, 6);
--- a/opm/simulators/linalg/gpuistl/GpuSeqILU0.hpp
+++ b/opm/simulators/linalg/gpuistl/GpuSeqILU0.hpp
@ -16,19 +16,19 @@
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
-#ifndef OPM_CUSEQILU0_HPP
-#define OPM_CUSEQILU0_HPP
+#ifndef OPM_GPUSEQILU0_HPP
+#define OPM_GPUSEQILU0_HPP

 #include <dune/istl/preconditioner.hh>
 #include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
-#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
-#include <opm/simulators/linalg/cuistl/detail/CuMatrixDescription.hpp>
-#include <opm/simulators/linalg/cuistl/detail/CuSparseHandle.hpp>
-#include <opm/simulators/linalg/cuistl/detail/CuSparseResource.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/CuMatrixDescription.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/CuSparseHandle.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/CuSparseResource.hpp>



-namespace Opm::cuistl
+namespace Opm::gpuistl
 {
 //! \brief Sequential ILU0 preconditioner on the GPU through the CuSparse library.
 //!
@ -43,10 +43,10 @@ namespace Opm::cuistl
 //! \tparam l Ignored. Just there to have the same number of template arguments
 //!    as other preconditioners.
 //!
-//! \note We assume X and Y are both CuVector<real_type>, but we leave them as template
+//! \note We assume X and Y are both GpuVector<real_type>, but we leave them as template
 //! arguments in case of future additions.
 template <class M, class X, class Y, int l = 1>
-class CuSeqILU0 : public Dune::PreconditionerWithUpdate<X, Y>
+class GpuSeqILU0 : public Dune::PreconditionerWithUpdate<X, Y>
 {
 public:
    //! \brief The matrix type the preconditioner is for.
@ -64,7 +64,7 @@ public:
    //! \param A The matrix to operate on.
    //! \param w The relaxation factor.
    //!
-    CuSeqILU0(const M& A, field_type w);
+    GpuSeqILU0(const M& A, field_type w);

    //! \brief Prepare the preconditioner.
    //! \note Does nothing at the time being.
@ -110,18 +110,18 @@ private:
    //! This is the storage for the LU composition.
    //! Initially this will have the values of A, but will be
    //! modified in the constructor to be the proper LU decomposition.
-    CuSparseMatrix<field_type> m_LU;
+    GpuSparseMatrix<field_type> m_LU;

-    CuVector<field_type> m_temporaryStorage;
+    GpuVector<field_type> m_temporaryStorage;


-    detail::CuSparseMatrixDescriptionPtr m_descriptionL;
-    detail::CuSparseMatrixDescriptionPtr m_descriptionU;
+    detail::GpuSparseMatrixDescriptionPtr m_descriptionL;
+    detail::GpuSparseMatrixDescriptionPtr m_descriptionU;
    detail::CuSparseResource<bsrsv2Info_t> m_infoL;
    detail::CuSparseResource<bsrsv2Info_t> m_infoU;
    detail::CuSparseResource<bsrilu02Info_t> m_infoM;

-    std::unique_ptr<CuVector<field_type>> m_buffer;
+    std::unique_ptr<GpuVector<field_type>> m_buffer;
    detail::CuSparseHandle& m_cuSparseHandle;

    bool m_analysisDone = false;
@ -133,6 +133,6 @@ private:

    void updateILUConfiguration();
 };
-} // end namespace Opm::cuistl
+} // end namespace Opm::gpuistl

 #endif
--- a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp
+++ b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp
@ -22,14 +22,14 @@
 #include <dune/istl/bcrsmatrix.hh>
 #include <dune/istl/bvector.hh>
 #include <fmt/core.h>
-#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_constants.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_safe_call.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_wrapper.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/cusparse_constants.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/cusparse_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/cusparse_wrapper.hpp>
 #include <opm/simulators/linalg/matrixblock.hh>
 #include <type_traits>

-namespace Opm::cuistl
+namespace Opm::gpuistl
 {

 namespace
@ -61,7 +61,7 @@ namespace


 template <class T>
-CuSparseMatrix<T>::CuSparseMatrix(const T* nonZeroElements,
+GpuSparseMatrix<T>::GpuSparseMatrix(const T* nonZeroElements,
                                  const int* rowIndices,
                                  const int* columnIndices,
                                  size_t numberOfNonzeroBlocks,
@ -82,15 +82,15 @@ CuSparseMatrix<T>::CuSparseMatrix(const T* nonZeroElements,
 }

 template <class T>
-CuSparseMatrix<T>::~CuSparseMatrix()
+GpuSparseMatrix<T>::~GpuSparseMatrix()
 {
    // empty
 }

 template <typename T>
 template <typename MatrixType>
-CuSparseMatrix<T>
-CuSparseMatrix<T>::fromMatrix(const MatrixType& matrix, bool copyNonZeroElementsDirectly)
+GpuSparseMatrix<T>
+GpuSparseMatrix<T>::fromMatrix(const MatrixType& matrix, bool copyNonZeroElementsDirectly)
 {
    // TODO: Do we need this intermediate storage? Or this shuffling of data?
    std::vector<int> columnIndices;
@ -129,18 +129,18 @@ CuSparseMatrix<T>::fromMatrix(const MatrixType& matrix, bool copyNonZeroElements
    // Sanity check
    // h_rows and h_cols could be changed to 'unsigned int', but cusparse expects 'int'
    OPM_ERROR_IF(rowIndices[matrix.N()] != detail::to_int(matrix.nonzeroes()),
-                 "Error size of rows do not sum to number of nonzeroes in CuSparseMatrix.");
-    OPM_ERROR_IF(rowIndices.size() != numberOfRows + 1, "Row indices do not match for CuSparseMatrix.");
-    OPM_ERROR_IF(columnIndices.size() != numberOfNonzeroBlocks, "Column indices do not match for CuSparseMatrix.");
+                 "Error size of rows do not sum to number of nonzeroes in GpuSparseMatrix.");
+    OPM_ERROR_IF(rowIndices.size() != numberOfRows + 1, "Row indices do not match for GpuSparseMatrix.");
+    OPM_ERROR_IF(columnIndices.size() != numberOfNonzeroBlocks, "Column indices do not match for GpuSparseMatrix.");


    if (copyNonZeroElementsDirectly) {
        const T* nonZeroElements = nonZeroElementsTmp;
-        return CuSparseMatrix<T>(
+        return GpuSparseMatrix<T>(
            nonZeroElements, rowIndices.data(), columnIndices.data(), numberOfNonzeroBlocks, blockSize, numberOfRows);
    } else {
        auto nonZeroElementData = extractNonzeroValues<T>(matrix);
-        return CuSparseMatrix<T>(nonZeroElementData.data(),
+        return GpuSparseMatrix<T>(nonZeroElementData.data(),
                                 rowIndices.data(),
                                 columnIndices.data(),
                                 numberOfNonzeroBlocks,
@ -152,7 +152,7 @@ CuSparseMatrix<T>::fromMatrix(const MatrixType& matrix, bool copyNonZeroElements
 template <class T>
 template <class MatrixType>
 void
-CuSparseMatrix<T>::updateNonzeroValues(const MatrixType& matrix, bool copyNonZeroElementsDirectly)
+GpuSparseMatrix<T>::updateNonzeroValues(const MatrixType& matrix, bool copyNonZeroElementsDirectly)
 {
    OPM_ERROR_IF(nonzeroes() != matrix.nonzeroes(), "Matrix does not have the same number of non-zero elements.");
    OPM_ERROR_IF(matrix[0][0].N() != blockSize(), "Matrix does not have the same blocksize.");
@ -170,42 +170,42 @@ CuSparseMatrix<T>::updateNonzeroValues(const MatrixType& matrix, bool copyNonZer

 template <typename T>
 void
-CuSparseMatrix<T>::setUpperTriangular()
+GpuSparseMatrix<T>::setUpperTriangular()
 {
    OPM_CUSPARSE_SAFE_CALL(cusparseSetMatFillMode(m_matrixDescription->get(), CUSPARSE_FILL_MODE_UPPER));
 }

 template <typename T>
 void
-CuSparseMatrix<T>::setLowerTriangular()
+GpuSparseMatrix<T>::setLowerTriangular()
 {
    OPM_CUSPARSE_SAFE_CALL(cusparseSetMatFillMode(m_matrixDescription->get(), CUSPARSE_FILL_MODE_LOWER));
 }

 template <typename T>
 void
-CuSparseMatrix<T>::setUnitDiagonal()
+GpuSparseMatrix<T>::setUnitDiagonal()
 {
    OPM_CUSPARSE_SAFE_CALL(cusparseSetMatDiagType(m_matrixDescription->get(), CUSPARSE_DIAG_TYPE_UNIT));
 }

 template <typename T>
 void
-CuSparseMatrix<T>::setNonUnitDiagonal()
+GpuSparseMatrix<T>::setNonUnitDiagonal()
 {
    OPM_CUSPARSE_SAFE_CALL(cusparseSetMatDiagType(m_matrixDescription->get(), CUSPARSE_DIAG_TYPE_NON_UNIT));
 }

 template <typename T>
 void
-CuSparseMatrix<T>::mv(const CuVector<T>& x, CuVector<T>& y) const
+GpuSparseMatrix<T>::mv(const GpuVector<T>& x, GpuVector<T>& y) const
 {
    assertSameSize(x);
    assertSameSize(y);
    if (blockSize() < 2u) {
        OPM_THROW(
            std::invalid_argument,
-            "CuSparseMatrix<T>::usmv and CuSparseMatrix<T>::mv are only implemented for block sizes greater than 1.");
+            "GpuSparseMatrix<T>::usmv and GpuSparseMatrix<T>::mv are only implemented for block sizes greater than 1.");
    }
    const auto nonzeroValues = getNonZeroValues().data();

@ -232,14 +232,14 @@ CuSparseMatrix<T>::mv(const CuVector<T>& x, CuVector<T>& y) const

 template <typename T>
 void
-CuSparseMatrix<T>::umv(const CuVector<T>& x, CuVector<T>& y) const
+GpuSparseMatrix<T>::umv(const GpuVector<T>& x, GpuVector<T>& y) const
 {
    assertSameSize(x);
    assertSameSize(y);
    if (blockSize() < 2u) {
        OPM_THROW(
            std::invalid_argument,
-            "CuSparseMatrix<T>::usmv and CuSparseMatrix<T>::mv are only implemented for block sizes greater than 1.");
+            "GpuSparseMatrix<T>::usmv and GpuSparseMatrix<T>::mv are only implemented for block sizes greater than 1.");
    }

    const auto nonzeroValues = getNonZeroValues().data();
@ -267,14 +267,14 @@ CuSparseMatrix<T>::umv(const CuVector<T>& x, CuVector<T>& y) const

 template <typename T>
 void
-CuSparseMatrix<T>::usmv(T alpha, const CuVector<T>& x, CuVector<T>& y) const
+GpuSparseMatrix<T>::usmv(T alpha, const GpuVector<T>& x, GpuVector<T>& y) const
 {
    assertSameSize(x);
    assertSameSize(y);
    if (blockSize() < 2) {
        OPM_THROW(
            std::invalid_argument,
-            "CuSparseMatrix<T>::usmv and CuSparseMatrix<T>::mv are only implemented for block sizes greater than 1.");
+            "GpuSparseMatrix<T>::usmv and GpuSparseMatrix<T>::mv are only implemented for block sizes greater than 1.");
    }
    const auto numberOfRows = N();
    const auto numberOfNonzeroBlocks = nonzeroes();
@ -304,7 +304,7 @@ CuSparseMatrix<T>::usmv(T alpha, const CuVector<T>& x, CuVector<T>& y) const
 template <class T>
 template <class VectorType>
 void
-CuSparseMatrix<T>::assertSameSize(const VectorType& x) const
+GpuSparseMatrix<T>::assertSameSize(const VectorType& x) const
 {
    if (x.dim() != blockSize() * N()) {
        OPM_THROW(std::invalid_argument,
@ -317,17 +317,17 @@ CuSparseMatrix<T>::assertSameSize(const VectorType& x) const


 #define INSTANTIATE_CUSPARSE_DUNE_MATRIX_CONSTRUCTION_FUNTIONS(realtype, blockdim)                                     \
-    template CuSparseMatrix<realtype> CuSparseMatrix<realtype>::fromMatrix(                                            \
+    template GpuSparseMatrix<realtype> GpuSparseMatrix<realtype>::fromMatrix(                                            \
        const Dune::BCRSMatrix<Dune::FieldMatrix<realtype, blockdim, blockdim>>&, bool);                               \
-    template CuSparseMatrix<realtype> CuSparseMatrix<realtype>::fromMatrix(                                            \
+    template GpuSparseMatrix<realtype> GpuSparseMatrix<realtype>::fromMatrix(                                            \
        const Dune::BCRSMatrix<Opm::MatrixBlock<realtype, blockdim, blockdim>>&, bool);                                \
-    template void CuSparseMatrix<realtype>::updateNonzeroValues(                                                       \
+    template void GpuSparseMatrix<realtype>::updateNonzeroValues(                                                       \
        const Dune::BCRSMatrix<Dune::FieldMatrix<realtype, blockdim, blockdim>>&, bool);                               \
-    template void CuSparseMatrix<realtype>::updateNonzeroValues(                                                       \
+    template void GpuSparseMatrix<realtype>::updateNonzeroValues(                                                       \
        const Dune::BCRSMatrix<Opm::MatrixBlock<realtype, blockdim, blockdim>>&, bool)

-template class CuSparseMatrix<float>;
-template class CuSparseMatrix<double>;
+template class GpuSparseMatrix<float>;
+template class GpuSparseMatrix<double>;

 INSTANTIATE_CUSPARSE_DUNE_MATRIX_CONSTRUCTION_FUNTIONS(double, 1);
 INSTANTIATE_CUSPARSE_DUNE_MATRIX_CONSTRUCTION_FUNTIONS(double, 2);
@ -343,4 +343,4 @@ INSTANTIATE_CUSPARSE_DUNE_MATRIX_CONSTRUCTION_FUNTIONS(float, 4);
 INSTANTIATE_CUSPARSE_DUNE_MATRIX_CONSTRUCTION_FUNTIONS(float, 5);
 INSTANTIATE_CUSPARSE_DUNE_MATRIX_CONSTRUCTION_FUNTIONS(float, 6);

-} // namespace Opm::cuistl
+} // namespace Opm::gpuistl
--- a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp
+++ b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp
@ -16,23 +16,23 @@
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
-#ifndef OPM_CUSPARSEMATRIX_HPP
-#define OPM_CUSPARSEMATRIX_HPP
+#ifndef OPM_GPUSPARSEMATRIX_HPP
+#define OPM_GPUSPARSEMATRIX_HPP
 #include <cusparse.h>
 #include <iostream>
 #include <memory>
 #include <opm/common/ErrorMacros.hpp>
-#include <opm/simulators/linalg/cuistl/CuVector.hpp>
-#include <opm/simulators/linalg/cuistl/detail/CuMatrixDescription.hpp>
-#include <opm/simulators/linalg/cuistl/detail/CuSparseHandle.hpp>
-#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/CuMatrixDescription.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/CuSparseHandle.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/safe_conversion.hpp>
 #include <vector>

-namespace Opm::cuistl
+namespace Opm::gpuistl
 {

 /**
- * @brief The CuSparseMatrix class simple wrapper class for a CuSparse matrix.
+ * @brief The GpuSparseMatrix class simple wrapper class for a CuSparse matrix.
 *
 * @note we currently only support simple raw primitives for T (double and float). Block size is handled through the
 * block size parameter
@ -44,7 +44,7 @@ namespace Opm::cuistl
 * @note We only support Block Compressed Sparse Row Format (BSR) for now.
 */
 template <typename T>
-class CuSparseMatrix
+class GpuSparseMatrix
 {
 public:
    //! Create the sparse matrix specified by the raw data.
@ -60,7 +60,7 @@ public:
    //!
    //! \note We assume numberOfNonzeroBlocks, blockSize and numberOfRows all are representable as int due to
    //!       restrictions in the current version of cusparse. This might change in future versions.
-    CuSparseMatrix(const T* nonZeroElements,
+    GpuSparseMatrix(const T* nonZeroElements,
                   const int* rowIndices,
                   const int* columnIndices,
                   size_t numberOfNonzeroBlocks,
@ -70,14 +70,14 @@ public:
    /**
     * We don't want to be able to copy this for now (too much hassle in copying the cusparse resources)
     */
-    CuSparseMatrix(const CuSparseMatrix&) = delete;
+    GpuSparseMatrix(const GpuSparseMatrix&) = delete;

    /**
     * We don't want to be able to copy this for now (too much hassle in copying the cusparse resources)
     */
-    CuSparseMatrix& operator=(const CuSparseMatrix&) = delete;
+    GpuSparseMatrix& operator=(const GpuSparseMatrix&) = delete;

-    virtual ~CuSparseMatrix();
+    virtual ~GpuSparseMatrix();

    /**
     * @brief fromMatrix creates a new matrix with the same block size and values as the given matrix
@ -89,7 +89,7 @@ public:
     * @tparam MatrixType is assumed to be a Dune::BCRSMatrix compatible matrix.
     */
    template <class MatrixType>
-    static CuSparseMatrix<T> fromMatrix(const MatrixType& matrix, bool copyNonZeroElementsDirectly = false);
+    static GpuSparseMatrix<T> fromMatrix(const MatrixType& matrix, bool copyNonZeroElementsDirectly = false);

    /**
     * @brief setUpperTriangular sets the CuSparse flag that this is an upper diagonal (with unit diagonal) matrix.
@ -144,7 +144,7 @@ public:
     *
     * @note Read the CuSPARSE documentation on Block Compressed Sparse Row Format (BSR) for the exact ordering.
     */
-    CuVector<T>& getNonZeroValues()
+    GpuVector<T>& getNonZeroValues()
    {
        return m_nonZeroElements;
    }
@ -154,7 +154,7 @@ public:
     *
     * @note Read the CuSPARSE documentation on Block Compressed Sparse Row Format (BSR) for the exact ordering.
     */
-    const CuVector<T>& getNonZeroValues() const
+    const GpuVector<T>& getNonZeroValues() const
    {
        return m_nonZeroElements;
    }
@ -164,7 +164,7 @@ public:
     *
     * @note Read the CuSPARSE documentation on Block Compressed Sparse Row Format (BSR) for the exact ordering.
     */
-    CuVector<int>& getRowIndices()
+    GpuVector<int>& getRowIndices()
    {
        return m_rowIndices;
    }
@ -174,7 +174,7 @@ public:
     *
     * @note Read the CuSPARSE documentation on Block Compressed Sparse Row Format (BSR) for the exact ordering.
     */
-    const CuVector<int>& getRowIndices() const
+    const GpuVector<int>& getRowIndices() const
    {
        return m_rowIndices;
    }
@ -184,7 +184,7 @@ public:
     *
     * @return Read the CuSPARSE documentation on Block Compressed Sparse Row Format (BSR) for the exact ordering.
     */
-    CuVector<int>& getColumnIndices()
+    GpuVector<int>& getColumnIndices()
    {
        return m_columnIndices;
    }
@ -194,7 +194,7 @@ public:
     *
     * @return Read the CuSPARSE documentation on Block Compressed Sparse Row Format (BSR) for the exact ordering.
     */
-    const CuVector<int>& getColumnIndices() const
+    const GpuVector<int>& getColumnIndices() const
    {
        return m_columnIndices;
    }
@ -233,7 +233,7 @@ public:
     *
     * This description is needed for most calls to the CuSparse library
     */
-    detail::CuSparseMatrixDescription& getDescription()
+    detail::GpuSparseMatrixDescription& getDescription()
    {
        return *m_matrixDescription;
    }
@ -245,7 +245,7 @@ public:
     *
     * @note Due to limitations of CuSparse, this is only supported for block sizes greater than 1.
     */
-    virtual void mv(const CuVector<T>& x, CuVector<T>& y) const;
+    virtual void mv(const GpuVector<T>& x, GpuVector<T>& y) const;

    /**
     * @brief umv computes y=Ax+y
@ -254,7 +254,7 @@ public:
     *
     * @note Due to limitations of CuSparse, this is only supported for block sizes greater than 1.
     */
-    virtual void umv(const CuVector<T>& x, CuVector<T>& y) const;
+    virtual void umv(const GpuVector<T>& x, GpuVector<T>& y) const;


    /**
@ -264,7 +264,7 @@ public:
     *
     * @note Due to limitations of CuSparse, this is only supported for block sizes greater than 1.
     */
-    virtual void usmv(T alpha, const CuVector<T>& x, CuVector<T>& y) const;
+    virtual void usmv(T alpha, const GpuVector<T>& x, GpuVector<T>& y) const;

    /**
     * @brief updateNonzeroValues updates the non-zero values by using the non-zero values of the supplied matrix
@ -280,9 +280,9 @@ public:
    void updateNonzeroValues(const MatrixType& matrix, bool copyNonZeroElementsDirectly = false);

 private:
-    CuVector<T> m_nonZeroElements;
-    CuVector<int> m_columnIndices;
-    CuVector<int> m_rowIndices;
+    GpuVector<T> m_nonZeroElements;
+    GpuVector<int> m_columnIndices;
+    GpuVector<int> m_rowIndices;

    // Notice that we store these three as int to make sure we are cusparse compatible.
    //
@ -292,11 +292,11 @@ private:
    const int m_numberOfRows;
    const int m_blockSize;

-    detail::CuSparseMatrixDescriptionPtr m_matrixDescription;
+    detail::GpuSparseMatrixDescriptionPtr m_matrixDescription;
    detail::CuSparseHandle& m_cusparseHandle;

    template <class VectorType>
    void assertSameSize(const VectorType& vector) const;
 };
-} // namespace Opm::cuistl
+} // namespace Opm::gpuistl
 #endif
--- a/opm/simulators/linalg/gpuistl/GpuVector.cpp
+++ b/opm/simulators/linalg/gpuistl/GpuVector.cpp
@ -20,41 +20,41 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <fmt/core.h>
-#include <opm/simulators/linalg/cuistl/CuVector.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cublas_safe_call.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cublas_wrapper.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
-#include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/cublas_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/cublas_wrapper.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/vector_operations.hpp>

-namespace Opm::cuistl
+namespace Opm::gpuistl
 {

 template <class T>
-CuVector<T>::CuVector(const std::vector<T>& data)
-    : CuVector(data.data(), detail::to_int(data.size()))
+GpuVector<T>::GpuVector(const std::vector<T>& data)
+    : GpuVector(data.data(), detail::to_int(data.size()))
 {
 }

 template <class T>
-CuVector<T>::CuVector(const size_t numberOfElements)
+GpuVector<T>::GpuVector(const size_t numberOfElements)
    : m_numberOfElements(detail::to_int(numberOfElements))
    , m_cuBlasHandle(detail::CuBlasHandle::getInstance())
 {
-    OPM_CUDA_SAFE_CALL(cudaMalloc(&m_dataOnDevice, sizeof(T) * detail::to_size_t(m_numberOfElements)));
+    OPM_GPU_SAFE_CALL(cudaMalloc(&m_dataOnDevice, sizeof(T) * detail::to_size_t(m_numberOfElements)));
 }

 template <class T>
-CuVector<T>::CuVector(const T* dataOnHost, const size_t numberOfElements)
-    : CuVector(numberOfElements)
+GpuVector<T>::GpuVector(const T* dataOnHost, const size_t numberOfElements)
+    : GpuVector(numberOfElements)
 {

-    OPM_CUDA_SAFE_CALL(cudaMemcpy(
+    OPM_GPU_SAFE_CALL(cudaMemcpy(
        m_dataOnDevice, dataOnHost, detail::to_size_t(m_numberOfElements) * sizeof(T), cudaMemcpyHostToDevice));
 }

 template <class T>
-CuVector<T>&
-CuVector<T>::operator=(T scalar)
+GpuVector<T>&
+GpuVector<T>::operator=(T scalar)
 {
    assertHasElements();
    detail::setVectorValue(data(), detail::to_size_t(m_numberOfElements), scalar);
@ -62,13 +62,13 @@ CuVector<T>::operator=(T scalar)
 }

 template <class T>
-CuVector<T>&
-CuVector<T>::operator=(const CuVector<T>& other)
+GpuVector<T>&
+GpuVector<T>::operator=(const GpuVector<T>& other)
 {
    assertHasElements();
    assertSameSize(other);

-    OPM_CUDA_SAFE_CALL(cudaMemcpy(m_dataOnDevice,
+    OPM_GPU_SAFE_CALL(cudaMemcpy(m_dataOnDevice,
                                  other.m_dataOnDevice,
                                  detail::to_size_t(m_numberOfElements) * sizeof(T),
                                  cudaMemcpyDeviceToDevice));
@ -76,33 +76,33 @@ CuVector<T>::operator=(const CuVector<T>& other)
 }

 template <class T>
-CuVector<T>::CuVector(const CuVector<T>& other)
-    : CuVector(other.m_numberOfElements)
+GpuVector<T>::GpuVector(const GpuVector<T>& other)
+    : GpuVector(other.m_numberOfElements)
 {
    assertHasElements();
    assertSameSize(other);
-    OPM_CUDA_SAFE_CALL(cudaMemcpy(m_dataOnDevice,
+    OPM_GPU_SAFE_CALL(cudaMemcpy(m_dataOnDevice,
                                  other.m_dataOnDevice,
                                  detail::to_size_t(m_numberOfElements) * sizeof(T),
                                  cudaMemcpyDeviceToDevice));
 }

 template <class T>
-CuVector<T>::~CuVector()
+GpuVector<T>::~GpuVector()
 {
-    OPM_CUDA_WARN_IF_ERROR(cudaFree(m_dataOnDevice));
+    OPM_GPU_WARN_IF_ERROR(cudaFree(m_dataOnDevice));
 }

 template <typename T>
 const T*
-CuVector<T>::data() const
+GpuVector<T>::data() const
 {
    return m_dataOnDevice;
 }

 template <typename T>
-typename CuVector<T>::size_type
-CuVector<T>::dim() const
+typename GpuVector<T>::size_type
+GpuVector<T>::dim() const
 {
    // Note that there is no way for m_numberOfElements to be non-positive,
    // but for sanity we still use the safe conversion function here.
@ -114,7 +114,7 @@ CuVector<T>::dim() const

 template <typename T>
 std::vector<T>
-CuVector<T>::asStdVector() const
+GpuVector<T>::asStdVector() const
 {
    std::vector<T> temporary(detail::to_size_t(m_numberOfElements));
    copyToHost(temporary);
@ -123,21 +123,21 @@ CuVector<T>::asStdVector() const

 template <typename T>
 void
-CuVector<T>::setZeroAtIndexSet(const CuVector<int>& indexSet)
+GpuVector<T>::setZeroAtIndexSet(const GpuVector<int>& indexSet)
 {
    detail::setZeroAtIndexSet(m_dataOnDevice, indexSet.dim(), indexSet.data());
 }

 template <typename T>
 void
-CuVector<T>::assertSameSize(const CuVector<T>& x) const
+GpuVector<T>::assertSameSize(const GpuVector<T>& x) const
 {
    assertSameSize(x.m_numberOfElements);
 }

 template <typename T>
 void
-CuVector<T>::assertSameSize(int size) const
+GpuVector<T>::assertSameSize(int size) const
 {
    if (size != m_numberOfElements) {
        OPM_THROW(std::invalid_argument,
@ -147,7 +147,7 @@ CuVector<T>::assertSameSize(int size) const

 template <typename T>
 void
-CuVector<T>::assertHasElements() const
+GpuVector<T>::assertHasElements() const
 {
    if (m_numberOfElements <= 0) {
        OPM_THROW(std::invalid_argument, "We have 0 elements");
@ -156,14 +156,14 @@ CuVector<T>::assertHasElements() const

 template <typename T>
 T*
-CuVector<T>::data()
+GpuVector<T>::data()
 {
    return m_dataOnDevice;
 }

 template <class T>
-CuVector<T>&
-CuVector<T>::operator*=(const T& scalar)
+GpuVector<T>&
+GpuVector<T>::operator*=(const T& scalar)
 {
    assertHasElements();
    OPM_CUBLAS_SAFE_CALL(detail::cublasScal(m_cuBlasHandle.get(), m_numberOfElements, &scalar, data(), 1));
@ -171,8 +171,8 @@ CuVector<T>::operator*=(const T& scalar)
 }

 template <class T>
-CuVector<T>&
-CuVector<T>::axpy(T alpha, const CuVector<T>& y)
+GpuVector<T>&
+GpuVector<T>::axpy(T alpha, const GpuVector<T>& y)
 {
    assertHasElements();
    assertSameSize(y);
@ -182,7 +182,7 @@ CuVector<T>::axpy(T alpha, const CuVector<T>& y)

 template <class T>
 T
-CuVector<T>::dot(const CuVector<T>& other) const
+GpuVector<T>::dot(const GpuVector<T>& other) const
 {
    assertHasElements();
    assertSameSize(other);
@ -193,7 +193,7 @@ CuVector<T>::dot(const CuVector<T>& other) const
 }
 template <class T>
 T
-CuVector<T>::two_norm() const
+GpuVector<T>::two_norm() const
 {
    assertHasElements();
    T result = T(0);
@ -203,14 +203,14 @@ CuVector<T>::two_norm() const

 template <typename T>
 T
-CuVector<T>::dot(const CuVector<T>& other, const CuVector<int>& indexSet, CuVector<T>& buffer) const
+GpuVector<T>::dot(const GpuVector<T>& other, const GpuVector<int>& indexSet, GpuVector<T>& buffer) const
 {
    return detail::innerProductAtIndices(m_cuBlasHandle.get(), m_dataOnDevice, other.data(), buffer.data(), indexSet.dim(), indexSet.data());
 }

 template <typename T>
 T
-CuVector<T>::two_norm(const CuVector<int>& indexSet, CuVector<T>& buffer) const
+GpuVector<T>::two_norm(const GpuVector<int>& indexSet, GpuVector<T>& buffer) const
 {
    // TODO: [perf] Optimize this to a single call
    return std::sqrt(this->dot(*this, indexSet, buffer));
@ -218,23 +218,23 @@ CuVector<T>::two_norm(const CuVector<int>& indexSet, CuVector<T>& buffer) const

 template <typename T>
 T
-CuVector<T>::dot(const CuVector<T>& other, const CuVector<int>& indexSet) const
+GpuVector<T>::dot(const GpuVector<T>& other, const GpuVector<int>& indexSet) const
 {
-    CuVector<T> buffer(indexSet.dim());
+    GpuVector<T> buffer(indexSet.dim());
    return detail::innerProductAtIndices(m_cuBlasHandle.get(), m_dataOnDevice, other.data(), buffer.data(), indexSet.dim(), indexSet.data());
 }

 template <typename T>
 T
-CuVector<T>::two_norm(const CuVector<int>& indexSet) const
+GpuVector<T>::two_norm(const GpuVector<int>& indexSet) const
 {
-    CuVector<T> buffer(indexSet.dim());
+    GpuVector<T> buffer(indexSet.dim());
    // TODO: [perf] Optimize this to a single call
    return std::sqrt(this->dot(*this, indexSet, buffer));
 }
 template <class T>
-CuVector<T>&
-CuVector<T>::operator+=(const CuVector<T>& other)
+GpuVector<T>&
+GpuVector<T>::operator+=(const GpuVector<T>& other)
 {
    assertHasElements();
    assertSameSize(other);
@ -243,8 +243,8 @@ CuVector<T>::operator+=(const CuVector<T>& other)
 }

 template <class T>
-CuVector<T>&
-CuVector<T>::operator-=(const CuVector<T>& other)
+GpuVector<T>&
+GpuVector<T>::operator-=(const GpuVector<T>& other)
 {
    assertHasElements();
    assertSameSize(other);
@ -255,7 +255,7 @@ CuVector<T>::operator-=(const CuVector<T>& other)

 template <class T>
 void
-CuVector<T>::copyFromHost(const T* dataPointer, size_t numberOfElements)
+GpuVector<T>::copyFromHost(const T* dataPointer, size_t numberOfElements)
 {
    if (numberOfElements > dim()) {
        OPM_THROW(std::runtime_error,
@ -263,45 +263,45 @@ CuVector<T>::copyFromHost(const T* dataPointer, size_t numberOfElements)
                              dim(),
                              numberOfElements));
    }
-    OPM_CUDA_SAFE_CALL(cudaMemcpy(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice));
+    OPM_GPU_SAFE_CALL(cudaMemcpy(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice));
 }

 template <class T>
 void
-CuVector<T>::copyToHost(T* dataPointer, size_t numberOfElements) const
+GpuVector<T>::copyToHost(T* dataPointer, size_t numberOfElements) const
 {
    assertSameSize(detail::to_int(numberOfElements));
-    OPM_CUDA_SAFE_CALL(cudaMemcpy(dataPointer, data(), numberOfElements * sizeof(T), cudaMemcpyDeviceToHost));
+    OPM_GPU_SAFE_CALL(cudaMemcpy(dataPointer, data(), numberOfElements * sizeof(T), cudaMemcpyDeviceToHost));
 }

 template <class T>
 void
-CuVector<T>::copyFromHost(const std::vector<T>& data)
+GpuVector<T>::copyFromHost(const std::vector<T>& data)
 {
    copyFromHost(data.data(), data.size());
 }
 template <class T>
 void
-CuVector<T>::copyToHost(std::vector<T>& data) const
+GpuVector<T>::copyToHost(std::vector<T>& data) const
 {
    copyToHost(data.data(), data.size());
 }

 template <typename T>
 void
-CuVector<T>::prepareSendBuf(CuVector<T>& buffer, const CuVector<int>& indexSet) const
+GpuVector<T>::prepareSendBuf(GpuVector<T>& buffer, const GpuVector<int>& indexSet) const
 {
    return detail::prepareSendBuf(m_dataOnDevice, buffer.data(), indexSet.dim(), indexSet.data());
 }
 template <typename T>
 void
-CuVector<T>::syncFromRecvBuf(CuVector<T>& buffer, const CuVector<int>& indexSet) const
+GpuVector<T>::syncFromRecvBuf(GpuVector<T>& buffer, const GpuVector<int>& indexSet) const
 {
    return detail::syncFromRecvBuf(m_dataOnDevice, buffer.data(), indexSet.dim(), indexSet.data());
 }

-template class CuVector<double>;
-template class CuVector<float>;
-template class CuVector<int>;
+template class GpuVector<double>;
+template class GpuVector<float>;
+template class GpuVector<int>;

-} // namespace Opm::cuistl
+} // namespace Opm::gpuistl
--- a/opm/simulators/linalg/gpuistl/GpuVector.hpp
+++ b/opm/simulators/linalg/gpuistl/GpuVector.hpp
@ -16,24 +16,24 @@
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
-#ifndef OPM_CUVECTOR_HEADER_HPP
-#define OPM_CUVECTOR_HEADER_HPP
+#ifndef OPM_GPUVECTOR_HEADER_HPP
+#define OPM_GPUVECTOR_HEADER_HPP
 #include <dune/common/fvector.hh>
 #include <dune/istl/bvector.hh>
 #include <exception>
 #include <fmt/core.h>
 #include <opm/common/ErrorMacros.hpp>
-#include <opm/simulators/linalg/cuistl/detail/CuBlasHandle.hpp>
-#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/CuBlasHandle.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/safe_conversion.hpp>
 #include <vector>
 #include <string>


-namespace Opm::cuistl
+namespace Opm::gpuistl
 {

 /**
- * @brief The CuVector class is a simple (arithmetic) vector class for the GPU.
+ * @brief The GpuVector class is a simple (arithmetic) vector class for the GPU.
 *
 * @note we currently only support simple raw primitives for T (double, float and int)
 *
@ -45,12 +45,12 @@ namespace Opm::cuistl
 * Example usage:
 *
 * @code{.cpp}
- * #include <opm/simulators/linalg/cuistl/CuVector.hpp>
+ * #include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
 *
 * void someFunction() {
 *     auto someDataOnCPU = std::vector<double>({1.0, 2.0, 42.0, 59.9451743, 10.7132692});
 *
- *     auto dataOnGPU = CuVector<double>(someDataOnCPU);
+ *     auto dataOnGPU = GpuVector<double>(someDataOnCPU);
 *
 *     // Multiply by 4.0:
 *     dataOnGPU *= 4.0;
@ -62,7 +62,7 @@ namespace Opm::cuistl
 * @tparam T the type to store. Can be either float, double or int.
 */
 template <typename T>
-class CuVector
+class GpuVector
 {
 public:
    using field_type = T;
@ -70,17 +70,17 @@ public:


    /**
-     * @brief CuVector allocates new GPU memory of the same size as other and copies the content of the other vector to
+     * @brief GpuVector allocates new GPU memory of the same size as other and copies the content of the other vector to
     * this newly allocated memory.
     *
     * @note This does synchronous transfer.
     *
     * @param other the vector to copy from
     */
-    CuVector(const CuVector<T>& other);
+    GpuVector(const GpuVector<T>& other);

    /**
-     * @brief CuVector allocates new GPU memory of the same size as data and copies the content of the data vector to
+     * @brief GpuVector allocates new GPU memory of the same size as data and copies the content of the data vector to
     * this newly allocated memory.
     *
     * @note This does CPU to GPU transfer.
@ -90,7 +90,7 @@ public:
     *
     * @param data the vector to copy from
     */
-    explicit CuVector(const std::vector<T>& data);
+    explicit GpuVector(const std::vector<T>& data);

    /**
     * @brief operator= copies the content of the data vector to the memory of this vector.
@ -100,7 +100,7 @@ public:
     *
     * @param other the vector to copy from
     */
-    CuVector& operator=(const CuVector<T>& other);
+    GpuVector& operator=(const GpuVector<T>& other);

    /**
     * @brief operator= sets the whole vector equal to the scalar value.
@ -109,20 +109,20 @@ public:
     *
     * @param scalar the value all elements will be set to.
     */
-    CuVector& operator=(T scalar);
+    GpuVector& operator=(T scalar);

    /**
-     * @brief CuVector allocates new GPU memory of size numberOfElements * sizeof(T)
+     * @brief GpuVector allocates new GPU memory of size numberOfElements * sizeof(T)
     *
     * @note For now numberOfElements needs to be within the limits of int due to restrictions in cublas
     *
     * @param numberOfElements number of T elements to allocate
     */
-    explicit CuVector(const size_t numberOfElements);
+    explicit GpuVector(const size_t numberOfElements);


    /**
-     * @brief CuVector allocates new GPU memory of size numberOfElements * sizeof(T) and copies numberOfElements from
+     * @brief GpuVector allocates new GPU memory of size numberOfElements * sizeof(T) and copies numberOfElements from
     * data
     *
     * @note This assumes the data is on the CPU.
@ -132,12 +132,12 @@ public:
     *
     * @note For now numberOfElements needs to be within the limits of int due to restrictions in cublas
     */
-    CuVector(const T* dataOnHost, const size_t numberOfElements);
+    GpuVector(const T* dataOnHost, const size_t numberOfElements);

    /**
-     * @brief ~CuVector calls cudaFree
+     * @brief ~GpuVector calls cudaFree
     */
-    virtual ~CuVector();
+    virtual ~GpuVector();

    /**
     * @return the raw pointer to the GPU data
@ -162,7 +162,7 @@ public:
        // TODO: [perf] vector.dim() can be replaced by bvector.N() * BlockDimension
        if (detail::to_size_t(m_numberOfElements) != bvector.dim()) {
            OPM_THROW(std::runtime_error,
-                      fmt::format("Given incompatible vector size. CuVector has size {}, \n"
+                      fmt::format("Given incompatible vector size. GpuVector has size {}, \n"
                                  "however, BlockVector has N() = {}, and dim = {}.",
                                  m_numberOfElements,
                                  bvector.N(),
@ -185,7 +185,7 @@ public:
        // TODO: [perf] vector.dim() can be replaced by bvector.N() * BlockDimension
        if (detail::to_size_t(m_numberOfElements) != bvector.dim()) {
            OPM_THROW(std::runtime_error,
-                      fmt::format("Given incompatible vector size. CuVector has size {},\n however, the BlockVector "
+                      fmt::format("Given incompatible vector size. GpuVector has size {},\n however, the BlockVector "
                                  "has has N() = {}, and dim() = {}.",
                                  m_numberOfElements,
                                  bvector.N(),
@ -231,8 +231,8 @@ public:
     */
    void copyToHost(std::vector<T>& data) const;

-    void prepareSendBuf(CuVector<T>& buffer, const CuVector<int>& indexSet) const;
-    void syncFromRecvBuf(CuVector<T>& buffer, const CuVector<int>& indexSet) const;
+    void prepareSendBuf(GpuVector<T>& buffer, const GpuVector<int>& indexSet) const;
+    void syncFromRecvBuf(GpuVector<T>& buffer, const GpuVector<int>& indexSet) const;

    /**
     * @brief operator *= multiplies every element by scalar
@ -242,7 +242,7 @@ public:
     *
     * @note int is not supported
     */
-    CuVector<T>& operator*=(const T& scalar);
+    GpuVector<T>& operator*=(const T& scalar);

    /**
     * @brief axpy sets this vector equal to this + alha * y
@ -252,7 +252,7 @@ public:
     * @note this will call CuBlas in the background
     * @note int is not supported
     */
-    CuVector<T>& axpy(T alpha, const CuVector<T>& y);
+    GpuVector<T>& axpy(T alpha, const GpuVector<T>& y);

    /**
     * @brief operator+= adds the other vector to this vector
@ -260,7 +260,7 @@ public:
     * @note this will call CuBlas in the background
     * @note int is not supported
     */
-    CuVector<T>& operator+=(const CuVector<T>& other);
+    GpuVector<T>& operator+=(const GpuVector<T>& other);

    /**
     * @brief operator-= subtracts the other vector from this vector
@ -268,7 +268,7 @@ public:
     * @note this will call CuBlas in the background
     * @note int is not supported
     */
-    CuVector<T>& operator-=(const CuVector<T>& other);
+    GpuVector<T>& operator-=(const GpuVector<T>& other);

    /**
     * @brief dot computes the dot product (standard inner product) against the other vector
@ -278,7 +278,7 @@ public:
     *
     * @return the result on the inner product
     */
-    T dot(const CuVector<T>& other) const;
+    T dot(const GpuVector<T>& other) const;

    /**
     * @brief returns the l2 norm of the vector
@ -294,14 +294,14 @@ public:
     *
     * @note int is not supported
     */
-    T dot(const CuVector<T>& other, const CuVector<int>& indexSet, CuVector<T>& buffer) const;
+    T dot(const GpuVector<T>& other, const GpuVector<int>& indexSet, GpuVector<T>& buffer) const;

    /**
     * Computes the norm sqrt(sum_i this[indexSet[i]] * this[indexSet[i]])
     *
     * @note int is not supported
     */
-    T two_norm(const CuVector<int>& indexSet, CuVector<T>& buffer) const;
+    T two_norm(const GpuVector<int>& indexSet, GpuVector<T>& buffer) const;


    /**
@ -309,14 +309,14 @@ public:
     *
     * @note int is not supported
     */
-    T dot(const CuVector<T>& other, const CuVector<int>& indexSet) const;
+    T dot(const GpuVector<T>& other, const GpuVector<int>& indexSet) const;

    /**
     * Computes the norm sqrt(sum_i this[indexSet[i]] * this[indexSet[i]])
     *
     * @note int is not supported
     */
-    T two_norm(const CuVector<int>& indexSet) const;
+    T two_norm(const GpuVector<int>& indexSet) const;


    /**
@ -363,9 +363,9 @@ public:
     * }
     * @endcode
     */
-    void setZeroAtIndexSet(const CuVector<int>& indexSet);
+    void setZeroAtIndexSet(const GpuVector<int>& indexSet);

-    // Slow method that creates a string representation of a CuVector for debug purposes
+    // Slow method that creates a string representation of a GpuVector for debug purposes
    std::string toDebugString()
    {
        std::vector<T> v = asStdVector();
@ -385,11 +385,11 @@ private:
    const int m_numberOfElements;
    detail::CuBlasHandle& m_cuBlasHandle;

-    void assertSameSize(const CuVector<T>& other) const;
+    void assertSameSize(const GpuVector<T>& other) const;
    void assertSameSize(int size) const;

    void assertHasElements() const;
 };

-} // namespace Opm::cuistl
+} // namespace Opm::gpuistl
 #endif
--- a/opm/simulators/linalg/gpuistl/GpuView.cpp
+++ b/opm/simulators/linalg/gpuistl/GpuView.cpp
@ -20,21 +20,21 @@
 #include <cuda_runtime.h>
 #include <algorithm>
 #include <fmt/core.h>
-#include <opm/simulators/linalg/cuistl/CuView.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuView.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>

-namespace Opm::cuistl
+namespace Opm::gpuistl
 {

 template <class T>
-CuView<T>::CuView(std::vector<T>& data)
-    : CuView(data.data(), data.size())
+GpuView<T>::GpuView(std::vector<T>& data)
+    : GpuView(data.data(), data.size())
 {
 }

 template <typename T>
 std::vector<T>
-CuView<T>::asStdVector() const
+GpuView<T>::asStdVector() const
 {
    std::vector<T> temporary(m_numberOfElements);
    copyToHost(temporary);
@ -43,7 +43,7 @@ CuView<T>::asStdVector() const

 template <class T>
 void
-CuView<T>::copyFromHost(const T* dataPointer, size_t numberOfElements)
+GpuView<T>::copyFromHost(const T* dataPointer, size_t numberOfElements)
 {
    if (numberOfElements > size()) {
        OPM_THROW(std::runtime_error,
@ -51,32 +51,32 @@ CuView<T>::copyFromHost(const T* dataPointer, size_t numberOfElements)
                              size(),
                              numberOfElements));
    }
-    OPM_CUDA_SAFE_CALL(cudaMemcpy(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice));
+    OPM_GPU_SAFE_CALL(cudaMemcpy(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice));
 }

 template <class T>
 void
-CuView<T>::copyToHost(T* dataPointer, size_t numberOfElements) const
+GpuView<T>::copyToHost(T* dataPointer, size_t numberOfElements) const
 {
    assertSameSize(numberOfElements);
-    OPM_CUDA_SAFE_CALL(cudaMemcpy(dataPointer, data(), numberOfElements * sizeof(T), cudaMemcpyDeviceToHost));
+    OPM_GPU_SAFE_CALL(cudaMemcpy(dataPointer, data(), numberOfElements * sizeof(T), cudaMemcpyDeviceToHost));
 }

 template <class T>
 void
-CuView<T>::copyFromHost(const std::vector<T>& data)
+GpuView<T>::copyFromHost(const std::vector<T>& data)
 {
    copyFromHost(data.data(), data.size());
 }
 template <class T>
 void
-CuView<T>::copyToHost(std::vector<T>& data) const
+GpuView<T>::copyToHost(std::vector<T>& data) const
 {
    copyToHost(data.data(), data.size());
 }

-template class CuView<double>;
-template class CuView<float>;
-template class CuView<int>;
+template class GpuView<double>;
+template class GpuView<float>;
+template class GpuView<int>;

-} // namespace Opm::cuistl
+} // namespace Opm::gpuistl
--- a/opm/simulators/linalg/gpuistl/GpuView.hpp
+++ b/opm/simulators/linalg/gpuistl/GpuView.hpp
@ -16,14 +16,14 @@
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
-#ifndef OPM_CUVIEW_HEADER_HPP
-#define OPM_CUVIEW_HEADER_HPP
+#ifndef OPM_GPUVIEW_HEADER_HPP
+#define OPM_GPUVIEW_HEADER_HPP

 #include <dune/common/fvector.hh>

 #include <opm/common/ErrorMacros.hpp>

-#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/safe_conversion.hpp>

 #include <stdexcept>
 #include <vector>
@ -37,37 +37,37 @@
 #define OPM_IS_INSIDE_DEVICE_FUNCTION_TEMPORARY 0
 #endif

-namespace Opm::cuistl
+namespace Opm::gpuistl
 {

 /**
- * @brief The CuView class is provides a view of some data allocated on the GPU
+ * @brief The GpuView class is provides a view of some data allocated on the GPU
 * Essenstially is only stores a pointer and a size.
 *
 *  This class supports being used from inside a CUDA/HIP Kernel.
 *  Implementations are placed in this headerfile for functions that may be called
 *  inside a kernel to avoid expensive RDC (relocatable device code)
 *
- * The view will typically provide a view into a CuBuffer and be able to
+ * The view will typically provide a view into a GpuBuffer and be able to
 * manipulate the data within it
 *
 * @param T Type of the data we store, typically int/float/double w/o const specifier
 *
 **/
 template <typename T>
-class CuView
+class GpuView
 {
 public:
    using value_type = T;
    /**
     * @brief Default constructor that will initialize cublas and allocate 0 bytes of memory
     */
-    explicit CuView() = default;
+    explicit GpuView() = default;

    //TODO: we probably dont need anything like this or is it useful to have views also be able to handle things on CPU?
    /// @brief constructor based on std::vectors, this will make a view on the CPU
    /// @param data std vector to pr
-    CuView(std::vector<T>& data);
+    GpuView(std::vector<T>& data);

    /**
     * @brief operator[] to retrieve a reference to an item in the buffer
@ -95,7 +95,7 @@ public:


    /**
-     * @brief CuView allocates new GPU memory of size numberOfElements * sizeof(T) and copies numberOfElements from
+     * @brief GpuView allocates new GPU memory of size numberOfElements * sizeof(T) and copies numberOfElements from
     * data
     *
     * @note This assumes the data is on the CPU.
@ -103,15 +103,15 @@ public:
     * @param numberOfElements number of T elements to allocate
     * @param dataOnHost data on host/CPU
     */
-    __host__ __device__ CuView(T* dataOnHost, size_t numberOfElements)
+    __host__ __device__ GpuView(T* dataOnHost, size_t numberOfElements)
        : m_dataPtr(dataOnHost), m_numberOfElements(numberOfElements)
    {
    }

    /**
-     * @brief ~CuView calls cudaFree
+     * @brief ~GpuView calls cudaFree
     */
-    ~CuView() = default;
+    ~GpuView() = default;

    /**
     * @return the raw pointer to the GPU data
@ -128,7 +128,7 @@ public:
    }

    /**
-     * @return fetch the first element in a CuView
+     * @return fetch the first element in a GpuView
     */
    __host__ __device__ T& front()
    {
@ -139,7 +139,7 @@ public:
    }

    /**
-     * @return fetch the last element in a CuView
+     * @return fetch the last element in a GpuView
     */
    __host__ __device__ T& back()
    {
@ -150,7 +150,7 @@ public:
    }

    /**
-     * @return fetch the first element in a CuView
+     * @return fetch the first element in a GpuView
     */
    __host__ __device__ T front() const
    {
@ -161,7 +161,7 @@ public:
    }

    /**
-     * @return fetch the last element in a CuView
+     * @return fetch the last element in a GpuView
     */
    __host__ __device__ T back() const
    {
@ -220,7 +220,7 @@ public:
     * @return an std::vector containing the elements copied from the GPU.
     */
    std::vector<T> asStdVector() const;
-    /// @brief Iterator class to make CuViews more similar to std containers
+    /// @brief Iterator class to make GpuViews more similar to std containers
    class iterator {
    public:
        // Iterator typedefs
@ -363,7 +363,7 @@ private:

    /// @brief Helper function to assert if another view has the same size
    /// @param other view
-    __host__ __device__ void assertSameSize(const CuView<T>& other) const
+    __host__ __device__ void assertSameSize(const GpuView<T>& other) const
    {
        assertSameSize(other.m_numberOfElements);
    }
@ -410,6 +410,6 @@ private:
    }
 };

-} // namespace Opm::cuistl
+} // namespace Opm::gpuistl

 #endif
--- a/opm/simulators/linalg/gpuistl/OpmCuILU0.cpp
+++ b/opm/simulators/linalg/gpuistl/OpmCuILU0.cpp
@ -26,18 +26,18 @@
 #include <opm/common/ErrorMacros.hpp>
 #include <opm/common/TimingMacros.hpp>
 #include <opm/simulators/linalg/GraphColoring.hpp>
-#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
-#include <opm/simulators/linalg/cuistl/CuVector.hpp>
-#include <opm/simulators/linalg/cuistl/OpmCuILU0.hpp>
-#include <opm/simulators/linalg/cuistl/detail/autotuner.hpp>
-#include <opm/simulators/linalg/cuistl/detail/coloringAndReorderingUtils.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
-#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/ILU0Kernels.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
+#include <opm/simulators/linalg/gpuistl/OpmCuILU0.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/autotuner.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/coloringAndReorderingUtils.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp>
 #include <opm/simulators/linalg/matrixblock.hh>
 #include <string>
 #include <tuple>
 #include <utility>
-namespace Opm::cuistl
+namespace Opm::gpuistl
 {

 template <class M, class X, class Y, int l>
@ -46,7 +46,7 @@ OpmCuILU0<M, X, Y, l>::OpmCuILU0(const M& A, bool splitMatrix, bool tuneKernels)
    , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER))
    , m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets))
    , m_naturalToReordered(detail::createNaturalToReordered(m_levelSets))
-    , m_gpuMatrix(CuSparseMatrix<field_type>::fromMatrix(m_cpuMatrix, true))
+    , m_gpuMatrix(GpuSparseMatrix<field_type>::fromMatrix(m_cpuMatrix, true))
    , m_gpuMatrixReorderedLower(nullptr)
    , m_gpuMatrixReorderedUpper(nullptr)
    , m_gpuNaturalToReorder(m_naturalToReordered)
@ -72,12 +72,12 @@ OpmCuILU0<M, X, Y, l>::OpmCuILU0(const M& A, bool splitMatrix, bool tuneKernels)
                             m_gpuMatrix.nonzeroes(),
                             A.nonzeroes()));
    if (m_splitMatrix) {
-        m_gpuMatrixReorderedDiag.emplace(CuVector<field_type>(blocksize_ * blocksize_ * m_cpuMatrix.N()));
+        m_gpuMatrixReorderedDiag.emplace(GpuVector<field_type>(blocksize_ * blocksize_ * m_cpuMatrix.N()));
        std::tie(m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper)
-            = detail::extractLowerAndUpperMatrices<M, field_type, CuSparseMatrix<field_type>>(m_cpuMatrix,
+            = detail::extractLowerAndUpperMatrices<M, field_type, GpuSparseMatrix<field_type>>(m_cpuMatrix,
                                                                                              m_reorderedToNatural);
    } else {
-        m_gpuReorderedLU = detail::createReorderedMatrix<M, field_type, CuSparseMatrix<field_type>>(
+        m_gpuReorderedLU = detail::createReorderedMatrix<M, field_type, GpuSparseMatrix<field_type>>(
            m_cpuMatrix, m_reorderedToNatural);
    }
    LUFactorizeAndMoveData(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
@ -272,8 +272,8 @@ OpmCuILU0<M, X, Y, l>::tuneThreadBlockSizes()
        = detail::tuneThreadBlockSize(tuneFactorizationThreadBlockSizeInUpdate, "Kernel computing ILU0 factorization");

    // tune the thread-block size of the apply
-    CuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
-    CuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
+    GpuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
+    GpuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
    tmpD = 1;

    auto tuneLowerSolveThreadBlockSizeInApply = [this, &tmpV, &tmpD](int lowerSolveThreadBlockSize) {
@ -289,14 +289,14 @@ OpmCuILU0<M, X, Y, l>::tuneThreadBlockSizes()
        tuneUpperSolveThreadBlockSizeInApply, "Kernel computing an upper triangular solve for a level set");
 }

-} // namespace Opm::cuistl
+} // namespace Opm::gpuistl
 #define INSTANTIATE_CUDILU_DUNE(realtype, blockdim)                                                                    \
-    template class ::Opm::cuistl::OpmCuILU0<Dune::BCRSMatrix<Dune::FieldMatrix<realtype, blockdim, blockdim>>,         \
-                                            ::Opm::cuistl::CuVector<realtype>,                                         \
-                                            ::Opm::cuistl::CuVector<realtype>>;                                        \
-    template class ::Opm::cuistl::OpmCuILU0<Dune::BCRSMatrix<Opm::MatrixBlock<realtype, blockdim, blockdim>>,          \
-                                            ::Opm::cuistl::CuVector<realtype>,                                         \
-                                            ::Opm::cuistl::CuVector<realtype>>
+    template class ::Opm::gpuistl::OpmCuILU0<Dune::BCRSMatrix<Dune::FieldMatrix<realtype, blockdim, blockdim>>,         \
+                                            ::Opm::gpuistl::GpuVector<realtype>,                                         \
+                                            ::Opm::gpuistl::GpuVector<realtype>>;                                        \
+    template class ::Opm::gpuistl::OpmCuILU0<Dune::BCRSMatrix<Opm::MatrixBlock<realtype, blockdim, blockdim>>,          \
+                                            ::Opm::gpuistl::GpuVector<realtype>,                                         \
+                                            ::Opm::gpuistl::GpuVector<realtype>>

 INSTANTIATE_CUDILU_DUNE(double, 1);
 INSTANTIATE_CUDILU_DUNE(double, 2);
--- a/opm/simulators/linalg/gpuistl/OpmCuILU0.hpp
+++ b/opm/simulators/linalg/gpuistl/OpmCuILU0.hpp
@ -22,14 +22,14 @@
 #include <memory>
 #include <opm/grid/utility/SparseTable.hpp>
 #include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
-#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
-#include <opm/simulators/linalg/cuistl/CuVector.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
 #include <optional>
 #include <type_traits>
 #include <vector>


-namespace Opm::cuistl
+namespace Opm::gpuistl
 {
 //! \brief ILU0 preconditioner on the GPU.
 //!
@ -39,7 +39,7 @@ namespace Opm::cuistl
 //! \tparam l Ignored. Just there to have the same number of template arguments
 //!    as other preconditioners.
 //!
-//! \note We assume X and Y are both CuVector<real_type>, but we leave them as template
+//! \note We assume X and Y are both GpuVector<real_type>, but we leave them as template
 //! arguments in case of future additions.
 template <class M, class X, class Y, int l = 1>
 class OpmCuILU0 : public Dune::PreconditionerWithUpdate<X, Y>
@ -54,7 +54,7 @@ public:
    //! \brief The field type of the preconditioner.
    using field_type = typename X::field_type;
    //! \brief The GPU matrix type
-    using CuMat = CuSparseMatrix<field_type>;
+    using CuMat = GpuSparseMatrix<field_type>;

    //! \brief Constructor.
    //!
@ -126,13 +126,13 @@ private:
    std::unique_ptr<CuMat> m_gpuMatrixReorderedLower;
    std::unique_ptr<CuMat> m_gpuMatrixReorderedUpper;
    //! \brief If matrix splitting is enabled, we also store the diagonal separately
-    std::optional<CuVector<field_type>> m_gpuMatrixReorderedDiag;
+    std::optional<GpuVector<field_type>> m_gpuMatrixReorderedDiag;
    //! row conversion from natural to reordered matrix indices stored on the GPU
-    CuVector<int> m_gpuNaturalToReorder;
+    GpuVector<int> m_gpuNaturalToReorder;
    //! row conversion from reordered to natural matrix indices stored on the GPU
-    CuVector<int> m_gpuReorderToNatural;
+    GpuVector<int> m_gpuReorderToNatural;
    //! \brief Stores the inverted diagonal that we use in ILU0
-    CuVector<field_type> m_gpuDInv;
+    GpuVector<field_type> m_gpuDInv;
    //! \brief Bool storing whether or not we should store matrices in a split format
    bool m_splitMatrix;
    //! \brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards
@ -144,6 +144,6 @@ private:
    int m_moveThreadBlockSize = -1;
    int m_ILU0FactorizationThreadBlockSize = -1;
 };
-} // end namespace Opm::cuistl
+} // end namespace Opm::gpuistl

 #endif
--- a/opm/simulators/linalg/gpuistl/PreconditionerAdapter.hpp
+++ b/opm/simulators/linalg/gpuistl/PreconditionerAdapter.hpp
@ -21,12 +21,12 @@
 #include <cusparse.h>
 #include <dune/istl/preconditioner.hh>
 #include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
-#include <opm/simulators/linalg/cuistl/CuVector.hpp>
-#include <opm/simulators/linalg/cuistl/PreconditionerHolder.hpp>
-#include <opm/simulators/linalg/cuistl/detail/preconditioner_should_call_post_pre.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
+#include <opm/simulators/linalg/gpuistl/PreconditionerHolder.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/preconditioner_should_call_post_pre.hpp>


-namespace Opm::cuistl
+namespace Opm::gpuistl
 {
 //!\brief Makes a CUDA preconditioner available to a CPU simulator.
 //!
@ -35,11 +35,11 @@ namespace Opm::cuistl
 //!
 //! \tparam X the domain type (should be on the CPU). Typicall a Dune::BlockVector
 //! \tparam Y the range type (should be on the CPU). Typicall a Dune::BlockVector
-//! \tparam CudaPreconditionerType the preconditioner taking CuVector<real_type> as arguments to apply
+//! \tparam CudaPreconditionerType the preconditioner taking GpuVector<real_type> as arguments to apply
 template <class X, class Y, class CudaPreconditionerType>
 class PreconditionerAdapter
    : public Dune::PreconditionerWithUpdate<X, Y>,
-      public PreconditionerHolder<CuVector<typename X::field_type>, CuVector<typename Y::field_type>>
+      public PreconditionerHolder<GpuVector<typename X::field_type>, GpuVector<typename Y::field_type>>
 {
 public:
    //! \brief The domain type of the preconditioner.
@ -77,8 +77,8 @@ public:
    virtual void apply(X& v, const Y& d) override
    {
        if (!m_inputBuffer) {
-            m_inputBuffer.reset(new CuVector<field_type>(v.dim()));
-            m_outputBuffer.reset(new CuVector<field_type>(v.dim()));
+            m_inputBuffer.reset(new GpuVector<field_type>(v.dim()));
+            m_outputBuffer.reset(new GpuVector<field_type>(v.dim()));
        }
        m_inputBuffer->copyFromHost(d);
        m_underlyingPreconditioner->apply(*m_outputBuffer, *m_inputBuffer);
@ -117,7 +117,7 @@ public:
        return detail::shouldCallPreconditionerPre<CudaPreconditionerType>();
    }

-    virtual std::shared_ptr<Dune::PreconditionerWithUpdate<CuVector<field_type>, CuVector<field_type>>>
+    virtual std::shared_ptr<Dune::PreconditionerWithUpdate<GpuVector<field_type>, GpuVector<field_type>>>
    getUnderlyingPreconditioner() override
    {
        return m_underlyingPreconditioner;
@ -131,9 +131,9 @@ private:
    //! \brief the underlying preconditioner to use
    std::shared_ptr<CudaPreconditionerType> m_underlyingPreconditioner;

-    std::unique_ptr<CuVector<field_type>> m_inputBuffer;
-    std::unique_ptr<CuVector<field_type>> m_outputBuffer;
+    std::unique_ptr<GpuVector<field_type>> m_inputBuffer;
+    std::unique_ptr<GpuVector<field_type>> m_outputBuffer;
 };
-} // end namespace Opm::cuistl
+} // end namespace Opm::gpuistl

 #endif
--- a/opm/simulators/linalg/gpuistl/PreconditionerConvertFieldTypeAdapter.hpp
+++ b/opm/simulators/linalg/gpuistl/PreconditionerConvertFieldTypeAdapter.hpp
@ -22,16 +22,16 @@
 #include <dune/istl/bcrsmatrix.hh>
 #include <dune/istl/preconditioner.hh>
 #include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
-#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
-#include <opm/simulators/linalg/cuistl/detail/CuMatrixDescription.hpp>
-#include <opm/simulators/linalg/cuistl/detail/CuSparseHandle.hpp>
-#include <opm/simulators/linalg/cuistl/detail/CuSparseResource.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_constants.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_safe_call.hpp>
-#include <opm/simulators/linalg/cuistl/detail/preconditioner_should_call_post_pre.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/CuMatrixDescription.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/CuSparseHandle.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/CuSparseResource.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/cusparse_constants.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/cusparse_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/preconditioner_should_call_post_pre.hpp>


-namespace Opm::cuistl
+namespace Opm::gpuistl
 {
 //! \brief Converts the field type (eg. double to float) to benchmark single precision preconditioners
 //!
@ -50,7 +50,7 @@ namespace Opm::cuistl
 //!
 //! To use this, use something like the following code:
 //! \code{.cpp}
-//! #include <opm/simulators/linalg/cuistl/PreconditionerConvertFieldTypeAdapter.hpp>
+//! #include <opm/simulators/linalg/gpuistl/PreconditionerConvertFieldTypeAdapter.hpp>
 //! #include <opm/simulators/linalg/ParallelOverlappingILU0.hpp>
 //!
 //! using XDouble = Dune::BlockVector<Dune::FieldVector<double, 2>>;
@ -64,7 +64,7 @@ namespace Opm::cuistl
 //! void applyILU0AsFloat(const MDouble& matrix, const XDouble& x, XDouble& y) {
 //!
 //!     using FloatILU0 = typename Opm::ParallelOverlappingILU0<MFloat, XFloat, XFloat, ParallelInfo>;
-//!     using DoubleToFloatConverter = typename Opm::cuistl::PreconditionerConvertFieldTypeAdapter<FloatILU0, MDouble,
+//!     using DoubleToFloatConverter = typename Opm::gpuistl::PreconditionerConvertFieldTypeAdapter<FloatILU0, MDouble,
 //!     XDouble, XDouble>;
 //!
 //!     // Note that we do not need to make a new instance for every invocation, this
@ -239,6 +239,6 @@ private:
    //! \brief the underlying preconditioner to use
    std::shared_ptr<CudaPreconditionerType> m_underlyingPreconditioner;
 };
-} // end namespace Opm::cuistl
+} // end namespace Opm::gpuistl

 #endif
--- a/opm/simulators/linalg/gpuistl/PreconditionerHolder.hpp
+++ b/opm/simulators/linalg/gpuistl/PreconditionerHolder.hpp
@ -21,7 +21,7 @@
 #include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>


-namespace Opm::cuistl
+namespace Opm::gpuistl
 {
 //! \brief Common interface for adapters that hold preconditioners.
 //!
@ -38,6 +38,6 @@ public:
     */
    virtual std::shared_ptr<Dune::PreconditionerWithUpdate<X, Y>> getUnderlyingPreconditioner() = 0;
 };
-} // end namespace Opm::cuistl
+} // end namespace Opm::gpuistl

 #endif
--- a/opm/simulators/linalg/gpuistl/SolverAdapter.hpp
+++ b/opm/simulators/linalg/gpuistl/SolverAdapter.hpp
@ -26,12 +26,12 @@
 #include <dune/istl/schwarz.hh>
 #include <dune/istl/solver.hh>
 #include <opm/common/ErrorMacros.hpp>
-#include <opm/simulators/linalg/cuistl/CuBlockPreconditioner.hpp>
-#include <opm/simulators/linalg/cuistl/CuOwnerOverlapCopy.hpp>
-#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
-#include <opm/simulators/linalg/cuistl/CuVector.hpp>
-#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
-#include <opm/simulators/linalg/cuistl/detail/has_function.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuBlockPreconditioner.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuOwnerOverlapCopy.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
+#include <opm/simulators/linalg/gpuistl/PreconditionerAdapter.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/has_function.hpp>

 #ifdef OPEN_MPI
 #if OPEN_MPI
@ -39,7 +39,7 @@
 #endif
 #endif

-namespace Opm::cuistl
+namespace Opm::gpuistl
 {
 //! @brief Wraps a CUDA solver to work with CPU data.
 //!
@ -56,7 +56,7 @@ public:
    using typename Dune::IterativeSolver<X, X>::real_type;
    using typename Dune::IterativeSolver<X, X>::scalar_real_type;
    static constexpr auto block_size = domain_type::block_type::dimension;
-    using XGPU = Opm::cuistl::CuVector<real_type>;
+    using XGPU = Opm::gpuistl::GpuVector<real_type>;

    // TODO: Use a std::forward
    SolverAdapter(Operator& op,
@ -67,7 +67,7 @@ public:
                  int verbose)
        : Dune::IterativeSolver<X, X>(op, sp, *prec, reduction, maxit, verbose)
        , m_opOnCPUWithMatrix(op)
-        , m_matrix(CuSparseMatrix<real_type>::fromMatrix(op.getmat()))
+        , m_matrix(GpuSparseMatrix<real_type>::fromMatrix(op.getmat()))
        , m_underlyingSolver(constructSolver(prec, reduction, maxit, verbose))
    {
    }
@ -116,7 +116,7 @@ public:

 private:
    Operator& m_opOnCPUWithMatrix;
-    CuSparseMatrix<real_type> m_matrix;
+    GpuSparseMatrix<real_type> m_matrix;

    UnderlyingSolver<XGPU> m_underlyingSolver;

@ -148,8 +148,8 @@ private:
            if (!precAsHolder) {
                OPM_THROW(std::invalid_argument,
                          "The preconditioner needs to be a CUDA preconditioner (eg. CuILU0) wrapped in a "
-                          "Opm::cuistl::PreconditionerAdapter wrapped in a "
-                          "Opm::cuistl::CuBlockPreconditioner. If you are unsure what this means, set "
+                          "Opm::gpuistl::PreconditionerAdapter wrapped in a "
+                          "Opm::gpuistl::GpuBlockPreconditioner. If you are unsure what this means, set "
                          "preconditioner to 'CUILU0'"); // TODO: Suggest a better preconditioner
            }

@ -159,8 +159,8 @@ private:
            if (!preconditionerAdapterAsHolder) {
                OPM_THROW(std::invalid_argument,
                          "The preconditioner needs to be a CUDA preconditioner (eg. CuILU0) wrapped in a "
-                          "Opm::cuistl::PreconditionerAdapter wrapped in a "
-                          "Opm::cuistl::CuBlockPreconditioner. If you are unsure what this means, set "
+                          "Opm::gpuistl::PreconditionerAdapter wrapped in a "
+                          "Opm::gpuistl::GpuBlockPreconditioner. If you are unsure what this means, set "
                          "preconditioner to 'CUILU0'"); // TODO: Suggest a better preconditioner
            }
            // We need to get the underlying preconditioner:
@ -183,20 +183,20 @@ private:


            // TODO add typename Operator communication type as a named type with using
-            std::shared_ptr<Opm::cuistl::GPUSender<real_type, typename Operator::communication_type>> gpuComm;
+            std::shared_ptr<Opm::gpuistl::GPUSender<real_type, typename Operator::communication_type>> gpuComm;
            if (mpiSupportsCudaAwareAtCompileTime && mpiSupportsCudaAwareAtRunTime){
-                gpuComm = std::make_shared<Opm::cuistl::GPUAwareMPISender<real_type, block_size, typename Operator::communication_type>>(communication);
+                gpuComm = std::make_shared<Opm::gpuistl::GPUAwareMPISender<real_type, block_size, typename Operator::communication_type>>(communication);
            }
            else{
-                gpuComm = std::make_shared<Opm::cuistl::GPUObliviousMPISender<real_type, block_size, typename Operator::communication_type>>(communication);
+                gpuComm = std::make_shared<Opm::gpuistl::GPUObliviousMPISender<real_type, block_size, typename Operator::communication_type>>(communication);
            }

-            using CudaCommunication = CuOwnerOverlapCopy<real_type, block_size, typename Operator::communication_type>;
+            using CudaCommunication = GpuOwnerOverlapCopy<real_type, block_size, typename Operator::communication_type>;
            using SchwarzOperator
-                = Dune::OverlappingSchwarzOperator<CuSparseMatrix<real_type>, XGPU, XGPU, CudaCommunication>;
+                = Dune::OverlappingSchwarzOperator<GpuSparseMatrix<real_type>, XGPU, XGPU, CudaCommunication>;
            auto cudaCommunication = std::make_shared<CudaCommunication>(gpuComm);

-            auto mpiPreconditioner = std::make_shared<CuBlockPreconditioner<XGPU, XGPU, CudaCommunication>>(
+            auto mpiPreconditioner = std::make_shared<GpuBlockPreconditioner<XGPU, XGPU, CudaCommunication>>(
                preconditionerReallyOnGPU, cudaCommunication);

            auto scalarProduct = std::make_shared<Dune::ParallelScalarProduct<XGPU, CudaCommunication>>(
@ -206,8 +206,8 @@ private:
            // NOTE: Ownsership of cudaCommunication is handled by mpiPreconditioner. However, just to make sure we
            // remember
            //       this, we add this check. So remember that we hold one count in this scope, and one in the
-            //       CuBlockPreconditioner instance. We accomedate for the fact that it could be passed around in
-            //       CuBlockPreconditioner, hence we do not test for != 2
+            //       GpuBlockPreconditioner instance. We accomedate for the fact that it could be passed around in
+            //       GpuBlockPreconditioner, hence we do not test for != 2
            OPM_ERROR_IF(cudaCommunication.use_count() < 2, "Internal error. Shared pointer not owned properly.");
            auto overlappingCudaOperator = std::make_shared<SchwarzOperator>(m_matrix, *cudaCommunication);

@ -222,12 +222,12 @@ private:
            if (!precAsHolder) {
                OPM_THROW(std::invalid_argument,
                          "The preconditioner needs to be a CUDA preconditioner wrapped in a "
-                          "Opm::cuistl::PreconditionerHolder (eg. CuILU0).");
+                          "Opm::gpuistl::PreconditionerHolder (eg. CuILU0).");
            }
            auto preconditionerOnGPU = precAsHolder->getUnderlyingPreconditioner();

            auto matrixOperator
-                = std::make_shared<Dune::MatrixAdapter<CuSparseMatrix<real_type>, XGPU, XGPU>>(m_matrix);
+                = std::make_shared<Dune::MatrixAdapter<GpuSparseMatrix<real_type>, XGPU, XGPU>>(m_matrix);
            auto scalarProduct = std::make_shared<Dune::SeqScalarProduct<XGPU>>();
            return UnderlyingSolver<XGPU>(
                matrixOperator, scalarProduct, preconditionerOnGPU, reduction, maxit, verbose);
@ -237,6 +237,6 @@ private:
    std::unique_ptr<XGPU> m_inputBuffer;
    std::unique_ptr<XGPU> m_outputBuffer;
 };
-} // namespace Opm::cuistl
+} // namespace Opm::gpuistl

 #endif
--- a/opm/simulators/linalg/gpuistl/detail/CuBlasHandle.cpp
+++ b/opm/simulators/linalg/gpuistl/detail/CuBlasHandle.cpp
@ -17,9 +17,9 @@
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include <cublas_v2.h>
-#include <opm/simulators/linalg/cuistl/detail/CuBlasHandle.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cublas_safe_call.hpp>
-namespace Opm::cuistl::detail
+#include <opm/simulators/linalg/gpuistl/detail/CuBlasHandle.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/cublas_safe_call.hpp>
+namespace Opm::gpuistl::detail
 {


@ -46,4 +46,4 @@ CuBlasHandle::getInstance()
    return instance;
 }

-} // namespace Opm::cuistl::detail
+} // namespace Opm::gpuistl::detail
--- a/opm/simulators/linalg/gpuistl/detail/CuBlasHandle.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/CuBlasHandle.hpp
@ -21,7 +21,7 @@
 #include <cublas_v2.h>
 #include <memory>

-namespace Opm::cuistl::detail
+namespace Opm::gpuistl::detail
 {

 /**
@ -29,9 +29,9 @@ namespace Opm::cuistl::detail
 *
 * Example use:
 * @code{.cpp}
- * #include <opm/simulators/linalg/cuistl/detail/CuBlasHandle.hpp>
+ * #include <opm/simulators/linalg/gpuistl/detail/CuBlasHandle.hpp>
 * void someFunction() {
- *     auto& cublasHandle = ::Opm::cuistl::detail::CuBlasHandle::getInstance();
+ *     auto& cublasHandle = ::Opm::gpuistl::detail::CuBlasHandle::getInstance();
 *     int cuBlasVersion = -1;
 *     OPM_CUBLAS_SAFE_CALL(cublasGetVersion(cublasHandle.get(), &cuBlasVersion));
 * }
@ -64,5 +64,5 @@ private:
    CuBlasHandle();
    cublasHandle_t m_handle;
 };
-} // namespace Opm::cuistl::detail
+} // namespace Opm::gpuistl::detail
 #endif // OPM_CUBLASHANDLE_HPP
--- a/opm/simulators/linalg/gpuistl/detail/CuMatrixDescription.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/CuMatrixDescription.hpp
@ -18,30 +18,30 @@
 */
 #ifndef CU_MATRIX_DESCRIPTION_HPP
 #define CU_MATRIX_DESCRIPTION_HPP
-#include <opm/simulators/linalg/cuistl/detail/CuSparseResource.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/CuSparseResource.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/cusparse_safe_call.hpp>

-namespace Opm::cuistl::detail
+namespace Opm::gpuistl::detail
 {

 /**
- * CuSparseMatrixDescription holder. This is internal information needed for most calls to the CuSparse API.
+ * GpuSparseMatrixDescription holder. This is internal information needed for most calls to the CuSparse API.
 */
-using CuSparseMatrixDescription = CuSparseResource<cusparseMatDescr_t>;
+using GpuSparseMatrixDescription = CuSparseResource<cusparseMatDescr_t>;

 /**
- * Pointer to CuSparseMatrixDescription holder. This is internal information needed for most calls to the CuSparse API.
+ * Pointer to GpuSparseMatrixDescription holder. This is internal information needed for most calls to the CuSparse API.
 */
-using CuSparseMatrixDescriptionPtr = std::shared_ptr<CuSparseResource<cusparseMatDescr_t>>;
+using GpuSparseMatrixDescriptionPtr = std::shared_ptr<CuSparseResource<cusparseMatDescr_t>>;

 /**
 * @brief createMatrixDescription creates a default matrix description
 * @return a matrix description to a general sparse matrix with zero based indexing.
 */
-inline CuSparseMatrixDescriptionPtr
+inline GpuSparseMatrixDescriptionPtr
 createMatrixDescription()
 {
-    auto description = std::make_shared<CuSparseMatrixDescription>();
+    auto description = std::make_shared<GpuSparseMatrixDescription>();

    // Note: We always want to use zero base indexing.
    OPM_CUSPARSE_SAFE_CALL(cusparseSetMatType(description->get(), CUSPARSE_MATRIX_TYPE_GENERAL));
@ -52,11 +52,11 @@ createMatrixDescription()

 /**
 * @brief createLowerDiagonalDescription creates a lower diagonal matrix description
- * @return a lower diagonal matrix description overlapped with options from ::Opm::cuistl::detail::createMatrixDescription()
+ * @return a lower diagonal matrix description overlapped with options from ::Opm::gpuistl::detail::createMatrixDescription()
 *
 * @note This will assume it has a unit diagonal
 */
-inline CuSparseMatrixDescriptionPtr
+inline GpuSparseMatrixDescriptionPtr
 createLowerDiagonalDescription()
 {
    auto description = createMatrixDescription();
@ -67,11 +67,11 @@ createLowerDiagonalDescription()

 /**
 * @brief createUpperDiagonalDescription creates an upper diagonal matrix description
- * @return an upper diagonal matrix description overlapped with options from ::Opm::cuistl::detail::createMatrixDescription()
+ * @return an upper diagonal matrix description overlapped with options from ::Opm::gpuistl::detail::createMatrixDescription()
 *
 * @note This will assume it has a non-unit diagonal.
 */
-inline CuSparseMatrixDescriptionPtr
+inline GpuSparseMatrixDescriptionPtr
 createUpperDiagonalDescription()
 {
    auto description = createMatrixDescription();
@ -81,6 +81,6 @@ createUpperDiagonalDescription()
    return description;
 }

-} // namespace Opm::cuistl::detail
+} // namespace Opm::gpuistl::detail

 #endif // CU_MATRIX_DESCRIPTION_HPP
--- a/opm/simulators/linalg/gpuistl/detail/CuSparseHandle.cpp
+++ b/opm/simulators/linalg/gpuistl/detail/CuSparseHandle.cpp
@ -16,9 +16,9 @@
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
-#include <opm/simulators/linalg/cuistl/detail/CuSparseHandle.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_safe_call.hpp>
-namespace Opm::cuistl::detail
+#include <opm/simulators/linalg/gpuistl/detail/CuSparseHandle.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/cusparse_safe_call.hpp>
+namespace Opm::gpuistl::detail
 {


@ -46,4 +46,4 @@ CuSparseHandle::getInstance()
    return instance;
 }

-} // namespace Opm::cuistl::detail
+} // namespace Opm::gpuistl::detail
--- a/opm/simulators/linalg/gpuistl/detail/CuSparseHandle.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/CuSparseHandle.hpp
@ -21,7 +21,7 @@
 #include <cusparse.h>
 #include <memory>

-namespace Opm::cuistl::detail
+namespace Opm::gpuistl::detail
 {

 /**
@ -29,9 +29,9 @@ namespace Opm::cuistl::detail
 *
 * Example use:
 * @code{.cpp}
- * #include <opm/simulators/linalg/cuistl/detail/CuSparseHandle.hpp>
+ * #include <opm/simulators/linalg/gpuistl/detail/CuSparseHandle.hpp>
 * void someFunction() {
- *     auto& cuSparseHandle = ::Opm::cuistl::detail::CuSparseHandle::getInstance();
+ *     auto& cuSparseHandle = ::Opm::gpuistl::detail::CuSparseHandle::getInstance();
 *     int cuSparseVersion = -1;
 *     OPM_CUSPARSE_SAFE_CALL(cusparseGetVersion(cuSparseHandle.get(), &cuSparseVersion));
 * }
@ -63,5 +63,5 @@ private:
    CuSparseHandle();
    cusparseHandle_t m_handle;
 };
-} // namespace Opm::cuistl::detail
+} // namespace Opm::gpuistl::detail
 #endif // OPM_CUSPARSEHANDLE_HPP
--- a/opm/simulators/linalg/gpuistl/detail/CuSparseResource.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/CuSparseResource.hpp
@ -23,7 +23,7 @@
 #include <memory>
 #include <type_traits>

-namespace Opm::cuistl::detail
+namespace Opm::gpuistl::detail
 {

 /**
@ -43,7 +43,7 @@ namespace Opm::cuistl::detail
 *
 * Example usage:
 * @code{.cpp}
- * #include <opm/simulator/linalg/cuistl/detail/CuSparseResource.hpp>
+ * #include <opm/simulator/linalg/gpuistl/detail/CuSparseResource.hpp>
 *
 * void someFunction() {
 *     auto resource = CuSparseResource<cuSparseMatDescr_t>();
@ -94,6 +94,6 @@ private:
    DeleterType m_deleter;
 };

-} // namespace Opm::cuistl::impl
-#include <opm/simulators/linalg/cuistl/detail/CuSparseResource_impl.hpp>
+} // namespace Opm::gpuistl::impl
+#include <opm/simulators/linalg/gpuistl/detail/CuSparseResource_impl.hpp>
 #endif // CUSPARSERESOURCE_HPP
--- a/opm/simulators/linalg/gpuistl/detail/CuSparseResource_impl.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/CuSparseResource_impl.hpp
@ -18,9 +18,9 @@
 */
 #include <exception>
 #include <opm/common/ErrorMacros.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/cusparse_safe_call.hpp>

-namespace Opm::cuistl::detail
+namespace Opm::gpuistl::detail
 {

 namespace
@ -100,4 +100,4 @@ CuSparseResource<T>::~CuSparseResource()
    // proper name of the function being called.
    OPM_CUSPARSE_WARN_IF_ERROR(m_deleter(m_resource));
 }
-} // namespace Opm::cuistl::detail
+} // namespace Opm::gpuistl::detail
--- a/opm/simulators/linalg/gpuistl/detail/autotuner.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/autotuner.hpp
@ -21,11 +21,11 @@
 #include <limits>
 #include <opm/common/ErrorMacros.hpp>
 #include <opm/common/OpmLog/OpmLog.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
 #include <string>
 #include <utility>

-namespace Opm::cuistl::detail
+namespace Opm::gpuistl::detail
 {

 /// @brief Function that tests the best thread block size, assumes the provided function depends on threadblock-size
@ -47,7 +47,7 @@ tuneThreadBlockSize(func& f, std::string descriptionOfFunction)

    // create the events
    for (int i = 0; i < runs + 1; ++i) {
-        OPM_CUDA_SAFE_CALL(cudaEventCreate(&events[i]));
+        OPM_GPU_SAFE_CALL(cudaEventCreate(&events[i]));
    }

    // Initialize helper variables
@ -59,21 +59,21 @@ tuneThreadBlockSize(func& f, std::string descriptionOfFunction)
    for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval) {

        // record a first event, and then an event after each kernel
-        OPM_CUDA_SAFE_CALL(cudaEventRecord(events[0]));
+        OPM_GPU_SAFE_CALL(cudaEventRecord(events[0]));
        for (int i = 0; i < runs; ++i) {
            f(thrBlockSize); // runs an arbitrary function with the provided arguments
-            OPM_CUDA_SAFE_CALL(cudaEventRecord(events[i + 1]));
+            OPM_GPU_SAFE_CALL(cudaEventRecord(events[i + 1]));
        }

        // make suret he runs are over
-        OPM_CUDA_SAFE_CALL(cudaEventSynchronize(events[runs]));
+        OPM_GPU_SAFE_CALL(cudaEventSynchronize(events[runs]));

        // kernel launch was valid
        if (cudaSuccess == cudaGetLastError()) {
            // check if we beat the record for the fastest kernel
            for (int i = 0; i < runs; ++i) {
                float candidateBlockSizeTime;
-                OPM_CUDA_SAFE_CALL(cudaEventElapsedTime(&candidateBlockSizeTime, events[i], events[i + 1]));
+                OPM_GPU_SAFE_CALL(cudaEventElapsedTime(&candidateBlockSizeTime, events[i], events[i + 1]));
                if (candidateBlockSizeTime < bestTime) { // checks if this configuration beat the current best
                    bestTime = candidateBlockSizeTime;
                    bestBlockSize = thrBlockSize;
@ -88,6 +88,6 @@ tuneThreadBlockSize(func& f, std::string descriptionOfFunction)
    return bestBlockSize;
 }

-} // end namespace Opm::cuistl::detail
+} // end namespace Opm::gpuistl::detail

 #endif
--- a/opm/simulators/linalg/gpuistl/detail/coloringAndReorderingUtils.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/coloringAndReorderingUtils.hpp
@ -23,24 +23,24 @@
 #include <memory>
 #include <opm/common/ErrorMacros.hpp>
 #include <opm/grid/utility/SparseTable.hpp>
-#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/safe_conversion.hpp>
 #include <tuple>
 #include <vector>
 /*
 This file contains a collection of utility functions used in the GPU implementation of ILU and DILU
 The functions deal with creating the mappings between reordered and natural indices, as well as
-extracting sparsity structures from dune matrices and creating cusparsematrix indices
+extracting sparsity structures from dune matrices and creating gpusparsematrix indices
 */
-namespace Opm::cuistl::detail
+namespace Opm::gpuistl::detail
 {
 inline std::vector<int>
 createReorderedToNatural(const Opm::SparseTable<size_t>& levelSets)
 {
-    auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
+    auto res = std::vector<int>(Opm::gpuistl::detail::to_size_t(levelSets.dataSize()));
    int globCnt = 0;
    for (auto row : levelSets) {
        for (auto col : row) {
-            OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(),
+            OPM_ERROR_IF(Opm::gpuistl::detail::to_size_t(globCnt) >= res.size(),
                         fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size()));
            res[globCnt++] = static_cast<int>(col);
        }
@ -51,11 +51,11 @@ createReorderedToNatural(const Opm::SparseTable<size_t>& levelSets)
 inline std::vector<int>
 createNaturalToReordered(const Opm::SparseTable<size_t>& levelSets)
 {
-    auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
+    auto res = std::vector<int>(Opm::gpuistl::detail::to_size_t(levelSets.dataSize()));
    int globCnt = 0;
    for (auto row : levelSets) {
        for (auto col : row) {
-            OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(),
+            OPM_ERROR_IF(Opm::gpuistl::detail::to_size_t(globCnt) >= res.size(),
                         fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size()));
            res[col] = globCnt++;
        }
@ -105,6 +105,6 @@ extractLowerAndUpperMatrices(const M& naturalMatrix, const std::vector<int>& reo
    return {std::unique_ptr<GPUM>(new auto(GPUM::fromMatrix(reorderedLower, true))),
            std::unique_ptr<GPUM>(new auto(GPUM::fromMatrix(reorderedUpper, true)))};
 }
-} // namespace Opm::cuistl::detail
+} // namespace Opm::gpuistl::detail

 #endif
--- a/opm/simulators/linalg/gpuistl/detail/cublas_safe_call.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/cublas_safe_call.hpp
@ -28,7 +28,7 @@



-namespace Opm::cuistl::detail
+namespace Opm::gpuistl::detail
 {

 #define CHECK_CUBLAS_ERROR_TYPE(code, x)                                                                               \
@ -108,7 +108,7 @@ getCublasErrorMessage(cublasStatus_t error,
 *
 * Example usage:
 * @code{.cpp}
- * #include <opm/simulators/linalg/cuistl/detail/cublas_safe_call.hpp>
+ * #include <opm/simulators/linalg/gpuistl/detail/cublas_safe_call.hpp>
 * #include <cublas_v2.h>
 *
 * void some_function() {
@ -147,7 +147,7 @@ cublasSafeCall(cublasStatus_t error,
 *
 * Example usage:
 * @code{.cpp}
- * #include <opm/simulators/linalg/cuistl/detail/cublas_safe_call.hpp>
+ * #include <opm/simulators/linalg/gpuistl/detail/cublas_safe_call.hpp>
 * #include <cublas_v2.h>
 *
 * void some_function() {
@ -174,7 +174,7 @@ cublasWarnIfError(cublasStatus_t error,

    return error;
 }
-} // namespace Opm::cuistl::detail
+} // namespace Opm::gpuistl::detail

 /**
 * @brief OPM_CUBLAS_SAFE_CALL checks the return type of the cublas expression (function call) and throws an exception
@ -183,7 +183,7 @@ cublasWarnIfError(cublasStatus_t error,
 * Example usage:
 * @code{.cpp}
 * #include <cublas_v2.h>
- * #include <opm/simulators/linalg/cuistl/detail/cublas_safe_call.hpp>
+ * #include <opm/simulators/linalg/gpuistl/detail/cublas_safe_call.hpp>
 *
 * void some_function() {
 *     cublasHandle_t cublasHandle;
@ -194,7 +194,7 @@ cublasWarnIfError(cublasStatus_t error,
 * @note This should be used for any call to cuBlas unless you have a good reason not to.
 */
 #define OPM_CUBLAS_SAFE_CALL(expression)                                                                               \
-    ::Opm::cuistl::detail::cublasSafeCall(expression, #expression, __FILE__, __func__, __LINE__)
+    ::Opm::gpuistl::detail::cublasSafeCall(expression, #expression, __FILE__, __func__, __LINE__)

 /**
 * @brief OPM_CUBLAS_WARN_IF_ERROR checks the return type of the cublas expression (function call) and issues a warning
@ -203,7 +203,7 @@ cublasWarnIfError(cublasStatus_t error,
 * Example usage:
 * @code{.cpp}
 * #include <cublas_v2.h>
- * #include <opm/simulators/linalg/cuistl/detail/cublas_safe_call.hpp>
+ * #include <opm/simulators/linalg/gpuistl/detail/cublas_safe_call.hpp>
 *
 * void some_function() {
 *     cublasHandle_t cublasHandle;
@ -214,6 +214,6 @@ cublasWarnIfError(cublasStatus_t error,
 * @note Prefer the cublasSafeCall/OPM_CUBLAS_SAFE_CALL counterpart unless you really don't want to throw an exception.
 */
 #define OPM_CUBLAS_WARN_IF_ERROR(expression)                                                                           \
-    ::Opm::cuistl::detail::cublasWarnIfError(expression, #expression, __FILE__, __func__, __LINE__)
+    ::Opm::gpuistl::detail::cublasWarnIfError(expression, #expression, __FILE__, __func__, __LINE__)

 #endif // OPM_CUBLAS_SAFE_CALL_HPP
--- a/opm/simulators/linalg/gpuistl/detail/cublas_wrapper.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/cublas_wrapper.hpp
@ -29,7 +29,7 @@
 #include <cublas_v2.h>
 #include <opm/common/ErrorMacros.hpp>

-namespace Opm::cuistl::detail
+namespace Opm::gpuistl::detail
 {

 inline cublasStatus_t
@ -164,5 +164,5 @@ cublasNrm2([[maybe_unused]] cublasHandle_t handle,
    OPM_THROW(std::runtime_error, "norm2 for integer vectors is not implemented yet.");
 }

-} // namespace Opm::cuistl::detail
+} // namespace Opm::gpuistl::detail
 #endif
--- a/opm/simulators/linalg/gpuistl/detail/cuda_check_last_error.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/cuda_check_last_error.hpp
@ -20,7 +20,7 @@
 #define OPM_CUDA_CHECK_LAST_ERROR_HPP
 #include <cuda_runtime.h>
 #include <fmt/core.h>
-#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>

 /**
 * @brief OPM_CUDA_CHECK_DEVICE_SYNCHRONIZE checks the return type of cudaDeviceSynchronize(),
@ -28,7 +28,7 @@
 *
 * Example usage:
 * @code{.cpp}
- * #include <opm/simulators/linalg/cuistl/detail/cuda_check_last_error.hpp>
+ * #include <opm/simulators/linalg/gpuistl/detail/cuda_check_last_error.hpp>
 *
 * void some_function() {
 *     OPM_CUDA_CHECK_DEVICE_SYNCHRONIZE;
@ -38,7 +38,7 @@
 * @note This can be used to debug the code, or simply make sure that no error has occured.
 * @note This is a rather heavy operation, so prefer to use only in Debug mode (see OPM_CUDA_CHECK_DEVICE_SYNCHRONIZE_IF_DEBUG)
 */
-#define OPM_CUDA_CHECK_DEVICE_SYNCHRONIZE OPM_CUDA_SAFE_CALL(cudaDeviceSynchronize())
+#define OPM_CUDA_CHECK_DEVICE_SYNCHRONIZE OPM_GPU_SAFE_CALL(cudaDeviceSynchronize())

 #ifdef NDEBUG
 #define OPM_CUDA_CHECK_DEVICE_SYNCHRONIZE_IF_DEBUG
@ -50,7 +50,7 @@
 *
 * Example usage:
 * @code{.cpp}
- * #include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
+ * #include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
 *
 * void some_function() {
 *     OPM_CUDA_CHECK_DEVICE_SYNCHRONIZE_IF_DEBUG;
@ -69,7 +69,7 @@
 *
 * Example usage:
 * @code{.cpp}
- * #include <opm/simulators/linalg/cuistl/detail/cuda_check_last_error.hpp>
+ * #include <opm/simulators/linalg/gpuistl/detail/cuda_check_last_error.hpp>
 *
 * void some_function() {
 *     OPM_CUDA_CHECK_LAST_ERROR;
@ -78,7 +78,7 @@
 *
 * @note This can be used to debug the code, or simply make sure that no error has occured.
 */
-#define OPM_CUDA_CHECK_LAST_ERROR OPM_CUDA_SAFE_CALL(cudaGetLastError())
+#define OPM_CUDA_CHECK_LAST_ERROR OPM_GPU_SAFE_CALL(cudaGetLastError())

 #ifdef NDEBUG
 #define OPM_CUDA_CHECK_LAST_ERROR_IF_DEBUG
@ -90,7 +90,7 @@
 *
 * Example usage:
 * @code{.cpp}
- * #include <opm/simulators/linalg/cuistl/detail/cuda_check_last_error.hpp>
+ * #include <opm/simulators/linalg/gpuistl/detail/cuda_check_last_error.hpp>
 *
 * void some_function() {
 *     OPM_CUDA_CHECK_LAST_ERROR_IF_DEBUG;
--- a/opm/simulators/linalg/gpuistl/detail/cusparse_constants.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/cusparse_constants.hpp
@ -19,7 +19,7 @@
 #ifndef CUSPARSE_CONSTANTS_HPP
 #define CUSPARSE_CONSTANTS_HPP
 #include <cusparse.h>
-namespace Opm::cuistl::detail
+namespace Opm::gpuistl::detail
 {
 const constexpr auto CUSPARSE_MATRIX_ORDER = CUSPARSE_DIRECTION_ROW;
 }
--- a/opm/simulators/linalg/gpuistl/detail/cusparse_safe_call.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/cusparse_safe_call.hpp
@ -24,7 +24,7 @@
 #include <opm/common/ErrorMacros.hpp>
 #include <opm/common/OpmLog/OpmLog.hpp>

-namespace Opm::cuistl::detail
+namespace Opm::gpuistl::detail
 {

 #define CHECK_CUSPARSE_ERROR_TYPE(code, x)                                                                             \
@ -93,7 +93,7 @@ getCusparseErrorMessage(cusparseStatus_t error,
 *
 * Example usage:
 * @code{.cpp}
- * #include <opm/simulators/linalg/cuistl/detail/cusparse_safe_call.hpp>
+ * #include <opm/simulators/linalg/gpuistl/detail/cusparse_safe_call.hpp>
 * #include <cublas_v2.h>
 *
 * void some_function() {
@ -133,7 +133,7 @@ cusparseSafeCall(cusparseStatus_t error,
 *
 * Example usage:
 * @code{.cpp}
- * #include <opm/simulators/linalg/cuistl/detail/cusparse_safe_call.hpp>
+ * #include <opm/simulators/linalg/gpuistl/detail/cusparse_safe_call.hpp>
 * #include <cublas_v2.h>
 *
 * void some_function() {
@ -161,7 +161,7 @@ cusparseWarnIfError(cusparseStatus_t error,

    return error;
 }
-} // namespace Opm::cuistl::detail
+} // namespace Opm::gpuistl::detail



@ -171,7 +171,7 @@ cusparseWarnIfError(cusparseStatus_t error,
 *
 * Example usage:
 * @code{.cpp}
- * #include <opm/simulators/linalg/cuistl/detail/cusparse_safe_call.hpp>
+ * #include <opm/simulators/linalg/gpuistl/detail/cusparse_safe_call.hpp>
 * #include <cusparse.h>
 *
 * void some_function() {
@ -183,7 +183,7 @@ cusparseWarnIfError(cusparseStatus_t error,
 * @note This should be used for any call to cuSparse unless you have a good reason not to.
 */
 #define OPM_CUSPARSE_SAFE_CALL(expression)                                                                             \
-    ::Opm::cuistl::detail::cusparseSafeCall(expression, #expression, __FILE__, __func__, __LINE__)
+    ::Opm::gpuistl::detail::cusparseSafeCall(expression, #expression, __FILE__, __func__, __LINE__)

 /**
 * @brief OPM_CUSPARSE_WARN_IF_ERROR checks the return type of the cusparse expression (function call) and issues a
@ -191,7 +191,7 @@ cusparseWarnIfError(cusparseStatus_t error,
 *
 * Example usage:
 * @code{.cpp}
- * #include <opm/simulators/linalg/cuistl/detail/cusparse_safe_call.hpp>
+ * #include <opm/simulators/linalg/gpuistl/detail/cusparse_safe_call.hpp>
 * #include <cusparse.h>
 *
 * void some_function() {
@ -204,5 +204,5 @@ cusparseWarnIfError(cusparseStatus_t error,
 * exception.
 */
 #define OPM_CUSPARSE_WARN_IF_ERROR(expression)                                                                         \
-    ::Opm::cuistl::detail::cusparseWarnIfError(expression, #expression, __FILE__, __func__, __LINE__)
+    ::Opm::gpuistl::detail::cusparseWarnIfError(expression, #expression, __FILE__, __func__, __LINE__)
 #endif // OPM_CUSPARSE_SAFE_CALL_HPP
--- a/opm/simulators/linalg/gpuistl/detail/cusparse_wrapper.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/cusparse_wrapper.hpp
@ -27,7 +27,7 @@
 #include <type_traits>
 #ifndef OPM_CUSPARSE_WRAPPER_HPP
 #define OPM_CUSPARSE_WRAPPER_HPP
-namespace Opm::cuistl::detail
+namespace Opm::gpuistl::detail
 {

 inline cusparseStatus_t
@ -450,5 +450,5 @@ cusparseBsrmv(cusparseHandle_t handle,
                          beta,
                          y);
 }
-} // namespace Opm::cuistl::detail
+} // namespace Opm::gpuistl::detail
 #endif
--- a/opm/simulators/linalg/gpuistl/detail/deviceBlockOperations.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/deviceBlockOperations.hpp
--- a/opm/simulators/linalg/gpuistl/detail/fix_zero_diagonal.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/fix_zero_diagonal.hpp
@ -22,7 +22,7 @@
 #include <limits>
 #include <vector>

-namespace Opm::cuistl::detail
+namespace Opm::gpuistl::detail
 {

 /**
@ -53,6 +53,6 @@ makeMatrixWithNonzeroDiagonal(const Matrix& matrix,

    return newMatrix;
 }
-} // namespace Opm::cuistl::detail
+} // namespace Opm::gpuistl::detail

 #endif
--- a/opm/simulators/linalg/gpuistl/detail/gpuThreadUtils.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/gpuThreadUtils.hpp
@ -21,12 +21,12 @@
 #include <cstddef>
 #include <cuda.h>
 #include <cuda_runtime.h>
-#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>

 /*
    This file provides some logic for handling how to choose the correct thread-block size
 */
-namespace Opm::cuistl::detail
+namespace Opm::gpuistl::detail
 {
 constexpr inline size_t
 getThreads([[maybe_unused]] size_t numberOfRows)
@ -51,7 +51,7 @@ getCudaRecomendedThreadBlockSize(Kernel k, int suggestedThrBlockSize = -1)
    }
    int blockSize;
    int tmpGridSize;
-    OPM_CUDA_SAFE_CALL(cudaOccupancyMaxPotentialBlockSize(&tmpGridSize, &blockSize, k, 0, 0));
+    OPM_GPU_SAFE_CALL(cudaOccupancyMaxPotentialBlockSize(&tmpGridSize, &blockSize, k, 0, 0));
    return blockSize;
 }

@ -61,6 +61,6 @@ getNumberOfBlocks(int wantedThreads, int threadBlockSize)
    return (wantedThreads + threadBlockSize - 1) / threadBlockSize;
 }

-} // namespace Opm::cuistl::detail
+} // namespace Opm::gpuistl::detail

 #endif
--- a/opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp
@ -16,15 +16,15 @@
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
-#ifndef OPM_CUDA_SAFE_CALL_HPP
-#define OPM_CUDA_SAFE_CALL_HPP
+#ifndef OPM_GPU_SAFE_CALL_HPP
+#define OPM_GPU_SAFE_CALL_HPP
 #include <cuda_runtime.h>
 #include <fmt/core.h>
 #include <opm/common/ErrorMacros.hpp>
 #include <opm/common/OpmLog/OpmLog.hpp>
 #include <string_view>

-namespace Opm::cuistl::detail
+namespace Opm::gpuistl::detail
 {
 /**
 * @brief getCudaErrorMessage generates the error message to display for a given error.
@ -48,9 +48,9 @@ getCudaErrorMessage(cudaError_t error,
                    const std::string_view& functionName,
                    size_t lineNumber)
 {
-    return fmt::format("CUDA expression did not execute correctly. Expression was: \n"
+    return fmt::format("GPU expression did not execute correctly. Expression was: \n"
                       "    {}\n"
-                       "CUDA error was {}\n"
+                       "GPU error was {}\n"
                       "in function {}, in {}, at line {}\n",
                       expression,
                       cudaGetErrorString(error),
@ -60,12 +60,12 @@ getCudaErrorMessage(cudaError_t error,
 }

 /**
- * @brief cudaSafeCall checks the return type of the CUDA expression (function call) and throws an exception if it
+ * @brief cudaSafeCall checks the return type of the GPU expression (function call) and throws an exception if it
 * does not equal cudaSuccess.
 *
 * Example usage:
 * @code{.cpp}
- * #include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
+ * #include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
 * #include <cuda_runtime.h>
 *
 * void some_function() {
@ -74,7 +74,7 @@ getCudaErrorMessage(cudaError_t error,
 * }
 * @endcode
 *
- * @note It is probably easier to use the macro OPM_CUDA_SAFE_CALL
+ * @note It is probably easier to use the macro OPM_GPU_SAFE_CALL
 *
 * @todo Refactor to use std::source_location once we shift to C++20
 */
@ -91,7 +91,7 @@ cudaSafeCall(cudaError_t error,
 }

 /**
- * @brief cudaWarnIfError checks the return type of the CUDA expression (function call) and issues a warning if it
+ * @brief cudaWarnIfError checks the return type of the GPU expression (function call) and issues a warning if it
 * does not equal cudaSuccess.
 *
 * @param error the error code from cublas
@ -102,7 +102,7 @@ cudaSafeCall(cudaError_t error,
 *
 * Example usage:
 * @code{.cpp}
- * #include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
+ * #include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
 * #include <cuda_runtime.h>
 *
 * void some_function() {
@ -111,9 +111,9 @@ cudaSafeCall(cudaError_t error,
 * }
 * @endcode
 *
- * @note It is probably easier to use the macro OPM_CUDA_WARN_IF_ERROR
+ * @note It is probably easier to use the macro OPM_GPU_WARN_IF_ERROR
 *
- * @note Prefer the cudaSafeCall/OPM_CUDA_SAFE_CALL counterpart unless you really don't want to throw an exception.
+ * @note Prefer the cudaSafeCall/OPM_GPU_SAFE_CALL counterpart unless you really don't want to throw an exception.
 *
 * @todo Refactor to use std::source_location once we shift to C++20
 */
@ -128,47 +128,47 @@ cudaWarnIfError(cudaError_t error,
        OpmLog::warning(getCudaErrorMessage(error, expression, filename, functionName, lineNumber));
    }
 }
-} // namespace Opm::cuistl::detail
+} // namespace Opm::gpuistl::detail

 /**
- * @brief OPM_CUDA_SAFE_CALL checks the return type of the CUDA expression (function call) and throws an exception if it
+ * @brief OPM_GPU_SAFE_CALL checks the return type of the GPU expression (function call) and throws an exception if it
 * does not equal cudaSuccess.
 *
 * Example usage:
 * @code{.cpp}
- * #include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
+ * #include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
 * #include <cuda_runtime.h>
 *
 * void some_function() {
 *     void* somePointer;
- *     OPM_CUDA_SAFE_CALL(cudaMalloc(&somePointer, 1));
+ *     OPM_GPU_SAFE_CALL(cudaMalloc(&somePointer, 1));
 * }
 * @endcode
 *
- * @note This should be used for any call to the CUDA runtime API unless you have a good reason not to.
+ * @note This should be used for any call to the GPU runtime API unless you have a good reason not to.
 */
-#define OPM_CUDA_SAFE_CALL(expression)                                                                                 \
-    ::Opm::cuistl::detail::cudaSafeCall(expression, #expression, __FILE__, __func__, __LINE__)
+#define OPM_GPU_SAFE_CALL(expression)                                                                                 \
+    ::Opm::gpuistl::detail::cudaSafeCall(expression, #expression, __FILE__, __func__, __LINE__)


 /**
- * @brief OPM_CUDA_WARN_IF_ERROR checks the return type of the CUDA expression (function call) and issues a warning if
+ * @brief OPM_GPU_WARN_IF_ERROR checks the return type of the GPU expression (function call) and issues a warning if
 * it does not equal cudaSuccess.
 *
 * Example usage:
 * @code{.cpp}
- * #include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
+ * #include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
 * #include <cuda_runtime.h>
 *
 * void some_function() {
 *     void* somePointer;
- *     OPM_CUDA_WARN_IF_ERROR(cudaMalloc(&somePointer, 1));
+ *     OPM_GPU_WARN_IF_ERROR(cudaMalloc(&somePointer, 1));
 * }
 * @endcode
 *
- * @note Prefer the cudaSafeCall/OPM_CUDA_SAFE_CALL counterpart unless you really don't want to throw an exception.
+ * @note Prefer the cudaSafeCall/OPM_GPU_SAFE_CALL counterpart unless you really don't want to throw an exception.
 */
-#define OPM_CUDA_WARN_IF_ERROR(expression)                                                                             \
-    ::Opm::cuistl::detail::cudaWarnIfError(expression, #expression, __FILE__, __func__, __LINE__)
+#define OPM_GPU_WARN_IF_ERROR(expression)                                                                             \
+    ::Opm::gpuistl::detail::cudaWarnIfError(expression, #expression, __FILE__, __func__, __LINE__)

 #endif
--- a/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.cu
+++ b/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.cu
@ -18,12 +18,12 @@
 */
 #include <config.h>
 #include <opm/common/ErrorMacros.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
-#include <opm/simulators/linalg/cuistl/detail/deviceBlockOperations.hpp>
-#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/deviceBlockOperations.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpuThreadUtils.hpp>
 #include <stdexcept>

-namespace Opm::cuistl::detail
+namespace Opm::gpuistl::detail
 {
 namespace
 {
@ -110,8 +110,8 @@ copyMatDataToReordered(T* srcMatrix,
                       int thrBlockSize)
 {
    int threadBlockSize
-        = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuMoveDataToReordered<T, blocksize>, thrBlockSize);
-    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize);
+        = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuMoveDataToReordered<T, blocksize>, thrBlockSize);
+    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize);
    cuMoveDataToReordered<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        srcMatrix, srcRowIndices, dstMatrix, dstRowIndices, naturalToReordered, numberOfRows);
 }
@ -130,9 +130,9 @@ copyMatDataToReorderedSplit(T* srcMatrix,
                            size_t numberOfRows,
                            int thrBlockSize)
 {
-    int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
+    int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
        cuMoveDataToReorderedSplit<T, blocksize>, thrBlockSize);
-    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize);
+    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize);
    cuMoveDataToReorderedSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(srcMatrix,
                                                                                 srcRowIndices,
                                                                                 srcColumnIndices,
@ -161,4 +161,4 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 3);
 INSTANTIATE_KERNEL_WRAPPERS(double, 4);
 INSTANTIATE_KERNEL_WRAPPERS(double, 5);
 INSTANTIATE_KERNEL_WRAPPERS(double, 6);
-} // namespace Opm::cuistl::detail
+} // namespace Opm::gpuistl::detail
--- a/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.hpp
@ -16,11 +16,11 @@
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
-#ifndef OPM_CUISTL_CUSPARSE_MATRIX_OPERATIONS_HPP
-#define OPM_CUISTL_CUSPARSE_MATRIX_OPERATIONS_HPP
+#ifndef OPM_GPUISTL_GPUSPARSE_MATRIX_OPERATIONS_HPP
+#define OPM_GPUISTL_GPUSPARSE_MATRIX_OPERATIONS_HPP
 #include <cstddef>
 #include <vector>
-namespace Opm::cuistl::detail
+namespace Opm::gpuistl::detail
 {
 /**
 * @brief Reorders the elements of a matrix by copying them from one matrix to another using a permutation list
@ -68,5 +68,5 @@ void copyMatDataToReorderedSplit(T* srcMatrix,
                                 size_t numberOfRows,
                                 int threadBlockSize);

-} // namespace Opm::cuistl::detail
+} // namespace Opm::gpuistl::detail
 #endif
--- a/opm/simulators/linalg/gpuistl/detail/has_function.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/has_function.hpp
@ -29,7 +29,7 @@
 * TODO: Use the requires-keyword once C++20 becomes availble (https://en.cppreference.com/w/cpp/language/constraints ).
 * With C++20 this file can be removed.
 */
-namespace Opm::cuistl::detail
+namespace Opm::gpuistl::detail
 {

 /**
@ -121,5 +121,5 @@ public:
    static constexpr bool value = std::is_same_v<decltype(test<T>(0)), std::true_type>;
 };

-} // namespace Opm::cuistl::detail
+} // namespace Opm::gpuistl::detail
 #endif
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu
@ -18,12 +18,12 @@
 */
 #include <config.h>
 #include <opm/common/ErrorMacros.hpp>
-#include <opm/simulators/linalg/cuistl/detail/deviceBlockOperations.hpp>
-#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>
-#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/DILUKernels.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/deviceBlockOperations.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpuThreadUtils.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp>
 #include <stdexcept>

-namespace Opm::cuistl::detail::DILU
+namespace Opm::gpuistl::detail::DILU
 {
 namespace
 {
@ -282,8 +282,8 @@ solveLowerLevelSet(T* reorderedMat,
                   int thrBlockSize)
 {
    int threadBlockSize
-        = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet<T, blocksize>, thrBlockSize);
-    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
+        = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet<T, blocksize>, thrBlockSize);
+    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
 }
@ -302,9 +302,9 @@ solveLowerLevelSetSplit(T* reorderedMat,
                        T* v,
                        int thrBlockSize)
 {
-    int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
+    int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
        cuSolveLowerLevelSetSplit<T, blocksize>, thrBlockSize);
-    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
+    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuSolveLowerLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
 }
@ -322,8 +322,8 @@ solveUpperLevelSet(T* reorderedMat,
                   int thrBlockSize)
 {
    int threadBlockSize
-        = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet<T, blocksize>, thrBlockSize);
-    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
+        = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet<T, blocksize>, thrBlockSize);
+    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }
@ -340,9 +340,9 @@ solveUpperLevelSetSplit(T* reorderedMat,
                        T* v,
                        int thrBlockSize)
 {
-    int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
+    int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
        cuSolveUpperLevelSetSplit<T, blocksize>, thrBlockSize);
-    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
+    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuSolveUpperLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }
@ -360,9 +360,9 @@ computeDiluDiagonal(T* reorderedMat,
                    int thrBlockSize)
 {
    if (blocksize <= 3) {
-        int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
+        int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
            cuComputeDiluDiagonal<T, blocksize>, thrBlockSize);
-        int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
+        int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
        cuComputeDiluDiagonal<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(reorderedMat,
                                                                                rowIndices,
                                                                                colIndices,
@ -393,9 +393,9 @@ computeDiluDiagonalSplit(T* reorderedLowerMat,
                         int thrBlockSize)
 {
    if (blocksize <= 3) {
-        int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
+        int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
            cuComputeDiluDiagonalSplit<T, blocksize>, thrBlockSize);
-        int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
+        int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
        cuComputeDiluDiagonalSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(reorderedLowerMat,
                                                                                     lowerRowIndices,
                                                                                     lowerColIndices,
@ -434,4 +434,4 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 3);
 INSTANTIATE_KERNEL_WRAPPERS(double, 4);
 INSTANTIATE_KERNEL_WRAPPERS(double, 5);
 INSTANTIATE_KERNEL_WRAPPERS(double, 6);
-} // namespace Opm::cuistl::detail::DILU
+} // namespace Opm::gpuistl::detail::DILU
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp
@ -24,7 +24,7 @@
 #include <cuda_runtime.h>
 #include <vector>

-namespace Opm::cuistl::detail::DILU
+namespace Opm::gpuistl::detail::DILU
 {

 /**
@ -198,5 +198,5 @@ void computeDiluDiagonalSplit(T* reorderedLowerMat,
                              T* dInv,
                              int threadBlockSize);

-} // namespace Opm::cuistl::detail::DILU
+} // namespace Opm::gpuistl::detail::DILU
 #endif
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu
@ -18,16 +18,16 @@
 */
 #include <config.h>
 #include <opm/common/ErrorMacros.hpp>
-#include <opm/simulators/linalg/cuistl/detail/deviceBlockOperations.hpp>
-#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>
-#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/ILU0Kernels.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/deviceBlockOperations.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpuThreadUtils.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp>
 #include <stdexcept>

 /*
    The LU factorization and apply step is written based on the Dune implementations
 */

-namespace Opm::cuistl::detail::ILU0
+namespace Opm::gpuistl::detail::ILU0
 {
 namespace
 {
@ -341,8 +341,8 @@ solveLowerLevelSet(T* reorderedMat,
                   int thrBlockSize)
 {
    int threadBlockSize
-        = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet<T, blocksize>, thrBlockSize);
-    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
+        = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet<T, blocksize>, thrBlockSize);
+    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
 }
@ -359,8 +359,8 @@ solveUpperLevelSet(T* reorderedMat,
                   int thrBlockSize)
 {
    int threadBlockSize
-        = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet<T, blocksize>, thrBlockSize);
-    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
+        = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet<T, blocksize>, thrBlockSize);
+    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, v);
 }
@ -377,9 +377,9 @@ solveLowerLevelSetSplit(T* reorderedMat,
                        T* v,
                        int thrBlockSize)
 {
-    int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
+    int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
        cuSolveLowerLevelSetSplit<T, blocksize>, thrBlockSize);
-    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
+    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuSolveLowerLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
 }
@ -396,9 +396,9 @@ solveUpperLevelSetSplit(T* reorderedMat,
                        T* v,
                        int thrBlockSize)
 {
-    int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
+    int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
        cuSolveUpperLevelSetSplit<T, blocksize>, thrBlockSize);
-    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
+    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuSolveUpperLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }
@ -415,8 +415,8 @@ LUFactorization(T* srcMatrix,
                int thrBlockSize)
 {
    int threadBlockSize
-        = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorization<T, blocksize>, thrBlockSize);
-    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
+        = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorization<T, blocksize>, thrBlockSize);
+    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuLUFactorization<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        srcMatrix, srcRowIndices, srcColumnIndices, naturalToReordered, reorderedToNatual, rowsInLevelSet, startIdx);
 }
@ -437,8 +437,8 @@ LUFactorizationSplit(T* reorderedLowerMat,
                     int thrBlockSize)
 {
    int threadBlockSize
-        = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorizationSplit<T, blocksize>, thrBlockSize);
-    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
+        = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorizationSplit<T, blocksize>, thrBlockSize);
+    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuLUFactorizationSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(reorderedLowerMat,
                                                                             lowerRowIndices,
                                                                             lowerColIndices,
@ -473,4 +473,4 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 3);
 INSTANTIATE_KERNEL_WRAPPERS(double, 4);
 INSTANTIATE_KERNEL_WRAPPERS(double, 5);
 INSTANTIATE_KERNEL_WRAPPERS(double, 6);
-} // namespace Opm::cuistl::detail::ILU0
+} // namespace Opm::gpuistl::detail::ILU0
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp
@ -20,7 +20,7 @@
 #define OPM_ILU0_KERNELS_HPP
 #include <cstddef>
 #include <vector>
-namespace Opm::cuistl::detail::ILU0
+namespace Opm::gpuistl::detail::ILU0
 {

 /**
@ -189,5 +189,5 @@ void LUFactorizationSplit(T* reorderedLowerMat,
                          int rowsInLevelSet,
                          int threadBlockSize);

-} // namespace Opm::cuistl::detail::ILU0
+} // namespace Opm::gpuistl::detail::ILU0
 #endif
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/JacKernels.cu
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/JacKernels.cu
@ -18,12 +18,12 @@
 */
 #include <config.h>
 #include <opm/common/ErrorMacros.hpp>
-#include <opm/simulators/linalg/cuistl/detail/deviceBlockOperations.hpp>
-#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>
-#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/JacKernels.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/deviceBlockOperations.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpuThreadUtils.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/preconditionerKernels/JacKernels.hpp>
 #include <stdexcept>

-namespace Opm::cuistl::detail::JAC
+namespace Opm::gpuistl::detail::JAC
 {
 namespace
 {
@ -60,8 +60,8 @@ invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t number
 {
    if (blocksize <= 3) {
        int threadBlockSize
-            = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuInvertDiagonalAndFlatten<T, blocksize>);
-        int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize);
+            = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuInvertDiagonalAndFlatten<T, blocksize>);
+        int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize);
        cuInvertDiagonalAndFlatten<T, blocksize>
            <<<nThreadBlocks, threadBlockSize>>>(mat, rowIndices, colIndices, numberOfRows, vec);
    } else {
@ -85,4 +85,4 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 4);
 INSTANTIATE_KERNEL_WRAPPERS(double, 5);
 INSTANTIATE_KERNEL_WRAPPERS(double, 6);

-} // namespace Opm::cuistl::detail::JAC
+} // namespace Opm::gpuistl::detail::JAC
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/JacKernels.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/JacKernels.hpp
@ -20,7 +20,7 @@
 #define OPM_JAC_KERNELS_HPP
 #include <cstddef>
 #include <vector>
-namespace Opm::cuistl::detail::JAC
+namespace Opm::gpuistl::detail::JAC
 {

 /**
@ -34,5 +34,5 @@ namespace Opm::cuistl::detail::JAC
 template <class T, int blocksize>
 void invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec);

-} // namespace Opm::cuistl::detail::JAC
+} // namespace Opm::gpuistl::detail::JAC
 #endif
--- a/opm/simulators/linalg/gpuistl/detail/preconditioner_should_call_post_pre.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/preconditioner_should_call_post_pre.hpp
@ -20,9 +20,9 @@
 #ifndef OPM_CUISTL_PRECONDIDTIONER_SHOULD_CALL_POST_PRE_HPP
 #define OPM_CUISTL_PRECONDIDTIONER_SHOULD_CALL_POST_PRE_HPP

-#include <opm/simulators/linalg/cuistl/detail/has_function.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/has_function.hpp>

-namespace Opm::cuistl::detail
+namespace Opm::gpuistl::detail
 {

 //! @brief Tests (compile time) if the preconditioner type needs to call pre() before a call to apply()
@ -60,5 +60,5 @@ shouldCallPreconditionerPost()
        return true;
    }
 }
-} // namespace Opm::cuistl::detail
+} // namespace Opm::gpuistl::detail
 #endif
--- a/opm/simulators/linalg/gpuistl/detail/safe_conversion.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/safe_conversion.hpp
@ -37,7 +37,7 @@
 * while Dune::BlockVector (and relatives) use unsigned size_t.
 */

-namespace Opm::cuistl::detail
+namespace Opm::gpuistl::detail
 {

 /**
@ -106,6 +106,6 @@ to_size_t(int i)

    return std::size_t(i);
 }
-} // namespace Opm::cuistl::detail
+} // namespace Opm::gpuistl::detail

 #endif
--- a/opm/simulators/linalg/gpuistl/detail/vector_operations.cu
+++ b/opm/simulators/linalg/gpuistl/detail/vector_operations.cu
@ -18,14 +18,14 @@
 */
 #include <config.h>
 #include <opm/common/ErrorMacros.hpp>
-#include <opm/simulators/linalg/cuistl/CuVector.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cublas_safe_call.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cublas_wrapper.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
-#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>
-#include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/cublas_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/cublas_wrapper.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpuThreadUtils.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/vector_operations.hpp>
 #include <stdexcept>
-namespace Opm::cuistl::detail
+namespace Opm::gpuistl::detail
 {

 namespace
@ -108,8 +108,8 @@ template <class T>
 void
 setVectorValue(T* deviceData, size_t numberOfElements, const T& value)
 {
-    int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(setVectorValueKernel<T>);
-    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
+    int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(setVectorValueKernel<T>);
+    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
    setVectorValueKernel<<<nThreadBlocks, threadBlockSize>>>(deviceData, numberOfElements, value);
 }

@ -121,8 +121,8 @@ template <class T>
 void
 setZeroAtIndexSet(T* deviceData, size_t numberOfElements, const int* indices)
 {
-    int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(setZeroAtIndexSetKernel<T>);
-    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
+    int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(setZeroAtIndexSetKernel<T>);
+    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
    setZeroAtIndexSetKernel<<<nThreadBlocks, threadBlockSize>>>(deviceData, numberOfElements, indices);
 }
 template void setZeroAtIndexSet(double*, size_t, const int*);
@ -138,12 +138,12 @@ innerProductAtIndices(cublasHandle_t cublasHandle,
                      size_t numberOfElements,
                      const int* indices)
 {
-    int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(elementWiseMultiplyKernel<T>);
-    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
+    int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(elementWiseMultiplyKernel<T>);
+    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
    elementWiseMultiplyKernel<<<nThreadBlocks, threadBlockSize>>>(deviceA, deviceB, buffer, numberOfElements, indices);

    // TODO: [perf] Get rid of the allocation here.
-    CuVector<T> oneVector(numberOfElements);
+    GpuVector<T> oneVector(numberOfElements);
    oneVector = 1.0;
    T result = 0.0;
    OPM_CUBLAS_SAFE_CALL(cublasDot(cublasHandle, numberOfElements, oneVector.data(), 1, buffer, 1, &result));
@ -158,10 +158,10 @@ template <class T>
 void
 prepareSendBuf(const T* deviceA, T* buffer, size_t numberOfElements, const int* indices)
 {
-    int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(prepareSendBufKernel<T>);
-    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
+    int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(prepareSendBufKernel<T>);
+    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
    prepareSendBufKernel<<<nThreadBlocks, threadBlockSize>>>(deviceA, buffer, numberOfElements, indices);
-    OPM_CUDA_SAFE_CALL(cudaDeviceSynchronize()); // The buffers are prepared for MPI. Wait for them to finish.
+    OPM_GPU_SAFE_CALL(cudaDeviceSynchronize()); // The buffers are prepared for MPI. Wait for them to finish.
 }
 template void prepareSendBuf(const double* deviceA, double* buffer, size_t numberOfElements, const int* indices);
 template void prepareSendBuf(const float* deviceA, float* buffer, size_t numberOfElements, const int* indices);
@ -171,8 +171,8 @@ template <class T>
 void
 syncFromRecvBuf(T* deviceA, T* buffer, size_t numberOfElements, const int* indices)
 {
-    int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(syncFromRecvBufKernel<T>);
-    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
+    int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(syncFromRecvBufKernel<T>);
+    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
    syncFromRecvBufKernel<<<nThreadBlocks, threadBlockSize>>>(deviceA, buffer, numberOfElements, indices);
    // cudaDeviceSynchronize(); // Not needed, I guess...
 }
@ -191,20 +191,20 @@ weightedDiagMV(const T* squareBlockVector,
 {
    switch (blocksize) {
    case 1: {
-        int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 1>);
-        int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
+        int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 1>);
+        int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
        weightedDiagMV<T, 1>
            <<<nThreadBlocks, threadBlockSize>>>(squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
    } break;
    case 2: {
-        int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 2>);
-        int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
+        int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 2>);
+        int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
        weightedDiagMV<T, 2>
            <<<nThreadBlocks, threadBlockSize>>>(squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
    } break;
    case 3: {
-        int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 3>);
-        int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
+        int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 3>);
+        int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
        weightedDiagMV<T, 3>
            <<<nThreadBlocks, threadBlockSize>>>(squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
    } break;
@ -217,4 +217,4 @@ weightedDiagMV(const T* squareBlockVector,
 template void weightedDiagMV(const double*, const size_t, const size_t, double, const double*, double*);
 template void weightedDiagMV(const float*, const size_t, const size_t, float, const float*, float*);

-} // namespace Opm::cuistl::detail
+} // namespace Opm::gpuistl::detail
--- a/opm/simulators/linalg/gpuistl/detail/vector_operations.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/vector_operations.hpp
@ -20,7 +20,7 @@
 #define OPM_CUISTL_VECTOR_OPERATIONS_HPP
 #include <cstddef>
 #include <cublas_v2.h>
-namespace Opm::cuistl::detail
+namespace Opm::gpuistl::detail
 {

 /**
@ -65,11 +65,11 @@ void syncFromRecvBuf(T* deviceA, T* buffer, size_t numberOfElements, const int*
 /**
 * @brief Compue the weighted matrix vector product where the matrix is diagonal, the diagonal is a vector, meaning we
 * compute the Hadamard product.
- * @param squareBlockVector A CuVector whose elements are NxN matrix blocks
+ * @param squareBlockVector A GpuVector whose elements are NxN matrix blocks
 * @param numberOfRows The number of rows in the vector
 * @param blocksize The sidelength of the square block elements in the vector
- * @param src_vec A pointer to the data of the CuVector we multiply the blockvector with
- * @param[out] dst_vec A pointer to the data of the CuVector we store the result in
+ * @param src_vec A pointer to the data of the GpuVector we multiply the blockvector with
+ * @param[out] dst_vec A pointer to the data of the GpuVector we store the result in
 *
 * @note This is implemented as a faster way to multiply a diagonal matrix with a blockvector. We need only store the
 * diagonal of the matrix and use this product.
@ -81,5 +81,5 @@ void weightedDiagMV(const T* squareBlockVector,
                    T relaxationFactor,
                    const T* srcVec,
                    T* dstVec);
-} // namespace Opm::cuistl::detail
+} // namespace Opm::gpuistl::detail
 #endif
--- a/opm/simulators/linalg/gpuistl/set_device.cpp
+++ b/opm/simulators/linalg/gpuistl/set_device.cpp
@ -19,9 +19,9 @@
 #include <config.h>
 #include <cuda_runtime.h>
 #include <opm/common/OpmLog/OpmLog.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
-#include <opm/simulators/linalg/cuistl/set_device.hpp>
-namespace Opm::cuistl
+#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/set_device.hpp>
+namespace Opm::gpuistl
 {
 void
 setDevice(int mpiRank, [[maybe_unused]] int numberOfMpiRanks)
@ -41,9 +41,9 @@ setDevice(int mpiRank, [[maybe_unused]] int numberOfMpiRanks)
    // Now do a round robin kind of assignment
    // TODO: We need to be more sophistacted here. We have no guarantee this will pick the correct device.
    const auto deviceId = mpiRank % deviceCount;
-    OPM_CUDA_SAFE_CALL(cudaDeviceReset());
-    OPM_CUDA_SAFE_CALL(cudaSetDevice(deviceId));
+    OPM_GPU_SAFE_CALL(cudaDeviceReset());
+    OPM_GPU_SAFE_CALL(cudaSetDevice(deviceId));
    OpmLog::info("Set CUDA device to " + std::to_string(deviceId) + " (out of " + std::to_string(deviceCount)
                 + " devices).");
 }
-} // namespace Opm::cuistl
+} // namespace Opm::gpuistl
--- a/opm/simulators/linalg/gpuistl/set_device.hpp
+++ b/opm/simulators/linalg/gpuistl/set_device.hpp
@ -20,7 +20,7 @@
 #ifndef OPM_CUISTL_SET_DEVICE_HEADER
 #define OPM_CUISTL_SET_DEVICE_HEADER

-namespace Opm::cuistl
+namespace Opm::gpuistl
 {
 //! @brief Sets the correct CUDA device in the setting of MPI
 //!
@ -32,5 +32,5 @@ namespace Opm::cuistl
 //!
 //! @note If no CUDA device is present, this does nothing.
 void setDevice(int mpiRank, int numberOfMpiRanks);
-} // namespace Opm::cuistl
+} // namespace Opm::gpuistl
 #endif
--- a/tests/gpuistl/test_GpuBuffer.cu
+++ b/tests/gpuistl/test_GpuBuffer.cu
@ -18,14 +18,14 @@
 */
 #include <config.h>

-#define BOOST_TEST_MODULE TestCuBuffer
+#define BOOST_TEST_MODULE TestGpuBuffer

 #include <boost/test/unit_test.hpp>
 #include <cuda_runtime.h>

-#include <opm/simulators/linalg/cuistl/CuBuffer.hpp>
-#include <opm/simulators/linalg/cuistl/CuView.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuBuffer.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuView.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>

 #include <array>
 #include <algorithm>
@ -35,15 +35,15 @@ BOOST_AUTO_TEST_CASE(TestMakeView)
 {
    // test that we can create buffers and make views of the buffers using the pointer constructor
    auto buf = std::vector<int>({1, 2, 3, 4, 5, 6});
-    const auto gpubuf = ::Opm::cuistl::CuBuffer<int>(buf);
-    auto gpuview = ::Opm::cuistl::CuView<int>(buf.data(), buf.size());
-    bool gpuBufCreatedView = std::is_same<::Opm::cuistl::CuView<int>, decltype(gpuview)>::value;
+    const auto gpubuf = ::Opm::gpuistl::GpuBuffer<int>(buf);
+    auto gpuview = ::Opm::gpuistl::GpuView<int>(buf.data(), buf.size());
+    bool gpuBufCreatedView = std::is_same<::Opm::gpuistl::GpuView<int>, decltype(gpuview)>::value;

    BOOST_CHECK(gpuBufCreatedView);

-    // test that we can make views of buffers by using the cubuffer constructor
-    auto gpuview2 = ::Opm::cuistl::make_view(gpubuf);
-    bool gpuBufCreatedView2 = std::is_same<::Opm::cuistl::CuView<const int>, decltype(gpuview2)>::value;
+    // test that we can make views of buffers by using the GpuBuffer constructor
+    auto gpuview2 = ::Opm::gpuistl::make_view(gpubuf);
+    bool gpuBufCreatedView2 = std::is_same<::Opm::gpuistl::GpuView<const int>, decltype(gpuview2)>::value;

    BOOST_CHECK(gpuBufCreatedView2);

--- a/tests/gpuistl/test_GpuDILU.cpp
+++ b/tests/gpuistl/test_GpuDILU.cpp
@ -18,18 +18,18 @@
 */
 #include <config.h>

-#define BOOST_TEST_MODULE TestCuDiluHelpers
+#define BOOST_TEST_MODULE TestGpuDILU

 #include <boost/test/unit_test.hpp>
 #include <dune/common/fmatrix.hh>
 #include <dune/istl/bcrsmatrix.hh>
 #include <memory>
 #include <opm/simulators/linalg/DILU.hpp>
-#include <opm/simulators/linalg/cuistl/CuDILU.hpp>
-#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
-#include <opm/simulators/linalg/cuistl/CuVector.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuDILU.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.hpp>
 #include <random>
 #include <vector>

@ -41,11 +41,11 @@ using B1x1Vec = Dune::BlockVector<Dune::FieldVector<double, 1>>;
 using B2x2Vec = Dune::BlockVector<Dune::FieldVector<double, 2>>;
 using Sp1x1BlockMatrix = Dune::BCRSMatrix<FM1x1>;
 using Sp2x2BlockMatrix = Dune::BCRSMatrix<FM2x2>;
-using CuMatrix = Opm::cuistl::CuSparseMatrix<T>;
-using CuIntVec = Opm::cuistl::CuVector<int>;
-using CuFloatingPointVec = Opm::cuistl::CuVector<T>;
-using CuDilu1x1 = Opm::cuistl::CuDILU<Sp1x1BlockMatrix, CuFloatingPointVec, CuFloatingPointVec>;
-using CuDilu2x2 = Opm::cuistl::CuDILU<Sp2x2BlockMatrix, CuFloatingPointVec, CuFloatingPointVec>;
+using CuMatrix = Opm::gpuistl::GpuSparseMatrix<T>;
+using CuIntVec = Opm::gpuistl::GpuVector<int>;
+using CuFloatingPointVec = Opm::gpuistl::GpuVector<T>;
+using GpuDilu1x1 = Opm::gpuistl::GpuDILU<Sp1x1BlockMatrix, CuFloatingPointVec, CuFloatingPointVec>;
+using GpuDilu2x2 = Opm::gpuistl::GpuDILU<Sp2x2BlockMatrix, CuFloatingPointVec, CuFloatingPointVec>;

 Sp1x1BlockMatrix
 get1x1BlockTestMatrix()
@ -211,7 +211,7 @@ BOOST_AUTO_TEST_CASE(TestDiluApply)

    // Initialize preconditioner objects
    Dune::MultithreadDILU<Sp1x1BlockMatrix, B1x1Vec, B1x1Vec> cpudilu(matA);
-    auto gpudilu = CuDilu1x1(matA, true, true);
+    auto gpudilu = GpuDilu1x1(matA, true, true);

    // Use the apply
    gpudilu.apply(d_output, d_input);
@ -224,7 +224,7 @@ BOOST_AUTO_TEST_CASE(TestDiluApply)
    }
    auto cudilures = d_output.asStdVector();

-    // check that CuDilu results matches that of CPU dilu
+    // check that GpuDilu results matches that of CPU dilu
    for (size_t i = 0; i < cudilures.size(); ++i) {
        BOOST_CHECK_CLOSE(cudilures[i], cpudilures[i], 1e-7);
    }
@ -235,7 +235,7 @@ BOOST_AUTO_TEST_CASE(TestDiluApplyBlocked)

    // init matrix with 2x2 blocks
    Sp2x2BlockMatrix matA = get2x2BlockTestMatrix();
-    auto gpudilu = CuDilu2x2(matA, true, true);
+    auto gpudilu = GpuDilu2x2(matA, true, true);
    Dune::MultithreadDILU<Sp2x2BlockMatrix, B2x2Vec, B2x2Vec> cpudilu(matA);

    // create input/output buffers for the apply
@ -275,7 +275,7 @@ BOOST_AUTO_TEST_CASE(TestDiluInitAndUpdateLarge)
 {
    // create gpu dilu preconditioner
    Sp1x1BlockMatrix matA = get1x1BlockTestMatrix();
-    auto gpudilu = CuDilu1x1(matA, true, true);
+    auto gpudilu = GpuDilu1x1(matA, true, true);

    matA[0][0][0][0] = 11.0;
    matA[0][1][0][0] = 12.0;
@ -325,7 +325,7 @@ BOOST_AUTO_TEST_CASE(TestDiluInitAndUpdateLarge)
    }
    auto cudilures = d_output.asStdVector();

-    // check that CuDilu results matches that of CPU dilu
+    // check that GpuDilu results matches that of CPU dilu
    for (size_t i = 0; i < cudilures.size(); ++i) {
        BOOST_CHECK_CLOSE(cudilures[i], cpudilures[i], 1e-7);
    }
--- a/tests/gpuistl/test_GpuJac.cpp
+++ b/tests/gpuistl/test_GpuJac.cpp
@ -18,22 +18,22 @@
 */
 #include <config.h>

-#define BOOST_TEST_MODULE TestCuJac
+#define BOOST_TEST_MODULE TestGpuJac
 #include <boost/mpl/list.hpp>
 #include <boost/test/unit_test.hpp>
 #include <cuda_runtime.h>
 #include <dune/istl/bcrsmatrix.hh>
-#include <opm/simulators/linalg/cuistl/CuJac.hpp>
-#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
-#include <opm/simulators/linalg/cuistl/CuVector.hpp>
-#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
-#include <opm/simulators/linalg/cuistl/detail/fix_zero_diagonal.hpp>
-#include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuJac.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
+#include <opm/simulators/linalg/gpuistl/PreconditionerAdapter.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/fix_zero_diagonal.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/vector_operations.hpp>

 using NumericTypes = boost::mpl::list<double, float>;

-BOOST_AUTO_TEST_CASE_TEMPLATE(CUJACApplyBlocksize2, T, NumericTypes)
+BOOST_AUTO_TEST_CASE_TEMPLATE(GPUJACApplyBlocksize2, T, NumericTypes)
 {
    /*
       Test data to validate jacobi preconditioner, expected result is x_1, and relaxation factor is 0.5
@ -49,7 +49,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(CUJACApplyBlocksize2, T, NumericTypes)
    using M = Dune::FieldMatrix<T, blocksize, blocksize>;
    using SpMatrix = Dune::BCRSMatrix<M>;
    using Vector = Dune::BlockVector<Dune::FieldVector<T, blocksize>>;
-    using CuJac = Opm::cuistl::CuJac<SpMatrix, Opm::cuistl::CuVector<T>, Opm::cuistl::CuVector<T>>;
+    using GpuJac = Opm::gpuistl::GpuJac<SpMatrix, Opm::gpuistl::GpuVector<T>, Opm::gpuistl::GpuVector<T>>;

    SpMatrix B(N, N, nonZeroes, SpMatrix::row_wise);
    for (auto row = B.createbegin(); row != B.createend(); ++row) {
@ -70,7 +70,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(CUJACApplyBlocksize2, T, NumericTypes)
    B[1][1][0][0] = -1.0;
    B[1][1][1][1] = -1.0;

-    auto cujac = Opm::cuistl::PreconditionerAdapter<Vector, Vector, CuJac>(std::make_shared<CuJac>(B, 0.5));
+    auto gpujac = Opm::gpuistl::PreconditionerAdapter<Vector, Vector, GpuJac>(std::make_shared<GpuJac>(B, 0.5));

    Vector vVector(2);
    Vector dVector(2);
@ -81,14 +81,14 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(CUJACApplyBlocksize2, T, NumericTypes)

    const T expectedAns[2][2] = {{1.0 / 2.0, -1.0 / 2.0}, {-3.0 / 2.0, -2.0}};

-    cujac.apply(vVector, dVector);
+    gpujac.apply(vVector, dVector);
    BOOST_CHECK_CLOSE(vVector[0][0], expectedAns[0][0], 1e-7);
    BOOST_CHECK_CLOSE(vVector[0][1], expectedAns[0][1], 1e-7);
    BOOST_CHECK_CLOSE(vVector[1][0], expectedAns[1][0], 1e-7);
    BOOST_CHECK_CLOSE(vVector[1][1], expectedAns[1][1], 1e-7);
 }

-BOOST_AUTO_TEST_CASE_TEMPLATE(CUJACApplyBlocksize1, T, NumericTypes)
+BOOST_AUTO_TEST_CASE_TEMPLATE(GPUJACApplyBlocksize1, T, NumericTypes)
 {
    /*
       Test data to validate jacobi preconditioner, expected result is x_1, relaxation factor is 0.5
@ -103,7 +103,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(CUJACApplyBlocksize1, T, NumericTypes)
    using M = Dune::FieldMatrix<T, blocksize, blocksize>;
    using SpMatrix = Dune::BCRSMatrix<M>;
    using Vector = Dune::BlockVector<Dune::FieldVector<T, blocksize>>;
-    using CuJac = Opm::cuistl::CuJac<SpMatrix, Opm::cuistl::CuVector<T>, Opm::cuistl::CuVector<T>>;
+    using GpuJac = Opm::gpuistl::GpuJac<SpMatrix, Opm::gpuistl::GpuVector<T>, Opm::gpuistl::GpuVector<T>>;

    SpMatrix B(N, N, nonZeroes, SpMatrix::row_wise);
    for (auto row = B.createbegin(); row != B.createend(); ++row) {
@ -129,7 +129,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(CUJACApplyBlocksize1, T, NumericTypes)
    B[2][2][0][0] = -1.0;
    B[3][3][0][0] = -1.0;

-    auto cujac = Opm::cuistl::PreconditionerAdapter<Vector, Vector, CuJac>(std::make_shared<CuJac>(B, 0.5));
+    auto gpujac = Opm::gpuistl::PreconditionerAdapter<Vector, Vector, GpuJac>(std::make_shared<GpuJac>(B, 0.5));

    Vector vVector(4);
    Vector dVector(4);
@ -140,7 +140,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(CUJACApplyBlocksize1, T, NumericTypes)

    const T expectedAns[4] = {1.0 / 3.0, 1.0 / 2.0, -3.0 / 2.0, -2.0};

-    cujac.apply(vVector, dVector);
+    gpujac.apply(vVector, dVector);
    BOOST_CHECK_CLOSE(vVector[0], expectedAns[0], 1e-7);
    BOOST_CHECK_CLOSE(vVector[1], expectedAns[1], 1e-7);
    BOOST_CHECK_CLOSE(vVector[2], expectedAns[2], 1e-7);
--- a/tests/gpuistl/test_GpuOwnerOverlapCopy.cpp
+++ b/tests/gpuistl/test_GpuOwnerOverlapCopy.cpp
@ -18,7 +18,7 @@
 */
 #include <config.h>

-#define BOOST_TEST_MODULE TestCuOwnerOverlapCopy
+#define BOOST_TEST_MODULE TestGpuOwnerOverlapCopy
 #define BOOST_TEST_NO_MAIN

 #include <boost/test/unit_test.hpp>
@ -26,10 +26,10 @@
 #include <dune/istl/bcrsmatrix.hh>
 #include <dune/istl/owneroverlapcopy.hh>
 #include <memory>
-#include <opm/simulators/linalg/cuistl/CuOwnerOverlapCopy.hpp>
-#include <opm/simulators/linalg/cuistl/CuVector.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
-#include <opm/simulators/linalg/cuistl/set_device.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuOwnerOverlapCopy.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/set_device.hpp>
 #include <random>
 #include <mpi.h>

@ -46,7 +46,7 @@ main(int argc, char** argv)
    int rank, totalRanks;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &totalRanks);
-    Opm::cuistl::setDevice(rank, totalRanks);
+    Opm::gpuistl::setDevice(rank, totalRanks);
    boost::unit_test::unit_test_main(&init_unit_test_func, argc, argv);
 }

@ -62,14 +62,14 @@ BOOST_AUTO_TEST_CASE(TestProject)

    auto ownerOverlapCopy = Dune::OwnerOverlapCopyCommunication<int>(indexInfo, MPI_COMM_WORLD);
    auto xCPU = std::vector<double> {{1.0, 2.0, 3.0}};
-    auto xGPU = Opm::cuistl::CuVector<double>(xCPU);
+    auto xGPU = Opm::gpuistl::GpuVector<double>(xCPU);

-    auto gpuComm = std::make_shared<Opm::cuistl::GPUObliviousMPISender<double, 1, Dune::OwnerOverlapCopyCommunication<int>>>(ownerOverlapCopy);
+    auto gpuComm = std::make_shared<Opm::gpuistl::GPUObliviousMPISender<double, 1, Dune::OwnerOverlapCopyCommunication<int>>>(ownerOverlapCopy);
    
-    auto cuOwnerOverlapCopy
-        = Opm::cuistl::CuOwnerOverlapCopy<double, 1, Dune::OwnerOverlapCopyCommunication<int>>(gpuComm);
+    auto GpuOwnerOverlapCopy
+        = Opm::gpuistl::GpuOwnerOverlapCopy<double, 1, Dune::OwnerOverlapCopyCommunication<int>>(gpuComm);

-    cuOwnerOverlapCopy.project(xGPU);
+    GpuOwnerOverlapCopy.project(xGPU);

    auto resultOfProject = xGPU.asStdVector();

@ -94,19 +94,19 @@ BOOST_AUTO_TEST_CASE(TestDot)
    indexInfo.addRemoteIndex(std::make_tuple(0, 2, Dune::OwnerOverlapCopyAttributeSet::copy));
    auto ownerOverlapCopy = Dune::OwnerOverlapCopyCommunication<int>(indexInfo, MPI_COMM_WORLD);
    auto xCPU = std::vector<double> {{1.0, 2.0, 3.0}};
-    auto xGPU = Opm::cuistl::CuVector<double>(xCPU);
+    auto xGPU = Opm::gpuistl::GpuVector<double>(xCPU);

-    auto gpuComm = std::make_shared<Opm::cuistl::GPUObliviousMPISender<double, 1, Dune::OwnerOverlapCopyCommunication<int>>>(ownerOverlapCopy);
+    auto gpuComm = std::make_shared<Opm::gpuistl::GPUObliviousMPISender<double, 1, Dune::OwnerOverlapCopyCommunication<int>>>(ownerOverlapCopy);

-    auto cuOwnerOverlapCopy
-        = Opm::cuistl::CuOwnerOverlapCopy<double, 1, Dune::OwnerOverlapCopyCommunication<int>>(gpuComm);
+    auto GpuOwnerOverlapCopy
+        = Opm::gpuistl::GpuOwnerOverlapCopy<double, 1, Dune::OwnerOverlapCopyCommunication<int>>(gpuComm);

    double outputDune = -1.0;
    auto xDune = xGPU.asDuneBlockVector<1>();
    ownerOverlapCopy.dot(xDune, xDune, outputDune);

    double output = -1.0;
-    cuOwnerOverlapCopy.dot(xGPU, xGPU, output);
+    GpuOwnerOverlapCopy.dot(xGPU, xGPU, output);


    BOOST_CHECK_EQUAL(outputDune, output);
--- a/tests/gpuistl/test_GpuSeqILU0.cpp
+++ b/tests/gpuistl/test_GpuSeqILU0.cpp
@ -18,7 +18,7 @@
 */
 #include <config.h>

-#define BOOST_TEST_MODULE TestCuSeqILU0
+#define BOOST_TEST_MODULE TestGpuSeqILU0
 #define BOOST_TEST_NO_MAIN


@ -27,10 +27,10 @@
 #include <dune/common/parallel/mpihelper.hh>
 #include <dune/istl/bcrsmatrix.hh>
 #include <dune/istl/preconditioners.hh>
-#include <opm/simulators/linalg/cuistl/CuSeqILU0.hpp>
-#include <opm/simulators/linalg/cuistl/CuVector.hpp>
-#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuSeqILU0.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
+#include <opm/simulators/linalg/gpuistl/PreconditionerAdapter.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>

 #include <limits>
 #include <memory>
@ -63,7 +63,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(TestFiniteDifference1D, T, NumericTypes)
    using M = Dune::FieldMatrix<T, 1, 1>;
    using SpMatrix = Dune::BCRSMatrix<M>;
    using Vector = Dune::BlockVector<Dune::FieldVector<T, 1>>;
-    using CuILU0 = Opm::cuistl::CuSeqILU0<SpMatrix, Opm::cuistl::CuVector<T>, Opm::cuistl::CuVector<T>>;
+    using GpuILU0 = Opm::gpuistl::GpuSeqILU0<SpMatrix, Opm::gpuistl::GpuVector<T>, Opm::gpuistl::GpuVector<T>>;

    SpMatrix B(N, N, nonZeroes, SpMatrix::row_wise);
    for (auto row = B.createbegin(); row != B.createend(); ++row) {
@ -91,7 +91,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(TestFiniteDifference1D, T, NumericTypes)

    auto duneILU = Dune::SeqILU<SpMatrix, Vector, Vector>(B, 1.0);

-    auto cuILU = Opm::cuistl::PreconditionerAdapter<Vector, Vector, CuILU0>(std::make_shared<CuILU0>(B, 1.0));
+    auto gpuILU = Opm::gpuistl::PreconditionerAdapter<Vector, Vector, GpuILU0>(std::make_shared<GpuILU0>(B, 1.0));

    // check for the standard basis {e_i}
    // (e_i=(0,...,0, 1 (i-th place), 0, ..., 0))
@ -101,7 +101,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(TestFiniteDifference1D, T, NumericTypes)
        Vector outputVectorDune(N);
        Vector outputVectorCuistl(N);
        duneILU.apply(outputVectorDune, inputVector);
-        cuILU.apply(outputVectorCuistl, inputVector);
+        gpuILU.apply(outputVectorCuistl, inputVector);

        for (int component = 0; component < N; ++component) {
            BOOST_CHECK_CLOSE(outputVectorDune[component][0],
@ -113,7 +113,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(TestFiniteDifference1D, T, NumericTypes)
    // Now we check that we can update the matrix. We basically just negate B
    B *= -1.0;
    auto duneILUNew = Dune::SeqILU<SpMatrix, Vector, Vector>(B, 1.0);
-    cuILU.update();
+    gpuILU.update();
    // check for the standard basis {e_i}
    // (e_i=(0,...,0, 1 (i-th place), 0, ..., 0))
    for (int i = 0; i < N; ++i) {
@ -122,7 +122,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(TestFiniteDifference1D, T, NumericTypes)
        Vector outputVectorDune(N);
        Vector outputVectorCuistl(N);
        duneILUNew.apply(outputVectorDune, inputVector);
-        cuILU.apply(outputVectorCuistl, inputVector);
+        gpuILU.apply(outputVectorCuistl, inputVector);

        for (int component = 0; component < N; ++component) {
            BOOST_CHECK_CLOSE(outputVectorDune[component][0],
@ -158,7 +158,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(TestFiniteDifferenceBlock2, T, NumericTypes)
    using M = Dune::FieldMatrix<T, 2, 2>;
    using SpMatrix = Dune::BCRSMatrix<M>;
    using Vector = Dune::BlockVector<Dune::FieldVector<T, 2>>;
-    using CuILU0 = Opm::cuistl::CuSeqILU0<SpMatrix, Opm::cuistl::CuVector<T>, Opm::cuistl::CuVector<T>>;
+    using GpuILU0 = Opm::gpuistl::GpuSeqILU0<SpMatrix, Opm::gpuistl::GpuVector<T>, Opm::gpuistl::GpuVector<T>>;

    SpMatrix B(N, N, nonZeroes, SpMatrix::row_wise);
    for (auto row = B.createbegin(); row != B.createend(); ++row) {
@ -181,7 +181,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(TestFiniteDifferenceBlock2, T, NumericTypes)

    auto duneILU = Dune::SeqILU<SpMatrix, Vector, Vector>(B, 1.0);

-    auto cuILU = Opm::cuistl::PreconditionerAdapter<Vector, Vector, CuILU0>(std::make_shared<CuILU0>(B, 1.0));
+    auto gpuILU = Opm::gpuistl::PreconditionerAdapter<Vector, Vector, GpuILU0>(std::make_shared<GpuILU0>(B, 1.0));

    // check for the standard basis {e_i}
    // (e_i=(0,...,0, 1 (i-th place), 0, ..., 0))
@ -191,7 +191,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(TestFiniteDifferenceBlock2, T, NumericTypes)
        Vector outputVectorDune(N);
        Vector outputVectorCuistl(N);
        duneILU.apply(outputVectorDune, inputVector);
-        cuILU.apply(outputVectorCuistl, inputVector);
+        gpuILU.apply(outputVectorCuistl, inputVector);

        for (int component = 0; component < N; ++component) {
            BOOST_CHECK_CLOSE(outputVectorDune[component][0],
@ -203,7 +203,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(TestFiniteDifferenceBlock2, T, NumericTypes)
    // Now we check that we can update the matrix. We basically just negate B
    B *= -1.0;
    auto duneILUNew = Dune::SeqILU<SpMatrix, Vector, Vector>(B, 1.0);
-    cuILU.update();
+    gpuILU.update();
    // check for the standard basis {e_i}
    // (e_i=(0,...,0, 1 (i-th place), 0, ..., 0))
    for (int i = 0; i < N; ++i) {
@ -212,7 +212,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(TestFiniteDifferenceBlock2, T, NumericTypes)
        Vector outputVectorDune(N);
        Vector outputVectorCuistl(N);
        duneILUNew.apply(outputVectorDune, inputVector);
-        cuILU.apply(outputVectorCuistl, inputVector);
+        gpuILU.apply(outputVectorCuistl, inputVector);

        for (int component = 0; component < N; ++component) {
            BOOST_CHECK_CLOSE(outputVectorDune[component][0],
--- a/tests/gpuistl/test_GpuSparseMatrix.cpp
+++ b/tests/gpuistl/test_GpuSparseMatrix.cpp
@ -18,14 +18,14 @@
 */
 #include <config.h>

-#define BOOST_TEST_MODULE TestCuSparseMatrix
+#define BOOST_TEST_MODULE TestGpuSparseMatrix

 #include <boost/test/unit_test.hpp>
 #include <dune/istl/bcrsmatrix.hh>
 #include <memory>
-#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
-#include <opm/simulators/linalg/cuistl/CuVector.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
 #include <random>

 BOOST_AUTO_TEST_CASE(TestConstruction1D)
@ -76,17 +76,17 @@ BOOST_AUTO_TEST_CASE(TestConstruction1D)
        }
    }

-    auto cuSparseMatrix = Opm::cuistl::CuSparseMatrix<double>::fromMatrix(B);
+    auto gpuSparseMatrix = Opm::gpuistl::GpuSparseMatrix<double>::fromMatrix(B);

-    const auto& nonZeroValuesCuda = cuSparseMatrix.getNonZeroValues();
-    std::vector<double> buffer(cuSparseMatrix.nonzeroes(), 0.0);
+    const auto& nonZeroValuesCuda = gpuSparseMatrix.getNonZeroValues();
+    std::vector<double> buffer(gpuSparseMatrix.nonzeroes(), 0.0);
    nonZeroValuesCuda.copyToHost(buffer.data(), buffer.size());
    const double* nonZeroElements = static_cast<const double*>(&((B[0][0][0][0])));
    BOOST_CHECK_EQUAL_COLLECTIONS(buffer.begin(), buffer.end(), nonZeroElements, nonZeroElements + B.nonzeroes());
-    BOOST_CHECK_EQUAL(N * 3 - 2, cuSparseMatrix.nonzeroes());
+    BOOST_CHECK_EQUAL(N * 3 - 2, gpuSparseMatrix.nonzeroes());

    std::vector<int> rowIndicesFromCUDA(N + 1);
-    cuSparseMatrix.getRowIndices().copyToHost(rowIndicesFromCUDA.data(), rowIndicesFromCUDA.size());
+    gpuSparseMatrix.getRowIndices().copyToHost(rowIndicesFromCUDA.data(), rowIndicesFromCUDA.size());
    BOOST_CHECK_EQUAL(rowIndicesFromCUDA[0], 0);
    BOOST_CHECK_EQUAL(rowIndicesFromCUDA[1], 2);
    for (int i = 2; i < N; ++i) {
@ -95,7 +95,7 @@ BOOST_AUTO_TEST_CASE(TestConstruction1D)


    std::vector<int> columnIndicesFromCUDA(B.nonzeroes(), 0);
-    cuSparseMatrix.getColumnIndices().copyToHost(columnIndicesFromCUDA.data(), columnIndicesFromCUDA.size());
+    gpuSparseMatrix.getColumnIndices().copyToHost(columnIndicesFromCUDA.data(), columnIndicesFromCUDA.size());

    BOOST_CHECK_EQUAL(columnIndicesFromCUDA[0], 0);
    BOOST_CHECK_EQUAL(columnIndicesFromCUDA[1], 1);
@ -143,19 +143,19 @@ BOOST_AUTO_TEST_CASE(RandomSparsityMatrix)
        }
    }

-    auto cuSparseMatrix = Opm::cuistl::CuSparseMatrix<double>::fromMatrix(B);
+    auto gpuSparseMatrix = Opm::gpuistl::GpuSparseMatrix<double>::fromMatrix(B);
    // check each column
    for (size_t component = 0; component < N; ++component) {
        std::vector<double> inputDataX(N * dim, 0.0);
        inputDataX[component] = 1.0;
        std::vector<double> inputDataY(N * dim, .25);
-        auto inputVectorX = Opm::cuistl::CuVector<double>(inputDataX.data(), inputDataX.size());
-        auto inputVectorY = Opm::cuistl::CuVector<double>(inputDataY.data(), inputDataY.size());
+        auto inputVectorX = Opm::gpuistl::GpuVector<double>(inputDataX.data(), inputDataX.size());
+        auto inputVectorY = Opm::gpuistl::GpuVector<double>(inputDataY.data(), inputDataY.size());
        Vector xHost(N), yHost(N);
        yHost = inputDataY[0];
        inputVectorX.copyToHost(xHost);
        const double alpha = 1.42;
-        cuSparseMatrix.usmv(alpha, inputVectorX, inputVectorY);
+        gpuSparseMatrix.usmv(alpha, inputVectorX, inputVectorY);

        inputVectorY.copyToHost(inputDataY);

@ -167,7 +167,7 @@ BOOST_AUTO_TEST_CASE(RandomSparsityMatrix)
        }
        inputVectorX.copyToHost(xHost);

-        cuSparseMatrix.mv(inputVectorX, inputVectorY);
+        gpuSparseMatrix.mv(inputVectorX, inputVectorY);

        inputVectorY.copyToHost(inputDataY);

--- a/tests/gpuistl/test_GpuVector.cpp
+++ b/tests/gpuistl/test_GpuVector.cpp
@ -18,21 +18,21 @@
 */
 #include <config.h>

-#define BOOST_TEST_MODULE TestCuVector
+#define BOOST_TEST_MODULE TestGpuVector

 #include <boost/test/unit_test.hpp>
 #include <cuda_runtime.h>
 #include <dune/common/fvector.hh>
 #include <dune/istl/bvector.hh>
-#include <opm/simulators/linalg/cuistl/CuVector.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
 #include <random>

 BOOST_AUTO_TEST_CASE(TestDocumentedUsage)
 {
    auto someDataOnCPU = std::vector<double>({1.0, 2.0, 42.0, 59.9451743, 10.7132692});

-    auto dataOnGPU = ::Opm::cuistl::CuVector<double>(someDataOnCPU);
+    auto dataOnGPU = ::Opm::gpuistl::GpuVector<double>(someDataOnCPU);

    // Multiply by 4.0:
    dataOnGPU *= 4.0;
@ -50,14 +50,14 @@ BOOST_AUTO_TEST_CASE(TestDocumentedUsage)
 BOOST_AUTO_TEST_CASE(TestConstructionSize)
 {
    const int numberOfElements = 1234;
-    auto vectorOnGPU = Opm::cuistl::CuVector<double>(numberOfElements);
+    auto vectorOnGPU = Opm::gpuistl::GpuVector<double>(numberOfElements);
    BOOST_CHECK_EQUAL(numberOfElements, vectorOnGPU.dim());
 }

 BOOST_AUTO_TEST_CASE(TestCopyFromHostConstructor)
 {
    std::vector<double> data {{1, 2, 3, 4, 5, 6, 7}};
-    auto vectorOnGPU = Opm::cuistl::CuVector<double>(data.data(), data.size());
+    auto vectorOnGPU = Opm::gpuistl::GpuVector<double>(data.data(), data.size());
    BOOST_CHECK_EQUAL(data.size(), vectorOnGPU.dim());
    std::vector<double> buffer(data.size(), 0.0);
    vectorOnGPU.copyToHost(buffer.data(), buffer.size());
@ -68,7 +68,7 @@ BOOST_AUTO_TEST_CASE(TestCopyFromHostConstructor)
 BOOST_AUTO_TEST_CASE(TestCopyFromHostFunction)
 {
    std::vector<double> data {{1, 2, 3, 4, 5, 6, 7}};
-    auto vectorOnGPU = Opm::cuistl::CuVector<double>(data.size());
+    auto vectorOnGPU = Opm::gpuistl::GpuVector<double>(data.size());
    BOOST_CHECK_EQUAL(data.size(), vectorOnGPU.dim());
    vectorOnGPU.copyFromHost(data.data(), data.size());
    std::vector<double> buffer(data.size(), 0.0);
@ -80,7 +80,7 @@ BOOST_AUTO_TEST_CASE(TestCopyFromHostFunction)
 BOOST_AUTO_TEST_CASE(TestCopyFromBvector)
 {
    auto blockVector = Dune::BlockVector<Dune::FieldVector<double, 2>> {{{42, 43}, {44, 45}, {46, 47}}};
-    auto vectorOnGPU = Opm::cuistl::CuVector<double>(blockVector.dim());
+    auto vectorOnGPU = Opm::gpuistl::GpuVector<double>(blockVector.dim());
    vectorOnGPU.copyFromHost(blockVector);
    std::vector<double> buffer(vectorOnGPU.dim());
    vectorOnGPU.copyToHost(buffer.data(), buffer.size());
@ -93,7 +93,7 @@ BOOST_AUTO_TEST_CASE(TestCopyToBvector)
 {
    std::vector<double> data {{1, 2, 3, 4, 5, 6, 7, 8, 9}};
    auto blockVector = Dune::BlockVector<Dune::FieldVector<double, 3>>(3);
-    auto vectorOnGPU = Opm::cuistl::CuVector<double>(data.data(), data.size());
+    auto vectorOnGPU = Opm::gpuistl::GpuVector<double>(data.data(), data.size());
    vectorOnGPU.copyToHost(blockVector);


@ -103,17 +103,17 @@ BOOST_AUTO_TEST_CASE(TestCopyToBvector)
 BOOST_AUTO_TEST_CASE(TestDataPointer)
 {
    std::vector<double> data {{1, 2, 3, 4, 5, 6, 7, 8, 9}};
-    auto vectorOnGPU = Opm::cuistl::CuVector<double>(data.data(), data.size());
+    auto vectorOnGPU = Opm::gpuistl::GpuVector<double>(data.data(), data.size());

    std::vector<double> buffer(data.size(), 0.0);
-    OPM_CUDA_SAFE_CALL(cudaMemcpy(buffer.data(), vectorOnGPU.data(), sizeof(double) * data.size(), cudaMemcpyDeviceToHost));
+    OPM_GPU_SAFE_CALL(cudaMemcpy(buffer.data(), vectorOnGPU.data(), sizeof(double) * data.size(), cudaMemcpyDeviceToHost));
    BOOST_CHECK_EQUAL_COLLECTIONS(data.begin(), data.end(), buffer.begin(), buffer.end());
 }

 BOOST_AUTO_TEST_CASE(TestCopyScalarMultiply)
 {
    std::vector<double> data {{1, 2, 3, 4, 5, 6, 7}};
-    auto vectorOnGPU = Opm::cuistl::CuVector<double>(data.data(), data.size());
+    auto vectorOnGPU = Opm::gpuistl::GpuVector<double>(data.data(), data.size());
    BOOST_CHECK_EQUAL(data.size(), vectorOnGPU.dim());
    const double scalar = 42.25;
    vectorOnGPU *= scalar;
@ -128,7 +128,7 @@ BOOST_AUTO_TEST_CASE(TestCopyScalarMultiply)
 BOOST_AUTO_TEST_CASE(TestTwoNorm)
 {
    std::vector<double> data {{1, 2, 3, 4, 5, 6, 7}};
-    auto vectorOnGPU = Opm::cuistl::CuVector<double>(data.data(), data.size());
+    auto vectorOnGPU = Opm::gpuistl::GpuVector<double>(data.data(), data.size());
    auto twoNorm = vectorOnGPU.two_norm();

    double correctAnswer = 0.0;
@ -143,8 +143,8 @@ BOOST_AUTO_TEST_CASE(TestDot)
 {
    std::vector<double> dataA {{1, 2, 3, 4, 5, 6, 7}};
    std::vector<double> dataB {{8, 9, 10, 11, 12, 13, 14}};
-    auto vectorOnGPUA = Opm::cuistl::CuVector<double>(dataA.data(), dataA.size());
-    auto vectorOnGPUB = Opm::cuistl::CuVector<double>(dataB.data(), dataB.size());
+    auto vectorOnGPUA = Opm::gpuistl::GpuVector<double>(dataA.data(), dataA.size());
+    auto vectorOnGPUB = Opm::gpuistl::GpuVector<double>(dataB.data(), dataB.size());
    auto dot = vectorOnGPUA.dot(vectorOnGPUB);

    double correctAnswer = 0.0;
@ -158,7 +158,7 @@ BOOST_AUTO_TEST_CASE(TestDot)
 BOOST_AUTO_TEST_CASE(Assigment)
 {
    std::vector<double> data {{1, 2, 3, 4, 5, 6, 7}};
-    auto vectorOnGPU = Opm::cuistl::CuVector<double>(data.data(), data.size());
+    auto vectorOnGPU = Opm::gpuistl::GpuVector<double>(data.data(), data.size());
    vectorOnGPU = 10.0;
    vectorOnGPU.copyToHost(data.data(), data.size());

@ -171,9 +171,9 @@ BOOST_AUTO_TEST_CASE(Assigment)
 BOOST_AUTO_TEST_CASE(CopyAssignment)
 {
    std::vector<double> data {{1, 2, 3, 4, 5, 6, 7}};
-    auto vectorOnGPU = Opm::cuistl::CuVector<double>(data.data(), data.size());
+    auto vectorOnGPU = Opm::gpuistl::GpuVector<double>(data.data(), data.size());
    vectorOnGPU.copyToHost(data.data(), data.size());
-    auto vectorOnGPUB = Opm::cuistl::CuVector<double>(data.size());
+    auto vectorOnGPUB = Opm::gpuistl::GpuVector<double>(data.size());
    vectorOnGPUB = 4.0;
    vectorOnGPUB = vectorOnGPU;

@ -185,7 +185,7 @@ BOOST_AUTO_TEST_CASE(CopyAssignment)
 BOOST_AUTO_TEST_CASE(RandomVectors)
 {

-    using GVector = Opm::cuistl::CuVector<double>;
+    using GVector = Opm::gpuistl::GpuVector<double>;
    std::srand(0);
    std::mt19937 generator;
    std::uniform_real_distribution<double> distribution(-100.0, 100.0);
@ -268,7 +268,7 @@ BOOST_AUTO_TEST_CASE(RandomVectors)
                indexSet.push_back(i);
            }
        }
-        auto indexSetGPU = Opm::cuistl::CuVector<int>(indexSet);
+        auto indexSetGPU = Opm::gpuistl::GpuVector<int>(indexSet);

        aGPU.setZeroAtIndexSet(indexSetGPU);
        auto projectedA = aGPU.asStdVector();
--- a/tests/gpuistl/test_GpuView.cu
+++ b/tests/gpuistl/test_GpuView.cu
@ -18,24 +18,24 @@
 */
 #include <config.h>

-#define BOOST_TEST_MODULE TestCuView
+#define BOOST_TEST_MODULE TestGpuView

 #include <boost/test/unit_test.hpp>
 #include <cuda_runtime.h>
 #include <dune/common/fvector.hh>
 #include <dune/istl/bvector.hh>
-#include <opm/simulators/linalg/cuistl/CuView.hpp>
-#include <opm/simulators/linalg/cuistl/CuBuffer.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuView.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuBuffer.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
 #include <random>
 #include <array>
 #include <algorithm>
 #include <type_traits>

-using CuViewDouble = ::Opm::cuistl::CuView<double>;
-using CuBufferDouble = ::Opm::cuistl::CuBuffer<double>;
+using GpuViewDouble = ::Opm::gpuistl::GpuView<double>;
+using GpuBufferDouble = ::Opm::gpuistl::GpuBuffer<double>;

-__global__ void useCuViewOnGPU(CuViewDouble a, CuViewDouble b){
+__global__ void useGpuViewOnGPU(GpuViewDouble a, GpuViewDouble b){
    b[0] = a.front();
    b[1] = a.back();
    b[2] = *a.begin();
@ -48,24 +48,24 @@ BOOST_AUTO_TEST_CASE(TestCreationAndIndexing)
 {
    // A simple test to check that we can move data to and from the GPU
    auto cpubuffer = std::vector<double>({1.0, 2.0, 42.0, 59.9451743, 10.7132692});
-    auto cubuffer = CuBufferDouble(cpubuffer);
-    auto cuview = CuViewDouble(cubuffer.data(), cubuffer.size());
-    const auto const_cuview = CuViewDouble(cubuffer.data(), cubuffer.size());
+    auto cubuffer = GpuBufferDouble(cpubuffer);
+    auto gpuview = GpuViewDouble(cubuffer.data(), cubuffer.size());
+    const auto const_gpuview = GpuViewDouble(cubuffer.data(), cubuffer.size());

-    auto stdVecOfCuView = cuview.asStdVector();
-    auto const_stdVecOfCuView = cuview.asStdVector();
+    auto stdVecOfGpuView = gpuview.asStdVector();
+    auto const_stdVecOfGpuView = gpuview.asStdVector();

    BOOST_CHECK_EQUAL_COLLECTIONS(
-        stdVecOfCuView.begin(), stdVecOfCuView.end(), cpubuffer.begin(), cpubuffer.end());
+        stdVecOfGpuView.begin(), stdVecOfGpuView.end(), cpubuffer.begin(), cpubuffer.end());
    BOOST_CHECK_EQUAL_COLLECTIONS(
-        stdVecOfCuView.begin(), stdVecOfCuView.end(), const_stdVecOfCuView.begin(), const_stdVecOfCuView.end());
+        stdVecOfGpuView.begin(), stdVecOfGpuView.end(), const_stdVecOfGpuView.begin(), const_stdVecOfGpuView.end());
 }

-BOOST_AUTO_TEST_CASE(TestCuViewOnCPUTypes)
+BOOST_AUTO_TEST_CASE(TestGpuViewOnCPUTypes)
 {
    auto buf = std::vector<double>({1.0, 2.0, 42.0, 59.9451743, 10.7132692});
-    auto cpuview = CuViewDouble(buf.data(), buf.size());
-    const auto const_cpuview = CuViewDouble(buf.data(), buf.size());
+    auto cpuview = GpuViewDouble(buf.data(), buf.size());
+    const auto const_cpuview = GpuViewDouble(buf.data(), buf.size());

    // check that indexing a mutable view gives references when indexing it
    bool correct_type_of_cpu_front = std::is_same_v<double&, decltype(cpuview.front())>;
@ -83,26 +83,26 @@ BOOST_AUTO_TEST_CASE(TestCuViewOnCPUTypes)
    BOOST_CHECK(cpuview.back() == buf.back());
 }

-BOOST_AUTO_TEST_CASE(TestCuViewOnCPUWithSTLIteratorAlgorithm)
+BOOST_AUTO_TEST_CASE(TestGpuViewOnCPUWithSTLIteratorAlgorithm)
 {
    auto buf = std::vector<double>({1.0, 2.0, 42.0, 59.9451743, 10.7132692});
-    auto cpuview = CuViewDouble(buf.data(), buf.size());
+    auto cpuview = GpuViewDouble(buf.data(), buf.size());
    std::sort(buf.begin(), buf.end());
    BOOST_CHECK(42.0 == cpuview[3]);
 }

-BOOST_AUTO_TEST_CASE(TestCuViewOnGPU)
+BOOST_AUTO_TEST_CASE(TestGpuViewOnGPU)
 {
    auto buf = std::vector<double>({1.0, 2.0, 42.0, 59.9451743, 10.7132692});
-    auto cubufA = CuBufferDouble(buf);
-    auto cuviewA = CuViewDouble(cubufA.data(), cubufA.size());
-    auto cubufB = CuBufferDouble(4);
-    auto cuviewB = CuViewDouble(cubufB.data(), cubufB.size());
+    auto cubufA = GpuBufferDouble(buf);
+    auto gpuviewA = GpuViewDouble(cubufA.data(), cubufA.size());
+    auto cubufB = GpuBufferDouble(4);
+    auto gpuviewB = GpuViewDouble(cubufB.data(), cubufB.size());

-    useCuViewOnGPU<<<1,1>>>(cuviewA, cuviewB);
+    useGpuViewOnGPU<<<1,1>>>(gpuviewA, gpuviewB);

-    auto vecA = cuviewA.asStdVector();
-    auto vecB = cuviewB.asStdVector();
+    auto vecA = gpuviewA.asStdVector();
+    auto vecB = gpuviewB.asStdVector();

    // checks that front/back/begin/end works
    BOOST_CHECK(vecB[0] == buf[0]);
--- a/tests/gpuistl/test_converttofloatadapter.cpp
+++ b/tests/gpuistl/test_converttofloatadapter.cpp
@ -29,7 +29,7 @@
 #include <dune/istl/preconditioners.hh>
 #include <limits>
 #include <memory>
-#include <opm/simulators/linalg/cuistl/PreconditionerConvertFieldTypeAdapter.hpp>
+#include <opm/simulators/linalg/gpuistl/PreconditionerConvertFieldTypeAdapter.hpp>


 using XDouble = Dune::BlockVector<Dune::FieldVector<double, 2>>;
@ -167,7 +167,7 @@ BOOST_AUTO_TEST_CASE(TestFiniteDifference1D)
        expectedOutputVector[i][1] = 43.0;
        inputVector[i][0] = 1.0;
        auto converter
-            = Opm::cuistl::PreconditionerConvertFieldTypeAdapter<TestPreconditioner, SpMatrixDouble, XDouble, XDouble>(
+            = Opm::gpuistl::PreconditionerConvertFieldTypeAdapter<TestPreconditioner, SpMatrixDouble, XDouble, XDouble>(
                B);
        auto underlyingPreconditioner = std::make_shared<TestPreconditioner>(
            converter.getConvertedMatrix(), inputVector, B, expectedOutputVector);
--- a/tests/gpuistl/test_cuSparse_matrix_operations.cpp
+++ b/tests/gpuistl/test_cuSparse_matrix_operations.cpp
@ -18,17 +18,17 @@
 */
 #include <config.h>

-#define BOOST_TEST_MODULE TestCuSparseMatrixOperations
+#define BOOST_TEST_MODULE TestGpuSparseMatrixOperations
 #include <boost/mpl/list.hpp>
 #include <boost/test/unit_test.hpp>
 #include <cuda_runtime.h>
 #include <dune/istl/bcrsmatrix.hh>
-#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
-#include <opm/simulators/linalg/cuistl/CuVector.hpp>
-#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
-#include <opm/simulators/linalg/cuistl/detail/fix_zero_diagonal.hpp>
-#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/JacKernels.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
+#include <opm/simulators/linalg/gpuistl/PreconditionerAdapter.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/fix_zero_diagonal.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/preconditionerKernels/JacKernels.hpp>

 using NumericTypes = boost::mpl::list<double, float>;

@ -85,10 +85,10 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith3By3Blocks, T, Numeric
    B[1][1][1][1] = -1.0;
    B[1][1][2][2] = -1.0;

-    Opm::cuistl::CuSparseMatrix<T> m = Opm::cuistl::CuSparseMatrix<T>::fromMatrix(B);
-    Opm::cuistl::CuVector<T> dInvDiag(blocksize * blocksize * N);
+    Opm::gpuistl::GpuSparseMatrix<T> m = Opm::gpuistl::GpuSparseMatrix<T>::fromMatrix(B);
+    Opm::gpuistl::GpuVector<T> dInvDiag(blocksize * blocksize * N);

-    Opm::cuistl::detail::JAC::invertDiagonalAndFlatten<T, 3>(
+    Opm::gpuistl::detail::JAC::invertDiagonalAndFlatten<T, 3>(
        m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, dInvDiag.data());

    std::vector<T> expectedInvDiag {-1.0 / 4.0,
@ -159,10 +159,10 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith2By2Blocks, T, Numeric
    B[1][1][0][0] = -1.0;
    B[1][1][1][1] = -1.0;

-    Opm::cuistl::CuSparseMatrix<T> m = Opm::cuistl::CuSparseMatrix<T>::fromMatrix(B);
-    Opm::cuistl::CuVector<T> dInvDiag(blocksize * blocksize * N);
+    Opm::gpuistl::GpuSparseMatrix<T> m = Opm::gpuistl::GpuSparseMatrix<T>::fromMatrix(B);
+    Opm::gpuistl::GpuVector<T> dInvDiag(blocksize * blocksize * N);

-    Opm::cuistl::detail::JAC::invertDiagonalAndFlatten<T, 2>(
+    Opm::gpuistl::detail::JAC::invertDiagonalAndFlatten<T, 2>(
        m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, dInvDiag.data());

    std::vector<T> expectedInvDiag {2.0, -2.0, -1.0 / 2.0, 1.0, -1.0, 0.0, 0.0, -1.0};
--- a/tests/gpuistl/test_cuVector_operations.cpp
+++ b/tests/gpuistl/test_cuVector_operations.cpp
@ -18,16 +18,16 @@
 */
 #include <config.h>

-#define BOOST_TEST_MODULE TestCuVectorOperations
+#define BOOST_TEST_MODULE TestGpuVectorOperations
 #include <boost/mpl/list.hpp>
 #include <boost/test/unit_test.hpp>
 #include <cuda_runtime.h>
 #include <dune/istl/bcrsmatrix.hh>
-#include <opm/simulators/linalg/cuistl/CuJac.hpp>
-#include <opm/simulators/linalg/cuistl/CuVector.hpp>
-#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
-#include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuJac.hpp>
+#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
+#include <opm/simulators/linalg/gpuistl/PreconditionerAdapter.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpusparse_matrix_operations.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/vector_operations.hpp>

 using NumericTypes = boost::mpl::list<double, float>;

@ -47,11 +47,11 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(ElementWiseMultiplicationOf3By3BlockVectorAndVecto
    std::vector<T> hostBlockVector({1.0, 2.0, 3.0, 5.0, 2.0, 3.0, 2.0, 1.0, 2.0});
    std::vector<T> hostVecVector({3.0, 2.0, 1.0});
    std::vector<T> hostDstVector({0, 0, 0});
-    Opm::cuistl::CuVector<T> deviceBlockVector(hostBlockVector);
-    Opm::cuistl::CuVector<T> deviceVecVector(hostVecVector);
-    Opm::cuistl::CuVector<T> deviceDstVector(hostDstVector);
+    Opm::gpuistl::GpuVector<T> deviceBlockVector(hostBlockVector);
+    Opm::gpuistl::GpuVector<T> deviceVecVector(hostVecVector);
+    Opm::gpuistl::GpuVector<T> deviceDstVector(hostDstVector);

-    Opm::cuistl::detail::weightedDiagMV(
+    Opm::gpuistl::detail::weightedDiagMV(
        deviceBlockVector.data(), N, blocksize, weight, deviceVecVector.data(), deviceDstVector.data());

    std::vector<T> expectedVec {10.0, 22.0, 10.0};
@ -81,11 +81,11 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(ElementWiseMultiplicationOf2By2BlockVectorAndVecto
    std::vector<T> hostBlockVector({1.0, 2.0, 3.0, 4.0, 4.0, 3.0, 2.0, 1.0});
    std::vector<T> hostVecVector({1.0, 3.0, 2.0, 4.0});
    std::vector<T> hostDstVector({0, 0, 0, 0});
-    Opm::cuistl::CuVector<T> deviceBlockVector(hostBlockVector);
-    Opm::cuistl::CuVector<T> deviceVecVector(hostVecVector);
-    Opm::cuistl::CuVector<T> deviceDstVector(hostDstVector);
+    Opm::gpuistl::GpuVector<T> deviceBlockVector(hostBlockVector);
+    Opm::gpuistl::GpuVector<T> deviceVecVector(hostVecVector);
+    Opm::gpuistl::GpuVector<T> deviceDstVector(hostDstVector);

-    Opm::cuistl::detail::weightedDiagMV(
+    Opm::gpuistl::detail::weightedDiagMV(
        deviceBlockVector.data(), N, blocksize, weight, deviceVecVector.data(), deviceDstVector.data());

    std::vector<T> expectedVec {3.5, 7.5, 10.0, 4.0};
@ -95,4 +95,4 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(ElementWiseMultiplicationOf2By2BlockVectorAndVecto
    for (size_t i = 0; i < expectedVec.size(); i++) {
        BOOST_CHECK_CLOSE(expectedVec[i], computedVec[i], 1e-7);
    }
-}
+}
--- a/tests/gpuistl/test_cublas_handle.cpp
+++ b/tests/gpuistl/test_cublas_handle.cpp
@ -16,14 +16,14 @@
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
-#include "opm/simulators/linalg/cuistl/detail/cublas_safe_call.hpp"
+#include "opm/simulators/linalg/gpuistl/detail/cublas_safe_call.hpp"
 #include <config.h>

 #define BOOST_TEST_MODULE TestCublasHandle

 #include <cuda_runtime.h>
 #include <boost/test/unit_test.hpp>
-#include <opm/simulators/linalg/cuistl/detail/CuBlasHandle.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/CuBlasHandle.hpp>

 BOOST_AUTO_TEST_CASE(TestGetCublasVersion)
 {
@ -32,7 +32,7 @@ BOOST_AUTO_TEST_CASE(TestGetCublasVersion)
    // that checks the version of blas programatically. Let the test pass for now.
    BOOST_CHECK(true);
 #else
-    auto& cublasHandle = ::Opm::cuistl::detail::CuBlasHandle::getInstance();
+    auto& cublasHandle = ::Opm::gpuistl::detail::CuBlasHandle::getInstance();
    int cuBlasVersion = -1;
    OPM_CUBLAS_SAFE_CALL(cublasGetVersion(cublasHandle.get(), &cuBlasVersion));

--- a/tests/gpuistl/test_cublas_safe_call.cpp
+++ b/tests/gpuistl/test_cublas_safe_call.cpp
@ -22,7 +22,7 @@

 #include <boost/test/unit_test.hpp>
 #include <cublas_v2.h>
-#include <opm/simulators/linalg/cuistl/detail/cublas_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/cublas_safe_call.hpp>

 BOOST_AUTO_TEST_CASE(TestCreateHandle)
 {
--- a/tests/gpuistl/test_cuda_check_last_error.cpp
+++ b/tests/gpuistl/test_cuda_check_last_error.cpp
@ -21,7 +21,7 @@
 #define BOOST_TEST_MODULE TestCudaCheckLastError

 #include <boost/test/unit_test.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cuda_check_last_error.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/cuda_check_last_error.hpp>


 BOOST_AUTO_TEST_CASE(TestNoThrowLastError)
--- a/tests/gpuistl/test_cusparse_handle.cpp
+++ b/tests/gpuistl/test_cusparse_handle.cpp
@ -21,12 +21,12 @@
 #define BOOST_TEST_MODULE TestSparseHandle

 #include <boost/test/unit_test.hpp>
-#include <opm/simulators/linalg/cuistl/detail/CuSparseHandle.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/CuSparseHandle.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/cusparse_safe_call.hpp>

 BOOST_AUTO_TEST_CASE(TestGetSparseVersion)
 {
-    auto& cuSparseHandle = ::Opm::cuistl::detail::CuSparseHandle::getInstance();
+    auto& cuSparseHandle = ::Opm::gpuistl::detail::CuSparseHandle::getInstance();
    int cuSparseVersion = -1;
    OPM_CUSPARSE_SAFE_CALL(cusparseGetVersion(cuSparseHandle.get(), &cuSparseVersion));
    BOOST_CHECK_LT(0, cuSparseVersion);
--- a/tests/gpuistl/test_cusparse_safe_call.cpp
+++ b/tests/gpuistl/test_cusparse_safe_call.cpp
@ -22,7 +22,7 @@

 #include <boost/test/unit_test.hpp>
 #include <cusparse.h>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/cusparse_safe_call.hpp>

 BOOST_AUTO_TEST_CASE(TestCreateHandle)
 {
--- a/tests/gpuistl/test_gpu_safe_call.cpp
+++ b/tests/gpuistl/test_gpu_safe_call.cpp
@ -18,15 +18,15 @@
 */
 #include <config.h>

-#define BOOST_TEST_MODULE TestCudaSafeCall
+#define BOOST_TEST_MODULE TestGpuSafeCall
 #include <boost/test/unit_test.hpp>
 #include <cuda_runtime.h>
-#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>

-BOOST_AUTO_TEST_CASE(TestCudaMalloc)
+BOOST_AUTO_TEST_CASE(TestGpuMalloc)
 {
    void* pointer;
-    BOOST_CHECK_NO_THROW(OPM_CUDA_SAFE_CALL(cudaMalloc(&pointer, 1)););
+    BOOST_CHECK_NO_THROW(OPM_GPU_SAFE_CALL(cudaMalloc(&pointer, 1)););
 }


@ -41,6 +41,6 @@ BOOST_AUTO_TEST_CASE(TestThrows)
     errorCodes = {{cudaErrorAddressOfConstant, cudaErrorAlreadyAcquired}};
 #endif
    for (auto code : errorCodes) {
-        BOOST_CHECK_THROW(OPM_CUDA_SAFE_CALL(code), std::exception);
+        BOOST_CHECK_THROW(OPM_GPU_SAFE_CALL(code), std::exception);
    }
 }
--- a/tests/gpuistl/test_safe_conversion.cpp
+++ b/tests/gpuistl/test_safe_conversion.cpp
@ -21,11 +21,11 @@
 #define BOOST_TEST_MODULE TestSafeConversion

 #include <boost/test/unit_test.hpp>
-#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/safe_conversion.hpp>

 BOOST_AUTO_TEST_CASE(TestToIntThrowsOutofRange)
 {
-    BOOST_CHECK_THROW(Opm::cuistl::detail::to_int(size_t(std::numeric_limits<int>::max()) + size_t(1));
+    BOOST_CHECK_THROW(Opm::gpuistl::detail::to_int(size_t(std::numeric_limits<int>::max()) + size_t(1));
                      , std::invalid_argument);
 }

@ -33,26 +33,26 @@ BOOST_AUTO_TEST_CASE(TestToIntConvertInRange)
 {
    // This might seem slow, but it is really fast:
    for (size_t i = 0; i <= size_t(1024 * 1024); ++i) {
-        BOOST_CHECK_EQUAL(int(i), Opm::cuistl::detail::to_int(i));
+        BOOST_CHECK_EQUAL(int(i), Opm::gpuistl::detail::to_int(i));
    }

    BOOST_CHECK_EQUAL(std::numeric_limits<int>::max(),
-                      Opm::cuistl::detail::to_int(size_t(std::numeric_limits<int>::max())));
+                      Opm::gpuistl::detail::to_int(size_t(std::numeric_limits<int>::max())));
 }


 BOOST_AUTO_TEST_CASE(TestToSizeTThrowsOutofRange)
 {
-    BOOST_CHECK_THROW(Opm::cuistl::detail::to_size_t(-1);, std::invalid_argument);
+    BOOST_CHECK_THROW(Opm::gpuistl::detail::to_size_t(-1);, std::invalid_argument);
 }

 BOOST_AUTO_TEST_CASE(TestToSizeTConvertInRange)
 {
    // This might seem slow, but it is really fast:
    for (int i = 0; i <= 1024 * 1024; ++i) {
-        BOOST_CHECK_EQUAL(size_t(i), Opm::cuistl::detail::to_size_t(i));
+        BOOST_CHECK_EQUAL(size_t(i), Opm::gpuistl::detail::to_size_t(i));
    }

    BOOST_CHECK_EQUAL(size_t(std::numeric_limits<int>::max()),
-                      Opm::cuistl::detail::to_size_t(std::numeric_limits<int>::max()));
+                      Opm::gpuistl::detail::to_size_t(std::numeric_limits<int>::max()));
 }
--- a/tests/gpuistl/test_solver_adapter.cpp
+++ b/tests/gpuistl/test_solver_adapter.cpp
@ -24,14 +24,14 @@
 #include <dune/istl/solvers.hh>
 #include <opm/simulators/linalg/PreconditionerFactory.hpp>
 #include <opm/simulators/linalg/PropertyTree.hpp>
-#include <opm/simulators/linalg/cuistl/SolverAdapter.hpp>
+#include <opm/simulators/linalg/gpuistl/SolverAdapter.hpp>

 static const constexpr int dim = 3;
 using Matrix = Dune::BCRSMatrix<Dune::FieldMatrix<double, dim, dim>>;
 using Vector = Dune::BlockVector<Dune::FieldVector<double, dim>>;
 using Moperator = Dune::MatrixAdapter<Matrix, Vector, Vector>;
 using PrecondFactory = Opm::PreconditionerFactory<Moperator, Dune::Amg::SequentialInformation>;
-using SolverAdapter = Opm::cuistl::SolverAdapter<Moperator, Dune::BiCGSTABSolver, Vector>;
+using SolverAdapter = Opm::gpuistl::SolverAdapter<Moperator, Dune::BiCGSTABSolver, Vector>;

 namespace
 {