diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake
index 24436dad2..2b531c1ae 100644
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@@ -147,6 +147,9 @@ if(CUDA_FOUND)
# CUISTL SOURCE
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/detail/CuBlasHandle.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/detail/CuSparseHandle.cpp)
+ list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuVector.cpp)
+ list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/detail/vector_operations.cu)
+
# CUISTL HEADERS
list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp)
@@ -155,6 +158,7 @@ if(CUDA_FOUND)
list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/cuda_check_last_error.hpp)
list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/CuBlasHandle.hpp)
list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/CuSparseHandle.hpp)
+ list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/CuVector.hpp)
endif()
if(OPENCL_FOUND)
@@ -239,8 +243,7 @@ if(CUDA_FOUND)
list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cuda_check_last_error.cpp)
list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cublas_handle.cpp)
list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cusparse_handle.cpp)
-
-
+ list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cuvector.cpp)
endif()
if(OPENCL_FOUND)
list(APPEND TEST_SOURCE_FILES tests/test_openclSolver.cpp)
diff --git a/opm/simulators/linalg/cuistl/CuVector.cpp b/opm/simulators/linalg/cuistl/CuVector.cpp
new file mode 100644
index 000000000..609ab2f38
--- /dev/null
+++ b/opm/simulators/linalg/cuistl/CuVector.cpp
@@ -0,0 +1,268 @@
+/*
+ Copyright 2022-2023 SINTEF AS
+
+ This file is part of the Open Porous Media project (OPM).
+
+ OPM is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ OPM is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with OPM. If not, see .
+*/
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define CHECKSIZE(x) \
+ if (x.m_numberOfElements != m_numberOfElements) { \
+ OPM_THROW(std::invalid_argument, \
+ fmt::format("Given vector has {}, while we have {}.", x.m_numberOfElements, m_numberOfElements)); \
+ }
+#define CHECKPOSITIVESIZE \
+ if (m_numberOfElements <= 0) { \
+ OPM_THROW(std::invalid_argument, "We have 0 elements"); \
+ }
+
+namespace Opm::cuistl
+{
+
+template
+CuVector::CuVector(const std::vector& data)
+ : CuVector(data.data(), data.size())
+{
+}
+
+template
+CuVector::CuVector(const int numberOfElements)
+ : m_numberOfElements(numberOfElements)
+ , m_cuBlasHandle(detail::CuBlasHandle::getInstance())
+{
+ OPM_CUDA_SAFE_CALL(cudaMalloc(&m_dataOnDevice, sizeof(T) * m_numberOfElements));
+}
+
+template
+CuVector::CuVector(const T* dataOnHost, const int numberOfElements)
+ : CuVector(numberOfElements)
+{
+
+ OPM_CUDA_SAFE_CALL(cudaMemcpy(m_dataOnDevice, dataOnHost, m_numberOfElements * sizeof(T), cudaMemcpyHostToDevice));
+}
+
+template
+CuVector&
+CuVector::operator=(T scalar)
+{
+ CHECKPOSITIVESIZE
+ detail::setVectorValue(data(), m_numberOfElements, scalar);
+ return *this;
+}
+
+template
+CuVector&
+CuVector::operator=(const CuVector& other)
+{
+ CHECKPOSITIVESIZE
+ CHECKSIZE(other)
+ if (other.m_numberOfElements != m_numberOfElements) {
+ OPM_THROW(std::invalid_argument, "Can only copy from vector of same size.");
+ }
+ OPM_CUDA_SAFE_CALL(
+ cudaMemcpy(m_dataOnDevice, other.m_dataOnDevice, m_numberOfElements * sizeof(T), cudaMemcpyDeviceToDevice));
+ return *this;
+}
+
+template
+CuVector::CuVector(const CuVector& other)
+ : CuVector(other.m_numberOfElements)
+{
+ CHECKPOSITIVESIZE
+ CHECKSIZE(other)
+ OPM_CUDA_SAFE_CALL(
+ cudaMemcpy(m_dataOnDevice, other.m_dataOnDevice, m_numberOfElements * sizeof(T), cudaMemcpyDeviceToDevice));
+}
+
+template
+CuVector::~CuVector()
+{
+ OPM_CUDA_SAFE_CALL(cudaFree(m_dataOnDevice));
+}
+
+template
+const T*
+CuVector::data() const
+{
+ return m_dataOnDevice;
+}
+
+template
+typename CuVector::size_type
+CuVector::dim() const
+{
+ return m_numberOfElements;
+}
+
+template
+std::vector
+CuVector::asStdVector() const
+{
+ std::vector temporary(m_numberOfElements);
+ copyToHost(temporary);
+ return temporary;
+}
+
+template
+void
+CuVector::setZeroAtIndexSet(const CuVector& indexSet)
+{
+ detail::setZeroAtIndexSet(m_dataOnDevice, indexSet.dim(), indexSet.data());
+}
+
+template
+T*
+CuVector::data()
+{
+ return m_dataOnDevice;
+}
+
+template
+CuVector&
+CuVector::operator*=(const T& scalar)
+{
+ CHECKPOSITIVESIZE
+ OPM_CUBLAS_SAFE_CALL(detail::cublasScal(m_cuBlasHandle.get(), m_numberOfElements, &scalar, data(), 1));
+ return *this;
+}
+
+template
+CuVector&
+CuVector::axpy(T alpha, const CuVector& y)
+{
+ CHECKPOSITIVESIZE
+ CHECKSIZE(y)
+ OPM_CUBLAS_SAFE_CALL(detail::cublasAxpy(m_cuBlasHandle.get(), m_numberOfElements, &alpha, y.data(), 1, data(), 1));
+ return *this;
+}
+
+template
+T
+CuVector::dot(const CuVector& other) const
+{
+ CHECKPOSITIVESIZE
+ CHECKSIZE(other)
+ T result = T(0);
+ OPM_CUBLAS_SAFE_CALL(
+ detail::cublasDot(m_cuBlasHandle.get(), m_numberOfElements, data(), 1, other.data(), 1, &result));
+ return result;
+}
+template
+T
+CuVector::two_norm() const
+{
+ CHECKPOSITIVESIZE
+ T result = T(0);
+ OPM_CUBLAS_SAFE_CALL(detail::cublasNrm2(m_cuBlasHandle.get(), m_numberOfElements, data(), 1, &result));
+ return result;
+}
+
+template
+T
+CuVector::dot(const CuVector& other, const CuVector& indexSet, CuVector& buffer) const
+{
+ return detail::innerProductAtIndices(m_dataOnDevice, other.data(), buffer.data(), indexSet.dim(), indexSet.data());
+}
+
+template
+T
+CuVector::two_norm(const CuVector& indexSet, CuVector& buffer) const
+{
+ // TODO: [perf] Optimize this to a single call
+ return std::sqrt(this->dot(*this, indexSet, buffer));
+}
+
+template
+T
+CuVector::dot(const CuVector& other, const CuVector& indexSet) const
+{
+ CuVector buffer(indexSet.dim());
+ return detail::innerProductAtIndices(m_dataOnDevice, other.data(), buffer.data(), indexSet.dim(), indexSet.data());
+}
+
+template
+T
+CuVector::two_norm(const CuVector& indexSet) const
+{
+ CuVector buffer(indexSet.dim());
+ // TODO: [perf] Optimize this to a single call
+ return std::sqrt(this->dot(*this, indexSet, buffer));
+}
+template
+CuVector&
+CuVector::operator+=(const CuVector& other)
+{
+ CHECKPOSITIVESIZE
+ CHECKSIZE(other)
+ // TODO: [perf] Make a specialized version of this
+ return axpy(1.0, other);
+}
+
+template
+CuVector&
+CuVector::operator-=(const CuVector& other)
+{
+ CHECKPOSITIVESIZE
+ CHECKSIZE(other)
+ // TODO: [perf] Make a specialized version of this
+ return axpy(-1.0, other);
+}
+
+
+template
+void
+CuVector::copyFromHost(const T* dataPointer, int numberOfElements)
+{
+ if (numberOfElements > dim()) {
+ OPM_THROW(std::runtime_error,
+ fmt::format("Requesting to copy too many elements. Vector has {} elements, while {} was requested.",
+ dim(),
+ numberOfElements));
+ }
+ OPM_CUDA_SAFE_CALL(cudaMemcpy(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice));
+}
+
+template
+void
+CuVector::copyToHost(T* dataPointer, int numberOfElements) const
+{
+ OPM_CUDA_SAFE_CALL(cudaMemcpy(dataPointer, data(), numberOfElements * sizeof(T), cudaMemcpyDeviceToHost));
+}
+
+template
+void
+CuVector::copyFromHost(const std::vector& data)
+{
+ copyFromHost(data.data(), data.size());
+}
+template
+void
+CuVector::copyToHost(std::vector& data) const
+{
+ copyToHost(data.data(), data.size());
+}
+template class CuVector;
+template class CuVector;
+template class CuVector;
+
+} // namespace Opm::cuistl
diff --git a/opm/simulators/linalg/cuistl/CuVector.hpp b/opm/simulators/linalg/cuistl/CuVector.hpp
new file mode 100644
index 000000000..d7a3e7a60
--- /dev/null
+++ b/opm/simulators/linalg/cuistl/CuVector.hpp
@@ -0,0 +1,335 @@
+/*
+ Copyright 2022-2023 SINTEF AS
+
+ This file is part of the Open Porous Media project (OPM).
+
+ OPM is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ OPM is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with OPM. If not, see .
+*/
+#ifndef OPM_CUVECTOR_HEADER_HPP
+#define OPM_CUVECTOR_HEADER_HPP
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace Opm::cuistl
+{
+
+/**
+ * @brief The CuVector class is a simple (arithmetic) vector class for the GPU.
+ *
+ * @note we currently only support simple raw primitives for T (double and float)
+ *
+ * @note this vector has no notion of block size. The user is responsible for allocating
+ * the correct number of primitives (double or floats)
+ * Example usage:
+ *
+ * @code{.cpp}
+ * #include
+ *
+ * void someFunction() {
+ * auto someDataOnCPU = std::vector({1.0, 2.0, 42.0, 59.9451743, 10.7132692});
+ *
+ * auto dataOnGPU = CuVector(someDataOnCPU);
+ *
+ * // Multiply by 4.0:
+ * dataOnGPU *= 4.0;
+ *
+ * // Get data back on CPU in another vector:
+ * auto stdVectorOnCPU = dataOnGPU.asStdVector();
+ * }
+ *
+ * @tparam T the type to store. Can be either float, double or int.
+ */
+template
+class CuVector
+{
+public:
+ using field_type = T;
+ using size_type = size_t;
+
+
+ /**
+ * @brief CuVector allocates new GPU memory of the same size as other and copies the content of the other vector to
+ * this newly allocated memory.
+ *
+ * @note This does synchronous transfer.
+ *
+ * @param other the vector to copy from
+ */
+ CuVector(const CuVector& other);
+
+ /**
+ * @brief CuVector allocates new GPU memory of the same size as data and copies the content of the data vector to
+ * this newly allocated memory.
+ *
+ * @note This does CPU to GPU transfer.
+ * @note This does synchronous transfer.
+ *
+ * @param data the vector to copy from
+ */
+ explicit CuVector(const std::vector& data);
+
+ /**
+ * @brief operator= copies the content of the data vector to the memory of this vector.
+ *
+ * @note This requires the two vectors to be of the same size.
+ * @note This does synchronous transfer.
+ *
+ * @param other the vector to copy from
+ */
+ CuVector& operator=(const CuVector& other);
+
+ /**
+ * @brief operator= sets the whole vector equal to the scalar value.
+ *
+ * @note This does asynchronous operations
+ *
+ * @param scalar the value all elements will be set to.
+ */
+ CuVector& operator=(T scalar);
+
+ /**
+ * @brief CuVector allocates new GPU memory of size numberOfElements * sizeof(T)
+ *
+ * @param numberOfElements number of T elements to allocate
+ */
+ explicit CuVector(const int numberOfElements);
+
+
+ /**
+ * @brief CuVector allocates new GPU memory of size numberOfElements * sizeof(T) and copies numberOfElements from
+ * data
+ *
+ * @note This assumes the data is on the CPU.
+ *
+ * @param numberOfElements number of T elements to allocate
+ * @param dataOnHost data on host/CPU
+ */
+ CuVector(const T* dataOnHost, const int numberOfElements);
+
+ /**
+ * @brief ~CuVector calls cudaFree
+ */
+ virtual ~CuVector();
+
+ /**
+ * @return the raw pointer to the GPU data
+ */
+ T* data();
+
+ /**
+ * @return the raw pointer to the GPU data
+ */
+ const T* data() const;
+
+ /**
+ * @brief copyFromHost copies data from a Dune::BlockVector
+ * @param vector the vector to copy from
+ *
+ * @note This does synchronous transfer.
+ * @note This assumes that the size of this vector is equal to the dim of the input vector.
+ */
+ template
+ void copyFromHost(const Dune::BlockVector>& vector)
+ {
+ if (m_numberOfElements != vector.dim()) {
+ OPM_THROW(std::runtime_error,
+ fmt::format("Given incompatible vector size. CuVector has size {}, \n"
+ "however, BlockVector has N() = {}, and dim = {}.",
+ m_numberOfElements,
+ vector.N(),
+ vector.dim()));
+ }
+ const auto dataPointer = static_cast(&(vector[0][0]));
+ copyFromHost(dataPointer, m_numberOfElements);
+ }
+
+ /**
+ * @brief copyToHost copies data to a Dune::BlockVector
+ * @param vector the vector to copy to
+ *
+ * @note This does synchronous transfer.
+ * @note This assumes that the size of this vector is equal to the dim of the input vector.
+ */
+ template
+ void copyToHost(Dune::BlockVector>& vector) const
+ {
+ if (m_numberOfElements != vector.dim()) {
+ OPM_THROW(std::runtime_error,
+ fmt::format("Given incompatible vector size. CuVector has size {},\n however, the BlockVector "
+ "has has N() = {}, and dim() = {}.",
+ m_numberOfElements,
+ vector.N(),
+ vector.dim()));
+ }
+ const auto dataPointer = static_cast(&(vector[0][0]));
+ copyToHost(dataPointer, m_numberOfElements);
+ }
+
+ /**
+ * @brief copyFromHost copies numberOfElements from the CPU memory dataPointer
+ * @param dataPointer raw pointer to CPU memory
+ * @param numberOfElements number of elements to copy
+ * @note This does synchronous transfer.
+ * @note assumes that this vector has numberOfElements elements
+ */
+ void copyFromHost(const T* dataPointer, int numberOfElements);
+
+ /**
+ * @brief copyFromHost copies numberOfElements to the CPU memory dataPointer
+ * @param dataPointer raw pointer to CPU memory
+ * @param numberOfElements number of elements to copy
+ * @note This does synchronous transfer.
+ * @note assumes that this vector has numberOfElements elements
+ */
+ void copyToHost(T* dataPointer, int numberOfElements) const;
+
+ /**
+ * @brief copyToHost copies data from an std::vector
+ * @param data the vector to copy from
+ *
+ * @note This does synchronous transfer.
+ * @note This assumes that the size of this vector is equal to the size of the input vector.
+ */
+ void copyFromHost(const std::vector& data);
+
+ /**
+ * @brief copyToHost copies data to an std::vector
+ * @param data the vector to copy to
+ *
+ * @note This does synchronous transfer.
+ * @note This assumes that the size of this vector is equal to the size of the input vector.
+ */
+ void copyToHost(std::vector& data) const;
+
+ /**
+ * @brief operator *= multiplies every element by scalar
+ * @param scalar the scalar to with which to multiply every element
+ *
+ * @note This operation is asynchronous.
+ */
+ CuVector& operator*=(const T& scalar);
+
+ /**
+ * @brief axpy sets this vector equal to this + alha * y
+ * @param alpha the scalar with which to multiply y
+ * @param y input vector of same size as this
+ *
+ * @note this will call CuBlas in the background
+ * @note int is not supported
+ */
+ CuVector& axpy(T alpha, const CuVector& y);
+
+ /**
+ * @brief operator+= adds the other vector to this vector
+ *
+ * @note this will call CuBlas in the background
+ * @note int is not supported
+ */
+ CuVector& operator+=(const CuVector& other);
+
+ /**
+ * @brief operator-= subtracts the other vector from this vector
+ *
+ * @note this will call CuBlas in the background
+ * @note int is not supported
+ */
+ CuVector& operator-=(const CuVector& other);
+
+ /**
+ * @brief dot computes the dot product (standard inner product) against the other vector
+ * @param other vector of same size as this
+ * @note this will call CuBlas in the background
+ * @note int is not supported
+ *
+ * @return the result on the inner product
+ */
+ T dot(const CuVector& other) const;
+
+ /**
+ * @brief returns the l2 norm of the vector
+ * @note this will call CuBlas in the background
+ * @note int is not supported
+ *
+ * @return the l2 norm
+ */
+ T two_norm() const;
+
+ //! Computes the dot product sum_i this[indexSet[i]] * other[indexSet[i]]
+ T dot(const CuVector& other, const CuVector& indexSet, CuVector& buffer) const;
+ //! Computes the norm sqrt(sum_i this[indexSet[i]] * this[indexSet[i]])
+ T two_norm(const CuVector& indexSet, CuVector& buffer) const;
+ //! Computes the dot product sum_i this[indexSet[i]] * other[indexSet[i]]
+ T dot(const CuVector& other, const CuVector& indexSet) const;
+ //! Computes the norm sqrt(sum_i this[indexSet[i]] * this[indexSet[i]])
+ T two_norm(const CuVector& indexSet) const;
+
+
+ /**
+ * @brief dim returns the dimension (number of T elements) in the vector
+ * @return number of elements
+ */
+ size_type dim() const;
+
+
+ /**
+ * @brief creates an std::vector of the same size and copies the GPU data to this std::vector
+ * @return an std::vector containing the elements copied from the GPU.
+ */
+ std::vector asStdVector() const;
+
+ /**
+ * @brief creates an std::vector of the same size and copies the GPU data to this std::vector
+ * @return an std::vector containing the elements copied from the GPU.
+ */
+ template
+ Dune::BlockVector> asDuneBlockVector() const
+ {
+ OPM_ERROR_IF(dim() % blockSize != 0,
+ fmt::format("blockSize is not a multiple of dim(). Given blockSize = {}, and dim() = {}",
+ blockSize,
+ dim()));
+
+ Dune::BlockVector> returnValue(dim() / blockSize);
+ copyToHost(returnValue);
+ return returnValue;
+ }
+
+
+ /**
+ * @brief setZeroAtIndexSet for each element in indexSet, sets the index of this vector to be zero
+ * @param indexSet the set of indices to set to zero
+ *
+ * @note Assumes all indices are within range
+ *
+ * This is supposed to do the same as the following code on the CPU:
+ * @code{.cpp}
+ * for (int index : indexSet) {
+ * this->data[index] = T(0.0);
+ * }
+ * @endcode
+ */
+ void setZeroAtIndexSet(const CuVector& indexSet);
+
+private:
+ T* m_dataOnDevice = nullptr;
+ const int m_numberOfElements;
+ detail::CuBlasHandle& m_cuBlasHandle;
+};
+} // namespace Opm::cuistl
+#endif
diff --git a/opm/simulators/linalg/cuistl/detail/cublas_wrapper.hpp b/opm/simulators/linalg/cuistl/detail/cublas_wrapper.hpp
new file mode 100644
index 000000000..e72bf5115
--- /dev/null
+++ b/opm/simulators/linalg/cuistl/detail/cublas_wrapper.hpp
@@ -0,0 +1,168 @@
+/*
+ Copyright 2022-2023 SINTEF AS
+
+ This file is part of the Open Porous Media project (OPM).
+
+ OPM is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ OPM is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with OPM. If not, see .
+*/
+
+/**
+ * Contains wrappers to make the CuBLAS library behave as a modern C++ library with function overlading.
+ *
+ * In simple terms, this allows one to call say cublasScal on both double and single precisision,
+ * instead of calling cublasDscal and cublasSscal respectively.
+ */
+
+#ifndef OPM_CUBLASWRAPPER_HEADER_INCLUDED
+#define OPM_CUBLASWRAPPER_HEADER_INCLUDED
+#include
+#include
+
+namespace Opm::cuistl::detail
+{
+
+inline cublasStatus_t
+cublasScal(cublasHandle_t handle,
+ int n,
+ const double* alpha, /* host or device pointer */
+ double* x,
+ int incx)
+{
+ return cublasDscal(handle,
+ n,
+ alpha, /* host or device pointer */
+ x,
+ incx);
+}
+
+inline cublasStatus_t
+cublasScal(cublasHandle_t handle,
+ int n,
+ const float* alpha, /* host or device pointer */
+ float* x,
+ int incx)
+{
+ return cublasSscal(handle,
+ n,
+ alpha, /* host or device pointer */
+ x,
+ incx);
+}
+
+inline cublasStatus_t
+cublasScal([[maybe_unused]] cublasHandle_t handle,
+ [[maybe_unused]] int n,
+ [[maybe_unused]] const int* alpha, /* host or device pointer */
+ [[maybe_unused]] int* x,
+ [[maybe_unused]] int incx)
+{
+ OPM_THROW(std::runtime_error, "cublasScal multiplication for integer vectors is not implemented yet.");
+}
+inline cublasStatus_t
+cublasAxpy(cublasHandle_t handle,
+ int n,
+ const double* alpha, /* host or device pointer */
+ const double* x,
+ int incx,
+ double* y,
+ int incy)
+{
+ return cublasDaxpy(handle,
+ n,
+ alpha, /* host or device pointer */
+ x,
+ incx,
+ y,
+ incy);
+}
+
+inline cublasStatus_t
+cublasAxpy(cublasHandle_t handle,
+ int n,
+ const float* alpha, /* host or device pointer */
+ const float* x,
+ int incx,
+ float* y,
+ int incy)
+{
+ return cublasSaxpy(handle,
+ n,
+ alpha, /* host or device pointer */
+ x,
+ incx,
+ y,
+ incy);
+}
+
+inline cublasStatus_t
+cublasAxpy([[maybe_unused]] cublasHandle_t handle,
+ [[maybe_unused]] int n,
+ [[maybe_unused]] const int* alpha, /* host or device pointer */
+ [[maybe_unused]] const int* x,
+ [[maybe_unused]] int incx,
+ [[maybe_unused]] int* y,
+ [[maybe_unused]] int incy)
+{
+ OPM_THROW(std::runtime_error, "axpy multiplication for integer vectors is not implemented yet.");
+}
+
+inline cublasStatus_t
+cublasDot(cublasHandle_t handle, int n, const double* x, int incx, const double* y, int incy, double* result)
+{
+ return cublasDdot(handle, n, x, incx, y, incy, result);
+}
+
+inline cublasStatus_t
+cublasDot(cublasHandle_t handle, int n, const float* x, int incx, const float* y, int incy, float* result)
+{
+ return cublasSdot(handle, n, x, incx, y, incy, result);
+}
+
+inline cublasStatus_t
+cublasDot([[maybe_unused]] cublasHandle_t handle,
+ [[maybe_unused]] int n,
+ [[maybe_unused]] const int* x,
+ [[maybe_unused]] int incx,
+ [[maybe_unused]] const int* y,
+ [[maybe_unused]] int incy,
+ [[maybe_unused]] int* result)
+{
+ OPM_THROW(std::runtime_error, "inner product for integer vectors is not implemented yet.");
+}
+
+inline cublasStatus_t
+cublasNrm2(cublasHandle_t handle, int n, const double* x, int incx, double* result)
+{
+ return cublasDnrm2(handle, n, x, incx, result);
+}
+
+
+inline cublasStatus_t
+cublasNrm2(cublasHandle_t handle, int n, const float* x, int incx, float* result)
+{
+ return cublasSnrm2(handle, n, x, incx, result);
+}
+
+inline cublasStatus_t
+cublasNrm2([[maybe_unused]] cublasHandle_t handle,
+ [[maybe_unused]] int n,
+ [[maybe_unused]] const int* x,
+ [[maybe_unused]] int incx,
+ [[maybe_unused]] int* result)
+{
+ OPM_THROW(std::runtime_error, "norm2 for integer vectors is not implemented yet.");
+}
+
+} // namespace Opm::cuistl::detail
+#endif
diff --git a/opm/simulators/linalg/cuistl/detail/vector_operations.cu b/opm/simulators/linalg/cuistl/detail/vector_operations.cu
new file mode 100644
index 000000000..5c7223870
--- /dev/null
+++ b/opm/simulators/linalg/cuistl/detail/vector_operations.cu
@@ -0,0 +1,112 @@
+/*
+ Copyright 2022-2023 SINTEF AS
+
+ This file is part of the Open Porous Media project (OPM).
+
+ OPM is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ OPM is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with OPM. If not, see .
+*/
+#include
+// TODO: [perf] Get rid of thrust.
+#include
+#include
+namespace Opm::cuistl::detail
+{
+
+namespace
+{
+ template
+ __global__ void setZeroAtIndexSetKernel(T* devicePointer, size_t numberOfElements, const int* indices)
+ {
+ const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
+
+ if (globalIndex < numberOfElements) {
+ devicePointer[indices[globalIndex]] = T(0);
+ }
+ }
+
+ template
+ __global__ void setVectorValueKernel(T* devicePointer, size_t numberOfElements, T value)
+ {
+ const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
+
+ if (globalIndex < numberOfElements) {
+ devicePointer[globalIndex] = value;
+ }
+ }
+
+ template
+ __global__ void
+ elementWiseMultiplyKernel(const T* a, const T* b, T* buffer, size_t numberOfElements, const int* indices)
+ {
+ const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
+
+ // TODO: [perf] Is it faster to just use a mask? Probably does not matter either way
+ // This is hopefully not where we will spend most of our time.
+ if (globalIndex < numberOfElements) {
+ buffer[indices[globalIndex]] = a[indices[globalIndex]] * b[indices[globalIndex]];
+ }
+ }
+
+ constexpr inline size_t getThreads([[maybe_unused]] size_t numberOfElements)
+ {
+ return 1024;
+ }
+
+ inline size_t getBlocks(size_t numberOfElements)
+ {
+ const auto threads = getThreads(numberOfElements);
+ return (numberOfElements + threads - 1) / threads;
+ }
+} // namespace
+
+template
+void
+setVectorValue(T* deviceData, size_t numberOfElements, const T& value)
+{
+ setVectorValueKernel<<>>(
+ deviceData, numberOfElements, value);
+}
+
+template void setVectorValue(double*, size_t, const double&);
+template void setVectorValue(float*, size_t, const float&);
+template void setVectorValue(int*, size_t, const int&);
+
+template
+void
+setZeroAtIndexSet(T* deviceData, size_t numberOfElements, const int* indices)
+{
+ setZeroAtIndexSetKernel<<>>(
+ deviceData, numberOfElements, indices);
+}
+template void setZeroAtIndexSet(double*, size_t, const int*);
+template void setZeroAtIndexSet(float*, size_t, const int*);
+template void setZeroAtIndexSet(int*, size_t, const int*);
+
+template
+T
+innerProductAtIndices(const T* deviceA, const T* deviceB, T* buffer, size_t numberOfElements, const int* indices)
+{
+ elementWiseMultiplyKernel<<>>(
+ deviceA, deviceB, buffer, numberOfElements, indices);
+
+ // TODO: [perf] Get rid of thrust and use a more direct reduction here.
+ auto bufferAsDevicePointer = thrust::device_pointer_cast(buffer);
+ return thrust::reduce(bufferAsDevicePointer, bufferAsDevicePointer + numberOfElements);
+}
+
+template double innerProductAtIndices(const double*, const double*, double* buffer, size_t, const int*);
+template float innerProductAtIndices(const float*, const float*, float* buffer, size_t, const int*);
+template int innerProductAtIndices(const int*, const int*, int* buffer, size_t, const int*);
+
+} // namespace Opm::cuistl::impl
diff --git a/opm/simulators/linalg/cuistl/detail/vector_operations.hpp b/opm/simulators/linalg/cuistl/detail/vector_operations.hpp
new file mode 100644
index 000000000..15e425b1e
--- /dev/null
+++ b/opm/simulators/linalg/cuistl/detail/vector_operations.hpp
@@ -0,0 +1,58 @@
+/*
+ Copyright 2022-2023 SINTEF AS
+
+ This file is part of the Open Porous Media project (OPM).
+
+ OPM is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ OPM is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with OPM. If not, see .
+*/
+#ifndef OPM_CUISTL_VECTOR_OPERATIONS_HPP
+#define OPM_CUISTL_VECTOR_OPERATIONS_HPP
+#include
+namespace Opm::cuistl::detail
+{
+
+/**
+ * @brief setVectorValue sets every element of deviceData to value
+ * @param deviceData pointer to GPU memory
+ * @param numberOfElements number of elements to set to value
+ * @param value the value to use
+ */
+template
+void setVectorValue(T* deviceData, size_t numberOfElements, const T& value);
+
+/**
+ * @brief setZeroAtIndexSet sets deviceData to zero in the indices of contained in indices
+ * @param deviceData the data to operate on
+ * @param numberOfElements number of indices
+ * @param indices the indices to use
+ */
+template
+void setZeroAtIndexSet(T* deviceData, size_t numberOfElements, const int* indices);
+
+/**
+ * @brief innerProductAtIndices computes the inner product between deviceA[indices] and deviceB[indices]
+ * @param deviceA data A
+ * @param deviceB data B
+ * @param buffer a buffer with number of elements equal to numberOfElements
+ * @param numberOfElements number of indices
+ * @param indices the indices to compute the inner product over
+ * @return the result of the inner product
+ *
+ * @note This is equivalent to projecting the vectors to the indices contained in indices, then doing the inner product
+ * of those indices.
+ */
+template
+T innerProductAtIndices(const T* deviceA, const T* deviceB, T* buffer, size_t numberOfElements, const int* indices);
+} // namespace Opm::cuistl::detail
+#endif
diff --git a/tests/cuistl/test_cuvector.cpp b/tests/cuistl/test_cuvector.cpp
new file mode 100644
index 000000000..be516de2b
--- /dev/null
+++ b/tests/cuistl/test_cuvector.cpp
@@ -0,0 +1,284 @@
+/*
+ Copyright 2022-2023 SINTEF AS
+
+ This file is part of the Open Porous Media project (OPM).
+
+ OPM is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ OPM is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with OPM. If not, see .
+*/
+#include
+
+#define BOOST_TEST_MODULE TestCuVector
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+BOOST_AUTO_TEST_CASE(TestDocumentedUsage)
+{
+ auto someDataOnCPU = std::vector({1.0, 2.0, 42.0, 59.9451743, 10.7132692});
+
+ auto dataOnGPU = ::Opm::cuistl::CuVector(someDataOnCPU);
+
+ // Multiply by 4.0:
+ dataOnGPU *= 4.0;
+
+ // Get data back on CPU in another vector:
+ auto stdVectorOnCPU = dataOnGPU.asStdVector();
+
+ std::vector correctVector(someDataOnCPU.size());
+
+ std::transform(someDataOnCPU.begin(), someDataOnCPU.end(), correctVector.begin(), [](double x) { return 4 * x; });
+ BOOST_CHECK_EQUAL_COLLECTIONS(
+ stdVectorOnCPU.begin(), stdVectorOnCPU.end(), correctVector.begin(), correctVector.end());
+}
+
+BOOST_AUTO_TEST_CASE(TestConstructionSize)
+{
+ const int numberOfElements = 1234;
+ auto vectorOnGPU = Opm::cuistl::CuVector(numberOfElements);
+ BOOST_CHECK_EQUAL(numberOfElements, vectorOnGPU.dim());
+}
+
+BOOST_AUTO_TEST_CASE(TestCopyFromHostConstructor)
+{
+ std::vector data {{1, 2, 3, 4, 5, 6, 7}};
+ auto vectorOnGPU = Opm::cuistl::CuVector(data.data(), data.size());
+ BOOST_CHECK_EQUAL(data.size(), vectorOnGPU.dim());
+ std::vector buffer(data.size(), 0.0);
+ vectorOnGPU.copyToHost(buffer.data(), buffer.size());
+ BOOST_CHECK_EQUAL_COLLECTIONS(buffer.begin(), buffer.end(), data.begin(), data.end());
+}
+
+
+BOOST_AUTO_TEST_CASE(TestCopyFromHostFunction)
+{
+ std::vector data {{1, 2, 3, 4, 5, 6, 7}};
+ auto vectorOnGPU = Opm::cuistl::CuVector(data.size());
+ BOOST_CHECK_EQUAL(data.size(), vectorOnGPU.dim());
+ vectorOnGPU.copyFromHost(data.data(), data.size());
+ std::vector buffer(data.size(), 0.0);
+ vectorOnGPU.copyToHost(buffer.data(), buffer.size());
+ BOOST_CHECK_EQUAL_COLLECTIONS(buffer.begin(), buffer.end(), data.begin(), data.end());
+}
+
+
+BOOST_AUTO_TEST_CASE(TestCopyFromBvector)
+{
+ auto blockVector = Dune::BlockVector> {{{42, 43}, {44, 45}, {46, 47}}};
+ auto vectorOnGPU = Opm::cuistl::CuVector(blockVector.dim());
+ vectorOnGPU.copyFromHost(blockVector);
+ std::vector buffer(vectorOnGPU.dim());
+ vectorOnGPU.copyToHost(buffer.data(), buffer.size());
+
+ BOOST_CHECK_EQUAL_COLLECTIONS(
+ buffer.begin(), buffer.end(), &blockVector[0][0], &blockVector[0][0] + blockVector.dim());
+}
+
+BOOST_AUTO_TEST_CASE(TestCopyToBvector)
+{
+ std::vector data {{1, 2, 3, 4, 5, 6, 7, 8, 9}};
+ auto blockVector = Dune::BlockVector>(3);
+ auto vectorOnGPU = Opm::cuistl::CuVector(data.data(), data.size());
+ vectorOnGPU.copyToHost(blockVector);
+
+
+ BOOST_CHECK_EQUAL_COLLECTIONS(data.begin(), data.end(), &blockVector[0][0], &blockVector[0][0] + blockVector.dim());
+}
+
+BOOST_AUTO_TEST_CASE(TestDataPointer)
+{
+ std::vector data {{1, 2, 3, 4, 5, 6, 7, 8, 9}};
+ auto vectorOnGPU = Opm::cuistl::CuVector(data.data(), data.size());
+
+ std::vector buffer(data.size(), 0.0);
+ cudaMemcpy(buffer.data(), vectorOnGPU.data(), sizeof(double) * data.size(), cudaMemcpyDeviceToHost);
+ BOOST_CHECK_EQUAL_COLLECTIONS(data.begin(), data.end(), buffer.begin(), buffer.end());
+}
+
+BOOST_AUTO_TEST_CASE(TestCopyScalarMultiply)
+{
+ std::vector data {{1, 2, 3, 4, 5, 6, 7}};
+ auto vectorOnGPU = Opm::cuistl::CuVector(data.data(), data.size());
+ BOOST_CHECK_EQUAL(data.size(), vectorOnGPU.dim());
+ const double scalar = 42.25;
+ vectorOnGPU *= scalar;
+ std::vector buffer(data.size(), 0.0);
+ vectorOnGPU.copyToHost(buffer.data(), buffer.size());
+
+ for (size_t i = 0; i < buffer.size(); ++i) {
+ BOOST_CHECK_EQUAL(buffer[i], scalar * data[i]);
+ }
+}
+
+BOOST_AUTO_TEST_CASE(TestTwoNorm)
+{
+ std::vector data {{1, 2, 3, 4, 5, 6, 7}};
+ auto vectorOnGPU = Opm::cuistl::CuVector(data.data(), data.size());
+ auto twoNorm = vectorOnGPU.two_norm();
+
+ double correctAnswer = 0.0;
+ for (double d : data) {
+ correctAnswer += d * d;
+ }
+ correctAnswer = std::sqrt(correctAnswer);
+ BOOST_CHECK_EQUAL(correctAnswer, twoNorm);
+}
+
+BOOST_AUTO_TEST_CASE(TestDot)
+{
+ std::vector dataA {{1, 2, 3, 4, 5, 6, 7}};
+ std::vector dataB {{8, 9, 10, 11, 12, 13, 14}};
+ auto vectorOnGPUA = Opm::cuistl::CuVector(dataA.data(), dataA.size());
+ auto vectorOnGPUB = Opm::cuistl::CuVector(dataB.data(), dataB.size());
+ auto dot = vectorOnGPUA.dot(vectorOnGPUB);
+
+ double correctAnswer = 0.0;
+ for (size_t i = 0; i < dataA.size(); ++i) {
+ correctAnswer += dataA[i] * dataB[i];
+ }
+ correctAnswer = correctAnswer;
+ BOOST_CHECK_EQUAL(correctAnswer, dot);
+}
+
+BOOST_AUTO_TEST_CASE(Assigment)
+{
+ std::vector data {{1, 2, 3, 4, 5, 6, 7}};
+ auto vectorOnGPU = Opm::cuistl::CuVector(data.data(), data.size());
+ vectorOnGPU = 10.0;
+ vectorOnGPU.copyToHost(data.data(), data.size());
+
+ for (double x : data) {
+ BOOST_CHECK_EQUAL(10.0, x);
+ }
+}
+
+
+BOOST_AUTO_TEST_CASE(CopyConstructor)
+{
+ std::vector data {{1, 2, 3, 4, 5, 6, 7}};
+ auto vectorOnGPU = Opm::cuistl::CuVector(data.data(), data.size());
+ vectorOnGPU.copyToHost(data.data(), data.size());
+ auto vectorOnGPUB = Opm::cuistl::CuVector(data.size());
+ vectorOnGPUB = 4.0;
+ vectorOnGPUB = vectorOnGPU;
+
+ std::vector output(data.size());
+ vectorOnGPUB.copyToHost(output.data(), output.size());
+ BOOST_CHECK_EQUAL_COLLECTIONS(output.begin(), output.end(), data.begin(), data.end());
+}
+
+BOOST_AUTO_TEST_CASE(RandomVectors)
+{
+
+ using GVector = Opm::cuistl::CuVector;
+ std::srand(0);
+ std::mt19937 generator;
+ std::uniform_real_distribution distribution(-100.0, 100.0);
+ std::uniform_real_distribution distribution01(.0, 1.0);
+
+ const size_t N = 1000;
+
+ const size_t retries = 100;
+
+ for (size_t retry = 0; retry < retries; ++retry) {
+ std::vector a(N);
+ std::vector b(N);
+
+ for (size_t i = 0; i < N; ++i) {
+ a[i] = distribution(generator);
+ b[i] = distribution(generator);
+ }
+
+ auto aGPU = GVector(a);
+ auto bGPU = GVector(b);
+
+ aGPU += bGPU;
+
+ auto aOutputPlus = aGPU.asStdVector();
+
+ for (size_t i = 0; i < N; ++i) {
+ BOOST_CHECK_EQUAL(aOutputPlus[i], a[i] + b[i]);
+ }
+
+ aGPU = GVector(a);
+ aGPU -= bGPU;
+
+ auto aOutputMinus = aGPU.asStdVector();
+
+ for (size_t i = 0; i < N; ++i) {
+ BOOST_CHECK_EQUAL(aOutputMinus[i], a[i] - b[i]);
+ }
+
+
+ aGPU = GVector(a);
+ auto scalar = distribution(generator);
+ aGPU *= scalar;
+ auto aOutputScalar = aGPU.asStdVector();
+ for (size_t i = 0; i < N; ++i) {
+ BOOST_CHECK_EQUAL(aOutputScalar[i], scalar * a[i]);
+ }
+
+ aGPU = GVector(a);
+ aGPU.axpy(scalar, bGPU);
+ auto aOutputSaxypy = aGPU.asStdVector();
+ for (size_t i = 0; i < N; ++i) {
+ BOOST_CHECK_CLOSE(aOutputSaxypy[i], a[i] + scalar * b[i], 1e-7);
+ }
+
+ aGPU = GVector(a);
+ auto dotted = aGPU.dot(bGPU);
+ double correct = 0.0;
+ for (size_t i = 0; i < N; ++i) {
+ correct += a[i] * b[i];
+ }
+
+ BOOST_CHECK_CLOSE(dotted, correct, 1e-7);
+
+ aGPU = GVector(a);
+ auto twoNorm = aGPU.two_norm();
+ double correctTwoNorm = 0.0;
+ for (size_t i = 0; i < N; ++i) {
+ correctTwoNorm += a[i] * a[i];
+ }
+ correctTwoNorm = std::sqrt(correctTwoNorm);
+
+ BOOST_CHECK_CLOSE(twoNorm, correctTwoNorm, 1e-7);
+
+ aGPU = GVector(a);
+ std::vector indexSet;
+ const double rejectCriteria = 0.2;
+ for (size_t i = 0; i < N; ++i) {
+ const auto reject = distribution01(generator);
+ if (reject < rejectCriteria) {
+ indexSet.push_back(i);
+ }
+ }
+ auto indexSetGPU = Opm::cuistl::CuVector(indexSet);
+
+ aGPU.setZeroAtIndexSet(indexSetGPU);
+ auto projectedA = aGPU.asStdVector();
+ for (size_t i = 0; i < N; ++i) {
+ // Yeah, O(N^2) so sue me
+ bool found = std::find(indexSet.begin(), indexSet.end(), i) != indexSet.end();
+ if (found) {
+ BOOST_CHECK_EQUAL(projectedA[i], 0);
+ } else {
+ BOOST_CHECK_EQUAL(projectedA[i], a[i]);
+ }
+ }
+ }
+}