From 858d8b189b35205bd5ea750af6208b0b7a52002c Mon Sep 17 00:00:00 2001 From: Kjetil Olsen Lye Date: Tue, 28 Mar 2023 13:31:52 +0200 Subject: [PATCH] Added CuVector with tests. --- CMakeLists_files.cmake | 7 +- opm/simulators/linalg/cuistl/CuVector.cpp | 268 ++++++++++++++ opm/simulators/linalg/cuistl/CuVector.hpp | 335 ++++++++++++++++++ .../linalg/cuistl/detail/cublas_wrapper.hpp | 168 +++++++++ .../linalg/cuistl/detail/vector_operations.cu | 112 ++++++ .../cuistl/detail/vector_operations.hpp | 58 +++ tests/cuistl/test_cuvector.cpp | 284 +++++++++++++++ 7 files changed, 1230 insertions(+), 2 deletions(-) create mode 100644 opm/simulators/linalg/cuistl/CuVector.cpp create mode 100644 opm/simulators/linalg/cuistl/CuVector.hpp create mode 100644 opm/simulators/linalg/cuistl/detail/cublas_wrapper.hpp create mode 100644 opm/simulators/linalg/cuistl/detail/vector_operations.cu create mode 100644 opm/simulators/linalg/cuistl/detail/vector_operations.hpp create mode 100644 tests/cuistl/test_cuvector.cpp diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake index 24436dad2..2b531c1ae 100644 --- a/CMakeLists_files.cmake +++ b/CMakeLists_files.cmake @@ -147,6 +147,9 @@ if(CUDA_FOUND) # CUISTL SOURCE list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/detail/CuBlasHandle.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/detail/CuSparseHandle.cpp) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuVector.cpp) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/detail/vector_operations.cu) + # CUISTL HEADERS list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp) @@ -155,6 +158,7 @@ if(CUDA_FOUND) list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/cuda_check_last_error.hpp) list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/CuBlasHandle.hpp) list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/CuSparseHandle.hpp) + list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/CuVector.hpp) endif() if(OPENCL_FOUND) @@ -239,8 +243,7 @@ if(CUDA_FOUND) list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cuda_check_last_error.cpp) list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cublas_handle.cpp) list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cusparse_handle.cpp) - - + list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cuvector.cpp) endif() if(OPENCL_FOUND) list(APPEND TEST_SOURCE_FILES tests/test_openclSolver.cpp) diff --git a/opm/simulators/linalg/cuistl/CuVector.cpp b/opm/simulators/linalg/cuistl/CuVector.cpp new file mode 100644 index 000000000..609ab2f38 --- /dev/null +++ b/opm/simulators/linalg/cuistl/CuVector.cpp @@ -0,0 +1,268 @@ +/* + Copyright 2022-2023 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . 
+*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CHECKSIZE(x) \ + if (x.m_numberOfElements != m_numberOfElements) { \ + OPM_THROW(std::invalid_argument, \ + fmt::format("Given vector has {}, while we have {}.", x.m_numberOfElements, m_numberOfElements)); \ + } +#define CHECKPOSITIVESIZE \ + if (m_numberOfElements <= 0) { \ + OPM_THROW(std::invalid_argument, "We have 0 elements"); \ + } + +namespace Opm::cuistl +{ + +template +CuVector::CuVector(const std::vector& data) + : CuVector(data.data(), data.size()) +{ +} + +template +CuVector::CuVector(const int numberOfElements) + : m_numberOfElements(numberOfElements) + , m_cuBlasHandle(detail::CuBlasHandle::getInstance()) +{ + OPM_CUDA_SAFE_CALL(cudaMalloc(&m_dataOnDevice, sizeof(T) * m_numberOfElements)); +} + +template +CuVector::CuVector(const T* dataOnHost, const int numberOfElements) + : CuVector(numberOfElements) +{ + + OPM_CUDA_SAFE_CALL(cudaMemcpy(m_dataOnDevice, dataOnHost, m_numberOfElements * sizeof(T), cudaMemcpyHostToDevice)); +} + +template +CuVector& +CuVector::operator=(T scalar) +{ + CHECKPOSITIVESIZE + detail::setVectorValue(data(), m_numberOfElements, scalar); + return *this; +} + +template +CuVector& +CuVector::operator=(const CuVector& other) +{ + CHECKPOSITIVESIZE + CHECKSIZE(other) + if (other.m_numberOfElements != m_numberOfElements) { + OPM_THROW(std::invalid_argument, "Can only copy from vector of same size."); + } + OPM_CUDA_SAFE_CALL( + cudaMemcpy(m_dataOnDevice, other.m_dataOnDevice, m_numberOfElements * sizeof(T), cudaMemcpyDeviceToDevice)); + return *this; +} + +template +CuVector::CuVector(const CuVector& other) + : CuVector(other.m_numberOfElements) +{ + CHECKPOSITIVESIZE + CHECKSIZE(other) + OPM_CUDA_SAFE_CALL( + cudaMemcpy(m_dataOnDevice, other.m_dataOnDevice, m_numberOfElements * sizeof(T), cudaMemcpyDeviceToDevice)); +} + +template +CuVector::~CuVector() +{ + OPM_CUDA_SAFE_CALL(cudaFree(m_dataOnDevice)); +} + +template +const T* +CuVector::data() const +{ + return m_dataOnDevice; +} + +template +typename CuVector::size_type +CuVector::dim() const +{ + return m_numberOfElements; +} + +template +std::vector +CuVector::asStdVector() const +{ + std::vector temporary(m_numberOfElements); + copyToHost(temporary); + return temporary; +} + +template +void +CuVector::setZeroAtIndexSet(const CuVector& indexSet) +{ + detail::setZeroAtIndexSet(m_dataOnDevice, indexSet.dim(), indexSet.data()); +} + +template +T* +CuVector::data() +{ + return m_dataOnDevice; +} + +template +CuVector& +CuVector::operator*=(const T& scalar) +{ + CHECKPOSITIVESIZE + OPM_CUBLAS_SAFE_CALL(detail::cublasScal(m_cuBlasHandle.get(), m_numberOfElements, &scalar, data(), 1)); + return *this; +} + +template +CuVector& +CuVector::axpy(T alpha, const CuVector& y) +{ + CHECKPOSITIVESIZE + CHECKSIZE(y) + OPM_CUBLAS_SAFE_CALL(detail::cublasAxpy(m_cuBlasHandle.get(), m_numberOfElements, &alpha, y.data(), 1, data(), 1)); + return *this; +} + +template +T +CuVector::dot(const CuVector& other) const +{ + CHECKPOSITIVESIZE + CHECKSIZE(other) + T result = T(0); + OPM_CUBLAS_SAFE_CALL( + detail::cublasDot(m_cuBlasHandle.get(), m_numberOfElements, data(), 1, other.data(), 1, &result)); + return result; +} +template +T +CuVector::two_norm() const +{ + CHECKPOSITIVESIZE + T result = T(0); + OPM_CUBLAS_SAFE_CALL(detail::cublasNrm2(m_cuBlasHandle.get(), m_numberOfElements, data(), 1, &result)); + return result; +} + +template +T +CuVector::dot(const CuVector& other, const CuVector& indexSet, CuVector& buffer) 
const +{ + return detail::innerProductAtIndices(m_dataOnDevice, other.data(), buffer.data(), indexSet.dim(), indexSet.data()); +} + +template +T +CuVector::two_norm(const CuVector& indexSet, CuVector& buffer) const +{ + // TODO: [perf] Optimize this to a single call + return std::sqrt(this->dot(*this, indexSet, buffer)); +} + +template +T +CuVector::dot(const CuVector& other, const CuVector& indexSet) const +{ + CuVector buffer(indexSet.dim()); + return detail::innerProductAtIndices(m_dataOnDevice, other.data(), buffer.data(), indexSet.dim(), indexSet.data()); +} + +template +T +CuVector::two_norm(const CuVector& indexSet) const +{ + CuVector buffer(indexSet.dim()); + // TODO: [perf] Optimize this to a single call + return std::sqrt(this->dot(*this, indexSet, buffer)); +} +template +CuVector& +CuVector::operator+=(const CuVector& other) +{ + CHECKPOSITIVESIZE + CHECKSIZE(other) + // TODO: [perf] Make a specialized version of this + return axpy(1.0, other); +} + +template +CuVector& +CuVector::operator-=(const CuVector& other) +{ + CHECKPOSITIVESIZE + CHECKSIZE(other) + // TODO: [perf] Make a specialized version of this + return axpy(-1.0, other); +} + + +template +void +CuVector::copyFromHost(const T* dataPointer, int numberOfElements) +{ + if (numberOfElements > dim()) { + OPM_THROW(std::runtime_error, + fmt::format("Requesting to copy too many elements. Vector has {} elements, while {} was requested.", + dim(), + numberOfElements)); + } + OPM_CUDA_SAFE_CALL(cudaMemcpy(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice)); +} + +template +void +CuVector::copyToHost(T* dataPointer, int numberOfElements) const +{ + OPM_CUDA_SAFE_CALL(cudaMemcpy(dataPointer, data(), numberOfElements * sizeof(T), cudaMemcpyDeviceToHost)); +} + +template +void +CuVector::copyFromHost(const std::vector& data) +{ + copyFromHost(data.data(), data.size()); +} +template +void +CuVector::copyToHost(std::vector& data) const +{ + copyToHost(data.data(), data.size()); +} +template class CuVector; +template class CuVector; +template class CuVector; + +} // namespace Opm::cuistl diff --git a/opm/simulators/linalg/cuistl/CuVector.hpp b/opm/simulators/linalg/cuistl/CuVector.hpp new file mode 100644 index 000000000..d7a3e7a60 --- /dev/null +++ b/opm/simulators/linalg/cuistl/CuVector.hpp @@ -0,0 +1,335 @@ +/* + Copyright 2022-2023 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ +#ifndef OPM_CUVECTOR_HEADER_HPP +#define OPM_CUVECTOR_HEADER_HPP +#include +#include +#include +#include +#include +#include +#include + +namespace Opm::cuistl +{ + +/** + * @brief The CuVector class is a simple (arithmetic) vector class for the GPU. + * + * @note we currently only support simple raw primitives for T (double and float) + * + * @note this vector has no notion of block size. 
The user is responsible for allocating + * the correct number of primitives (double or floats) + * Example usage: + * + * @code{.cpp} + * #include + * + * void someFunction() { + * auto someDataOnCPU = std::vector({1.0, 2.0, 42.0, 59.9451743, 10.7132692}); + * + * auto dataOnGPU = CuVector(someDataOnCPU); + * + * // Multiply by 4.0: + * dataOnGPU *= 4.0; + * + * // Get data back on CPU in another vector: + * auto stdVectorOnCPU = dataOnGPU.asStdVector(); + * } + * + * @tparam T the type to store. Can be either float, double or int. + */ +template +class CuVector +{ +public: + using field_type = T; + using size_type = size_t; + + + /** + * @brief CuVector allocates new GPU memory of the same size as other and copies the content of the other vector to + * this newly allocated memory. + * + * @note This does synchronous transfer. + * + * @param other the vector to copy from + */ + CuVector(const CuVector& other); + + /** + * @brief CuVector allocates new GPU memory of the same size as data and copies the content of the data vector to + * this newly allocated memory. + * + * @note This does CPU to GPU transfer. + * @note This does synchronous transfer. + * + * @param data the vector to copy from + */ + explicit CuVector(const std::vector& data); + + /** + * @brief operator= copies the content of the data vector to the memory of this vector. + * + * @note This requires the two vectors to be of the same size. + * @note This does synchronous transfer. + * + * @param other the vector to copy from + */ + CuVector& operator=(const CuVector& other); + + /** + * @brief operator= sets the whole vector equal to the scalar value. + * + * @note This does asynchronous operations + * + * @param scalar the value all elements will be set to. + */ + CuVector& operator=(T scalar); + + /** + * @brief CuVector allocates new GPU memory of size numberOfElements * sizeof(T) + * + * @param numberOfElements number of T elements to allocate + */ + explicit CuVector(const int numberOfElements); + + + /** + * @brief CuVector allocates new GPU memory of size numberOfElements * sizeof(T) and copies numberOfElements from + * data + * + * @note This assumes the data is on the CPU. + * + * @param numberOfElements number of T elements to allocate + * @param dataOnHost data on host/CPU + */ + CuVector(const T* dataOnHost, const int numberOfElements); + + /** + * @brief ~CuVector calls cudaFree + */ + virtual ~CuVector(); + + /** + * @return the raw pointer to the GPU data + */ + T* data(); + + /** + * @return the raw pointer to the GPU data + */ + const T* data() const; + + /** + * @brief copyFromHost copies data from a Dune::BlockVector + * @param vector the vector to copy from + * + * @note This does synchronous transfer. + * @note This assumes that the size of this vector is equal to the dim of the input vector. + */ + template + void copyFromHost(const Dune::BlockVector>& vector) + { + if (m_numberOfElements != vector.dim()) { + OPM_THROW(std::runtime_error, + fmt::format("Given incompatible vector size. CuVector has size {}, \n" + "however, BlockVector has N() = {}, and dim = {}.", + m_numberOfElements, + vector.N(), + vector.dim())); + } + const auto dataPointer = static_cast(&(vector[0][0])); + copyFromHost(dataPointer, m_numberOfElements); + } + + /** + * @brief copyToHost copies data to a Dune::BlockVector + * @param vector the vector to copy to + * + * @note This does synchronous transfer. + * @note This assumes that the size of this vector is equal to the dim of the input vector. 
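+     *
+     * A minimal round trip, as an editor's illustration (not part of the original patch), assuming a block
+     * size of 2 and three blocks:
+     * @code{.cpp}
+     * Dune::BlockVector<Dune::FieldVector<double, 2>> hostVector(3);
+     * // ... fill hostVector ...
+     * auto deviceVector = CuVector<double>(hostVector.dim());
+     * deviceVector.copyFromHost(hostVector); // CPU -> GPU
+     * deviceVector.copyToHost(hostVector);   // GPU -> CPU
+     * @endcode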
+ */ + template + void copyToHost(Dune::BlockVector>& vector) const + { + if (m_numberOfElements != vector.dim()) { + OPM_THROW(std::runtime_error, + fmt::format("Given incompatible vector size. CuVector has size {},\n however, the BlockVector " + "has has N() = {}, and dim() = {}.", + m_numberOfElements, + vector.N(), + vector.dim())); + } + const auto dataPointer = static_cast(&(vector[0][0])); + copyToHost(dataPointer, m_numberOfElements); + } + + /** + * @brief copyFromHost copies numberOfElements from the CPU memory dataPointer + * @param dataPointer raw pointer to CPU memory + * @param numberOfElements number of elements to copy + * @note This does synchronous transfer. + * @note assumes that this vector has numberOfElements elements + */ + void copyFromHost(const T* dataPointer, int numberOfElements); + + /** + * @brief copyFromHost copies numberOfElements to the CPU memory dataPointer + * @param dataPointer raw pointer to CPU memory + * @param numberOfElements number of elements to copy + * @note This does synchronous transfer. + * @note assumes that this vector has numberOfElements elements + */ + void copyToHost(T* dataPointer, int numberOfElements) const; + + /** + * @brief copyToHost copies data from an std::vector + * @param data the vector to copy from + * + * @note This does synchronous transfer. + * @note This assumes that the size of this vector is equal to the size of the input vector. + */ + void copyFromHost(const std::vector& data); + + /** + * @brief copyToHost copies data to an std::vector + * @param data the vector to copy to + * + * @note This does synchronous transfer. + * @note This assumes that the size of this vector is equal to the size of the input vector. + */ + void copyToHost(std::vector& data) const; + + /** + * @brief operator *= multiplies every element by scalar + * @param scalar the scalar to with which to multiply every element + * + * @note This operation is asynchronous. + */ + CuVector& operator*=(const T& scalar); + + /** + * @brief axpy sets this vector equal to this + alha * y + * @param alpha the scalar with which to multiply y + * @param y input vector of same size as this + * + * @note this will call CuBlas in the background + * @note int is not supported + */ + CuVector& axpy(T alpha, const CuVector& y); + + /** + * @brief operator+= adds the other vector to this vector + * + * @note this will call CuBlas in the background + * @note int is not supported + */ + CuVector& operator+=(const CuVector& other); + + /** + * @brief operator-= subtracts the other vector from this vector + * + * @note this will call CuBlas in the background + * @note int is not supported + */ + CuVector& operator-=(const CuVector& other); + + /** + * @brief dot computes the dot product (standard inner product) against the other vector + * @param other vector of same size as this + * @note this will call CuBlas in the background + * @note int is not supported + * + * @return the result on the inner product + */ + T dot(const CuVector& other) const; + + /** + * @brief returns the l2 norm of the vector + * @note this will call CuBlas in the background + * @note int is not supported + * + * @return the l2 norm + */ + T two_norm() const; + + //! Computes the dot product sum_i this[indexSet[i]] * other[indexSet[i]] + T dot(const CuVector& other, const CuVector& indexSet, CuVector& buffer) const; + //! Computes the norm sqrt(sum_i this[indexSet[i]] * this[indexSet[i]]) + T two_norm(const CuVector& indexSet, CuVector& buffer) const; + //! 
Computes the dot product sum_i this[indexSet[i]] * other[indexSet[i]] + T dot(const CuVector<T>& other, const CuVector<int>& indexSet) const; + //! Computes the norm sqrt(sum_i this[indexSet[i]] * this[indexSet[i]]) + T two_norm(const CuVector<int>& indexSet) const; + + + /** + * @brief dim returns the dimension (number of T elements) in the vector + * @return number of elements + */ + size_type dim() const; + + + /** + * @brief creates an std::vector of the same size and copies the GPU data to this std::vector + * @return an std::vector containing the elements copied from the GPU. + */ + std::vector<T> asStdVector() const; + + /** + * @brief creates a Dune::BlockVector of the same total size and copies the GPU data into it + * @return a Dune::BlockVector containing the elements copied from the GPU. + */ + template <int blockSize> + Dune::BlockVector<Dune::FieldVector<T, blockSize>> asDuneBlockVector() const + { + OPM_ERROR_IF(dim() % blockSize != 0, + fmt::format("blockSize is not a multiple of dim(). Given blockSize = {}, and dim() = {}", + blockSize, + dim())); + + Dune::BlockVector<Dune::FieldVector<T, blockSize>> returnValue(dim() / blockSize); + copyToHost(returnValue); + return returnValue; + } + + + /** + * @brief setZeroAtIndexSet sets this vector to zero at every index contained in indexSet + * @param indexSet the set of indices to set to zero + * + * @note Assumes all indices are within range + * + * This is supposed to do the same as the following code on the CPU: + * @code{.cpp} + * for (int index : indexSet) { + * this->data[index] = T(0.0); + * } + * @endcode + */ + void setZeroAtIndexSet(const CuVector<int>& indexSet); + +private: + T* m_dataOnDevice = nullptr; + const int m_numberOfElements; + detail::CuBlasHandle& m_cuBlasHandle; +}; +} // namespace Opm::cuistl +#endif diff --git a/opm/simulators/linalg/cuistl/detail/cublas_wrapper.hpp b/opm/simulators/linalg/cuistl/detail/cublas_wrapper.hpp new file mode 100644 index 000000000..e72bf5115 --- /dev/null +++ b/opm/simulators/linalg/cuistl/detail/cublas_wrapper.hpp @@ -0,0 +1,168 @@ +/* + Copyright 2022-2023 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +/** + * Contains wrappers to make the CuBLAS library behave as a modern C++ library with function overloading. + * + * In simple terms, this allows one to call, say, cublasScal on both double and single precision, + * instead of calling cublasDscal and cublasSscal respectively.
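+ *
+ * For example (editor's illustration, not part of the original patch; the handle, n, x and y are assumed
+ * to be set up elsewhere, with x a double* and y a float* on the device), the same call dispatches on the
+ * element type:
+ * @code{.cpp}
+ * double alpha = 0.5;
+ * OPM_CUBLAS_SAFE_CALL(cublasScal(handle, n, &alpha, x, 1));      // resolves to cublasDscal
+ *
+ * float alphaFloat = 0.5f;
+ * OPM_CUBLAS_SAFE_CALL(cublasScal(handle, n, &alphaFloat, y, 1)); // resolves to cublasSscal
+ * @endcode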
+ */ + +#ifndef OPM_CUBLASWRAPPER_HEADER_INCLUDED +#define OPM_CUBLASWRAPPER_HEADER_INCLUDED +#include +#include + +namespace Opm::cuistl::detail +{ + +inline cublasStatus_t +cublasScal(cublasHandle_t handle, + int n, + const double* alpha, /* host or device pointer */ + double* x, + int incx) +{ + return cublasDscal(handle, + n, + alpha, /* host or device pointer */ + x, + incx); +} + +inline cublasStatus_t +cublasScal(cublasHandle_t handle, + int n, + const float* alpha, /* host or device pointer */ + float* x, + int incx) +{ + return cublasSscal(handle, + n, + alpha, /* host or device pointer */ + x, + incx); +} + +inline cublasStatus_t +cublasScal([[maybe_unused]] cublasHandle_t handle, + [[maybe_unused]] int n, + [[maybe_unused]] const int* alpha, /* host or device pointer */ + [[maybe_unused]] int* x, + [[maybe_unused]] int incx) +{ + OPM_THROW(std::runtime_error, "cublasScal multiplication for integer vectors is not implemented yet."); +} +inline cublasStatus_t +cublasAxpy(cublasHandle_t handle, + int n, + const double* alpha, /* host or device pointer */ + const double* x, + int incx, + double* y, + int incy) +{ + return cublasDaxpy(handle, + n, + alpha, /* host or device pointer */ + x, + incx, + y, + incy); +} + +inline cublasStatus_t +cublasAxpy(cublasHandle_t handle, + int n, + const float* alpha, /* host or device pointer */ + const float* x, + int incx, + float* y, + int incy) +{ + return cublasSaxpy(handle, + n, + alpha, /* host or device pointer */ + x, + incx, + y, + incy); +} + +inline cublasStatus_t +cublasAxpy([[maybe_unused]] cublasHandle_t handle, + [[maybe_unused]] int n, + [[maybe_unused]] const int* alpha, /* host or device pointer */ + [[maybe_unused]] const int* x, + [[maybe_unused]] int incx, + [[maybe_unused]] int* y, + [[maybe_unused]] int incy) +{ + OPM_THROW(std::runtime_error, "axpy multiplication for integer vectors is not implemented yet."); +} + +inline cublasStatus_t +cublasDot(cublasHandle_t handle, int n, const double* x, int incx, const double* y, int incy, double* result) +{ + return cublasDdot(handle, n, x, incx, y, incy, result); +} + +inline cublasStatus_t +cublasDot(cublasHandle_t handle, int n, const float* x, int incx, const float* y, int incy, float* result) +{ + return cublasSdot(handle, n, x, incx, y, incy, result); +} + +inline cublasStatus_t +cublasDot([[maybe_unused]] cublasHandle_t handle, + [[maybe_unused]] int n, + [[maybe_unused]] const int* x, + [[maybe_unused]] int incx, + [[maybe_unused]] const int* y, + [[maybe_unused]] int incy, + [[maybe_unused]] int* result) +{ + OPM_THROW(std::runtime_error, "inner product for integer vectors is not implemented yet."); +} + +inline cublasStatus_t +cublasNrm2(cublasHandle_t handle, int n, const double* x, int incx, double* result) +{ + return cublasDnrm2(handle, n, x, incx, result); +} + + +inline cublasStatus_t +cublasNrm2(cublasHandle_t handle, int n, const float* x, int incx, float* result) +{ + return cublasSnrm2(handle, n, x, incx, result); +} + +inline cublasStatus_t +cublasNrm2([[maybe_unused]] cublasHandle_t handle, + [[maybe_unused]] int n, + [[maybe_unused]] const int* x, + [[maybe_unused]] int incx, + [[maybe_unused]] int* result) +{ + OPM_THROW(std::runtime_error, "norm2 for integer vectors is not implemented yet."); +} + +} // namespace Opm::cuistl::detail +#endif diff --git a/opm/simulators/linalg/cuistl/detail/vector_operations.cu b/opm/simulators/linalg/cuistl/detail/vector_operations.cu new file mode 100644 index 000000000..5c7223870 --- /dev/null +++ 
b/opm/simulators/linalg/cuistl/detail/vector_operations.cu @@ -0,0 +1,112 @@ +/* + Copyright 2022-2023 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ +#include +// TODO: [perf] Get rid of thrust. +#include +#include +namespace Opm::cuistl::detail +{ + +namespace +{ + template + __global__ void setZeroAtIndexSetKernel(T* devicePointer, size_t numberOfElements, const int* indices) + { + const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x; + + if (globalIndex < numberOfElements) { + devicePointer[indices[globalIndex]] = T(0); + } + } + + template + __global__ void setVectorValueKernel(T* devicePointer, size_t numberOfElements, T value) + { + const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x; + + if (globalIndex < numberOfElements) { + devicePointer[globalIndex] = value; + } + } + + template + __global__ void + elementWiseMultiplyKernel(const T* a, const T* b, T* buffer, size_t numberOfElements, const int* indices) + { + const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x; + + // TODO: [perf] Is it faster to just use a mask? Probably does not matter either way + // This is hopefully not where we will spend most of our time. + if (globalIndex < numberOfElements) { + buffer[indices[globalIndex]] = a[indices[globalIndex]] * b[indices[globalIndex]]; + } + } + + constexpr inline size_t getThreads([[maybe_unused]] size_t numberOfElements) + { + return 1024; + } + + inline size_t getBlocks(size_t numberOfElements) + { + const auto threads = getThreads(numberOfElements); + return (numberOfElements + threads - 1) / threads; + } +} // namespace + +template +void +setVectorValue(T* deviceData, size_t numberOfElements, const T& value) +{ + setVectorValueKernel<<>>( + deviceData, numberOfElements, value); +} + +template void setVectorValue(double*, size_t, const double&); +template void setVectorValue(float*, size_t, const float&); +template void setVectorValue(int*, size_t, const int&); + +template +void +setZeroAtIndexSet(T* deviceData, size_t numberOfElements, const int* indices) +{ + setZeroAtIndexSetKernel<<>>( + deviceData, numberOfElements, indices); +} +template void setZeroAtIndexSet(double*, size_t, const int*); +template void setZeroAtIndexSet(float*, size_t, const int*); +template void setZeroAtIndexSet(int*, size_t, const int*); + +template +T +innerProductAtIndices(const T* deviceA, const T* deviceB, T* buffer, size_t numberOfElements, const int* indices) +{ + elementWiseMultiplyKernel<<>>( + deviceA, deviceB, buffer, numberOfElements, indices); + + // TODO: [perf] Get rid of thrust and use a more direct reduction here. 
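+    // An alternative sketched by the editor (not part of this patch): cub::DeviceReduce::Sum could sum the
+    // buffer without thrust, at the cost of managing its temporary storage explicitly. Here resultOnDevice
+    // would be an assumed device pointer to a single T:
+    //
+    //     size_t temporaryBytes = 0;
+    //     cub::DeviceReduce::Sum(nullptr, temporaryBytes, buffer, resultOnDevice, static_cast<int>(numberOfElements));
+    //     // ...allocate temporaryBytes bytes of device memory, call again, copy resultOnDevice back...
+    //
+    // For now the reduction goes through thrust: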
+ auto bufferAsDevicePointer = thrust::device_pointer_cast(buffer); + return thrust::reduce(bufferAsDevicePointer, bufferAsDevicePointer + numberOfElements); +} + +template double innerProductAtIndices(const double*, const double*, double* buffer, size_t, const int*); +template float innerProductAtIndices(const float*, const float*, float* buffer, size_t, const int*); +template int innerProductAtIndices(const int*, const int*, int* buffer, size_t, const int*); + +} // namespace Opm::cuistl::detail diff --git a/opm/simulators/linalg/cuistl/detail/vector_operations.hpp b/opm/simulators/linalg/cuistl/detail/vector_operations.hpp new file mode 100644 index 000000000..15e425b1e --- /dev/null +++ b/opm/simulators/linalg/cuistl/detail/vector_operations.hpp @@ -0,0 +1,58 @@ +/* + Copyright 2022-2023 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ +#ifndef OPM_CUISTL_VECTOR_OPERATIONS_HPP +#define OPM_CUISTL_VECTOR_OPERATIONS_HPP +#include +namespace Opm::cuistl::detail +{ + +/** + * @brief setVectorValue sets every element of deviceData to value + * @param deviceData pointer to GPU memory + * @param numberOfElements number of elements to set to value + * @param value the value to use + */ +template <class T> +void setVectorValue(T* deviceData, size_t numberOfElements, const T& value); + +/** + * @brief setZeroAtIndexSet sets deviceData to zero at the indices contained in indices + * @param deviceData the data to operate on + * @param numberOfElements number of indices + * @param indices the indices to use + */ +template <class T> +void setZeroAtIndexSet(T* deviceData, size_t numberOfElements, const int* indices); + +/** + * @brief innerProductAtIndices computes the inner product between deviceA[indices] and deviceB[indices] + * @param deviceA data A + * @param deviceB data B + * @param buffer a buffer with number of elements equal to numberOfElements + * @param numberOfElements number of indices + * @param indices the indices to compute the inner product over + * @return the result of the inner product + * + * @note This is equivalent to projecting the vectors onto the indices contained in indices, then taking the inner product + * of those projections. + */ +template <class T> +T innerProductAtIndices(const T* deviceA, const T* deviceB, T* buffer, size_t numberOfElements, const int* indices); +} // namespace Opm::cuistl::detail +#endif diff --git a/tests/cuistl/test_cuvector.cpp b/tests/cuistl/test_cuvector.cpp new file mode 100644 index 000000000..be516de2b --- /dev/null +++ b/tests/cuistl/test_cuvector.cpp @@ -0,0 +1,284 @@ +/* + Copyright 2022-2023 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version.
+ + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ +#include + +#define BOOST_TEST_MODULE TestCuVector + +#include +#include +#include +#include +#include +#include + +BOOST_AUTO_TEST_CASE(TestDocumentedUsage) +{ + auto someDataOnCPU = std::vector({1.0, 2.0, 42.0, 59.9451743, 10.7132692}); + + auto dataOnGPU = ::Opm::cuistl::CuVector(someDataOnCPU); + + // Multiply by 4.0: + dataOnGPU *= 4.0; + + // Get data back on CPU in another vector: + auto stdVectorOnCPU = dataOnGPU.asStdVector(); + + std::vector correctVector(someDataOnCPU.size()); + + std::transform(someDataOnCPU.begin(), someDataOnCPU.end(), correctVector.begin(), [](double x) { return 4 * x; }); + BOOST_CHECK_EQUAL_COLLECTIONS( + stdVectorOnCPU.begin(), stdVectorOnCPU.end(), correctVector.begin(), correctVector.end()); +} + +BOOST_AUTO_TEST_CASE(TestConstructionSize) +{ + const int numberOfElements = 1234; + auto vectorOnGPU = Opm::cuistl::CuVector(numberOfElements); + BOOST_CHECK_EQUAL(numberOfElements, vectorOnGPU.dim()); +} + +BOOST_AUTO_TEST_CASE(TestCopyFromHostConstructor) +{ + std::vector data {{1, 2, 3, 4, 5, 6, 7}}; + auto vectorOnGPU = Opm::cuistl::CuVector(data.data(), data.size()); + BOOST_CHECK_EQUAL(data.size(), vectorOnGPU.dim()); + std::vector buffer(data.size(), 0.0); + vectorOnGPU.copyToHost(buffer.data(), buffer.size()); + BOOST_CHECK_EQUAL_COLLECTIONS(buffer.begin(), buffer.end(), data.begin(), data.end()); +} + + +BOOST_AUTO_TEST_CASE(TestCopyFromHostFunction) +{ + std::vector data {{1, 2, 3, 4, 5, 6, 7}}; + auto vectorOnGPU = Opm::cuistl::CuVector(data.size()); + BOOST_CHECK_EQUAL(data.size(), vectorOnGPU.dim()); + vectorOnGPU.copyFromHost(data.data(), data.size()); + std::vector buffer(data.size(), 0.0); + vectorOnGPU.copyToHost(buffer.data(), buffer.size()); + BOOST_CHECK_EQUAL_COLLECTIONS(buffer.begin(), buffer.end(), data.begin(), data.end()); +} + + +BOOST_AUTO_TEST_CASE(TestCopyFromBvector) +{ + auto blockVector = Dune::BlockVector> {{{42, 43}, {44, 45}, {46, 47}}}; + auto vectorOnGPU = Opm::cuistl::CuVector(blockVector.dim()); + vectorOnGPU.copyFromHost(blockVector); + std::vector buffer(vectorOnGPU.dim()); + vectorOnGPU.copyToHost(buffer.data(), buffer.size()); + + BOOST_CHECK_EQUAL_COLLECTIONS( + buffer.begin(), buffer.end(), &blockVector[0][0], &blockVector[0][0] + blockVector.dim()); +} + +BOOST_AUTO_TEST_CASE(TestCopyToBvector) +{ + std::vector data {{1, 2, 3, 4, 5, 6, 7, 8, 9}}; + auto blockVector = Dune::BlockVector>(3); + auto vectorOnGPU = Opm::cuistl::CuVector(data.data(), data.size()); + vectorOnGPU.copyToHost(blockVector); + + + BOOST_CHECK_EQUAL_COLLECTIONS(data.begin(), data.end(), &blockVector[0][0], &blockVector[0][0] + blockVector.dim()); +} + +BOOST_AUTO_TEST_CASE(TestDataPointer) +{ + std::vector data {{1, 2, 3, 4, 5, 6, 7, 8, 9}}; + auto vectorOnGPU = Opm::cuistl::CuVector(data.data(), data.size()); + + std::vector buffer(data.size(), 0.0); + cudaMemcpy(buffer.data(), vectorOnGPU.data(), sizeof(double) * data.size(), cudaMemcpyDeviceToHost); + BOOST_CHECK_EQUAL_COLLECTIONS(data.begin(), data.end(), buffer.begin(), buffer.end()); +} + +BOOST_AUTO_TEST_CASE(TestCopyScalarMultiply) +{ + std::vector data {{1, 2, 3, 4, 5, 6, 7}}; + auto vectorOnGPU = Opm::cuistl::CuVector(data.data(), data.size()); + 
BOOST_CHECK_EQUAL(data.size(), vectorOnGPU.dim()); + const double scalar = 42.25; + vectorOnGPU *= scalar; + std::vector<double> buffer(data.size(), 0.0); + vectorOnGPU.copyToHost(buffer.data(), buffer.size()); + + for (size_t i = 0; i < buffer.size(); ++i) { + BOOST_CHECK_EQUAL(buffer[i], scalar * data[i]); + } +} + +BOOST_AUTO_TEST_CASE(TestTwoNorm) +{ + std::vector<double> data {{1, 2, 3, 4, 5, 6, 7}}; + auto vectorOnGPU = Opm::cuistl::CuVector<double>(data.data(), data.size()); + auto twoNorm = vectorOnGPU.two_norm(); + + double correctAnswer = 0.0; + for (double d : data) { + correctAnswer += d * d; + } + correctAnswer = std::sqrt(correctAnswer); + BOOST_CHECK_EQUAL(correctAnswer, twoNorm); +} + +BOOST_AUTO_TEST_CASE(TestDot) +{ + std::vector<double> dataA {{1, 2, 3, 4, 5, 6, 7}}; + std::vector<double> dataB {{8, 9, 10, 11, 12, 13, 14}}; + auto vectorOnGPUA = Opm::cuistl::CuVector<double>(dataA.data(), dataA.size()); + auto vectorOnGPUB = Opm::cuistl::CuVector<double>(dataB.data(), dataB.size()); + auto dot = vectorOnGPUA.dot(vectorOnGPUB); + + double correctAnswer = 0.0; + for (size_t i = 0; i < dataA.size(); ++i) { + correctAnswer += dataA[i] * dataB[i]; + } + BOOST_CHECK_EQUAL(correctAnswer, dot); +} + +BOOST_AUTO_TEST_CASE(Assignment) +{ + std::vector<double> data {{1, 2, 3, 4, 5, 6, 7}}; + auto vectorOnGPU = Opm::cuistl::CuVector<double>(data.data(), data.size()); + vectorOnGPU = 10.0; + vectorOnGPU.copyToHost(data.data(), data.size()); + + for (double x : data) { + BOOST_CHECK_EQUAL(10.0, x); + } +} + + +BOOST_AUTO_TEST_CASE(CopyConstructor) +{ + std::vector<double> data {{1, 2, 3, 4, 5, 6, 7}}; + auto vectorOnGPU = Opm::cuistl::CuVector<double>(data.data(), data.size()); + vectorOnGPU.copyToHost(data.data(), data.size()); + auto vectorOnGPUB = Opm::cuistl::CuVector<double>(data.size()); + vectorOnGPUB = 4.0; + vectorOnGPUB = vectorOnGPU; + + std::vector<double> output(data.size()); + vectorOnGPUB.copyToHost(output.data(), output.size()); + BOOST_CHECK_EQUAL_COLLECTIONS(output.begin(), output.end(), data.begin(), data.end()); +} + +BOOST_AUTO_TEST_CASE(RandomVectors) +{ + + using GVector = Opm::cuistl::CuVector<double>; + std::srand(0); + std::mt19937 generator; + std::uniform_real_distribution<double> distribution(-100.0, 100.0); + std::uniform_real_distribution<double> distribution01(.0, 1.0); + + const size_t N = 1000; + + const size_t retries = 100; + + for (size_t retry = 0; retry < retries; ++retry) { + std::vector<double> a(N); + std::vector<double> b(N); + + for (size_t i = 0; i < N; ++i) { + a[i] = distribution(generator); + b[i] = distribution(generator); + } + + auto aGPU = GVector(a); + auto bGPU = GVector(b); + + aGPU += bGPU; + + auto aOutputPlus = aGPU.asStdVector(); + + for (size_t i = 0; i < N; ++i) { + BOOST_CHECK_EQUAL(aOutputPlus[i], a[i] + b[i]); + } + + aGPU = GVector(a); + aGPU -= bGPU; + + auto aOutputMinus = aGPU.asStdVector(); + + for (size_t i = 0; i < N; ++i) { + BOOST_CHECK_EQUAL(aOutputMinus[i], a[i] - b[i]); + } + + + aGPU = GVector(a); + auto scalar = distribution(generator); + aGPU *= scalar; + auto aOutputScalar = aGPU.asStdVector(); + for (size_t i = 0; i < N; ++i) { + BOOST_CHECK_EQUAL(aOutputScalar[i], scalar * a[i]); + } + + aGPU = GVector(a); + aGPU.axpy(scalar, bGPU); + auto aOutputAxpy = aGPU.asStdVector(); + for (size_t i = 0; i < N; ++i) { + BOOST_CHECK_CLOSE(aOutputAxpy[i], a[i] + scalar * b[i], 1e-7); + } + + aGPU = GVector(a); + auto dotted = aGPU.dot(bGPU); + double correct = 0.0; + for (size_t i = 0; i < N; ++i) { + correct += a[i] * b[i]; + } + + BOOST_CHECK_CLOSE(dotted, correct, 1e-7); + + aGPU = GVector(a); + auto twoNorm =
aGPU.two_norm(); + double correctTwoNorm = 0.0; + for (size_t i = 0; i < N; ++i) { + correctTwoNorm += a[i] * a[i]; + } + correctTwoNorm = std::sqrt(correctTwoNorm); + + BOOST_CHECK_CLOSE(twoNorm, correctTwoNorm, 1e-7); + + aGPU = GVector(a); + std::vector<int> indexSet; + const double rejectCriteria = 0.2; + for (size_t i = 0; i < N; ++i) { + const auto reject = distribution01(generator); + if (reject < rejectCriteria) { + indexSet.push_back(i); + } + } + auto indexSetGPU = Opm::cuistl::CuVector<int>(indexSet); + + aGPU.setZeroAtIndexSet(indexSetGPU); + auto projectedA = aGPU.asStdVector(); + for (size_t i = 0; i < N; ++i) { + // Yeah, O(N^2) so sue me + bool found = std::find(indexSet.begin(), indexSet.end(), i) != indexSet.end(); + if (found) { + BOOST_CHECK_EQUAL(projectedA[i], 0); + } else { + BOOST_CHECK_EQUAL(projectedA[i], a[i]); + } + } + } +}
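
A short usage sketch of the index-set operations added in CuVector.hpp (editor's illustration, not part of the patch itself; the expected values follow directly from the definitions above):

    auto x = Opm::cuistl::CuVector<double>(std::vector<double> {1.0, 2.0, 3.0, 4.0});
    auto y = Opm::cuistl::CuVector<double>(std::vector<double> {5.0, 6.0, 7.0, 8.0});
    auto indices = Opm::cuistl::CuVector<int>(std::vector<int> {0, 2});

    const auto restrictedDot = x.dot(y, indices);    // 1*5 + 3*7 = 26
    const auto restrictedNorm = x.two_norm(indices); // sqrt(1*1 + 3*3) = sqrt(10)

    x.setZeroAtIndexSet(indices);                    // x is now {0, 2, 0, 4}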