From 55c20dbddd7497aa6286a8420923c37f5cb94547 Mon Sep 17 00:00:00 2001 From: Tobias Meyer Andersen Date: Mon, 30 Sep 2024 12:03:01 +0200 Subject: [PATCH] Implement mixed precision GPU ILU0 --- .../linalg/PreconditionerFactory_impl.hpp | 8 +- .../linalg/gpuistl/GpuSparseMatrix.cpp | 15 ++ .../linalg/gpuistl/GpuSparseMatrix.hpp | 14 ++ opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp | 92 +++++++--- opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp | 21 ++- .../gpuistl/detail/deviceBlockOperations.hpp | 52 +++++- .../preconditionerKernels/ILU0Kernels.cu | 163 ++++++++++++------ .../preconditionerKernels/ILU0Kernels.hpp | 29 ++-- 8 files changed, 291 insertions(+), 103 deletions(-) diff --git a/opm/simulators/linalg/PreconditionerFactory_impl.hpp b/opm/simulators/linalg/PreconditionerFactory_impl.hpp index 0f368cd80..2f58a5309 100644 --- a/opm/simulators/linalg/PreconditionerFactory_impl.hpp +++ b/opm/simulators/linalg/PreconditionerFactory_impl.hpp @@ -361,9 +361,10 @@ struct StandardPreconditioners { F::addCreator("OPMGPUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t, const C& comm) { const bool split_matrix = prm.get("split_matrix", true); const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); + const bool store_factorization_as_float = prm.get("store_factorization_as_float", false); using field_type = typename V::field_type; using OpmGpuILU0 = typename gpuistl::OpmGpuILU0, gpuistl::GpuVector>; - auto gpuilu0 = std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels); + auto gpuilu0 = std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float); auto adapted = std::make_shared>(gpuilu0); auto wrapped = std::make_shared>(adapted, comm); @@ -619,10 +620,12 @@ struct StandardPreconditioners { F::addCreator("OPMGPUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t) { const bool split_matrix = prm.get("split_matrix", true); const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); + const bool store_factorization_as_float = prm.get("store_factorization_as_float", false); + using field_type = typename V::field_type; using GPUILU0 = typename gpuistl::OpmGpuILU0, gpuistl::GpuVector>; - return std::make_shared>(std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels)); + return std::make_shared>(std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float)); }); F::addCreator("GPUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t) { @@ -636,6 +639,7 @@ struct StandardPreconditioners { F::addCreator("GPUDILUFloat", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t) { const bool split_matrix = prm.get("split_matrix", true); const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); + using block_type = typename V::block_type; using VTo = Dune::BlockVector>; using matrix_type_to = typename Dune::BCRSMatrix>; diff --git a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp index 3928784ca..19687b6cc 100644 --- a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp +++ b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp @@ -81,6 +81,21 @@ GpuSparseMatrix::GpuSparseMatrix(const T* nonZeroElements, } } +template +GpuSparseMatrix::GpuSparseMatrix(const GpuVector rowIndices, + const GpuVector columnIndices, + size_t blockSize) + : m_nonZeroElements(columnIndices.dim() * blockSize * blockSize) + , m_columnIndices(columnIndices) + , 
m_rowIndices(rowIndices) + , m_numberOfNonzeroBlocks(detail::to_int(columnIndices.dim())) + , m_numberOfRows(detail::to_int(rowIndices.dim()-1)) + , m_blockSize(detail::to_int(blockSize)) + , m_matrixDescription(detail::createMatrixDescription()) + , m_cusparseHandle(detail::CuSparseHandle::getInstance()) +{ +} + template GpuSparseMatrix::~GpuSparseMatrix() { diff --git a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp index d3d6dcb66..2a2141a0f 100644 --- a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp +++ b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp @@ -67,6 +67,20 @@ public: size_t blockSize, size_t numberOfRows); + //! Create a sparse matrix by copying the sparsity structure of another matrix, not filling in the values + //! + //! \note Prefer to use the constructor taking a const reference to a matrix instead. + //! + //! \param[in] rowIndices the row indices of the non-zero elements + //! \param[in] columnIndices the column indices of the non-zero elements + //! \param[in] blockSize size of each block matrix (typically 3) + //! + //! \note We assume numberOfNonzeroBlocks, blockSize and numberOfRows all are representable as int due to + //! restrictions in the current version of cusparse. This might change in future versions. + GpuSparseMatrix(const GpuVector rowIndices, + const GpuVector columnIndices, + size_t blockSize); + /** * We don't want to be able to copy this for now (too much hassle in copying the cusparse resources) */ diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp index 6bd008864..3782ff49b 100644 --- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp +++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp @@ -41,7 +41,7 @@ namespace Opm::gpuistl { template -OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels) +OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat) : m_cpuMatrix(A) , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER)) , m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets)) @@ -49,11 +49,14 @@ OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel , m_gpuMatrix(GpuSparseMatrix::fromMatrix(m_cpuMatrix, true)) , m_gpuMatrixReorderedLower(nullptr) , m_gpuMatrixReorderedUpper(nullptr) + , m_gpuMatrixReorderedLowerFloat(nullptr) + , m_gpuMatrixReorderedUpperFloat(nullptr) , m_gpuNaturalToReorder(m_naturalToReordered) , m_gpuReorderToNatural(m_reorderedToNatural) , m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()) , m_splitMatrix(splitMatrix) , m_tuneThreadBlockSizes(tuneKernels) + , m_storeFactorizationAsFloat(storeFactorizationAsFloat) { // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode? // Some sanity check @@ -71,6 +74,7 @@ OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel fmt::format("CuSparse matrix not same number of non zeroes as DUNE matrix. {} vs {}. 
", m_gpuMatrix.nonzeroes(), A.nonzeroes())); + if (m_splitMatrix) { m_gpuMatrixReorderedDiag.emplace(GpuVector(blocksize_ * blocksize_ * m_cpuMatrix.N())); std::tie(m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper) @@ -80,6 +84,15 @@ OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel m_gpuReorderedLU = detail::createReorderedMatrix>( m_cpuMatrix, m_reorderedToNatural); } + + if (m_storeFactorizationAsFloat){ + assert(m_splitMatrix && "Mixed precision GpuILU0 is currently only supported when using split_matrix=true"); + // initialize mixed precision datastructures + m_gpuMatrixReorderedLowerFloat = std::unique_ptr(new auto(FloatMat(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_))); + m_gpuMatrixReorderedUpperFloat = std::unique_ptr(new auto(FloatMat(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_))); + m_gpuMatrixReorderedDiagFloat.emplace(GpuVector(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())); + } + LUFactorizeAndMoveData(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize); if (m_tuneThreadBlockSizes) { @@ -107,20 +120,37 @@ template void OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int upperSolveThreadBlockSize) { + // perform a lower solve and then an upper solve to apply the approximate inverse using ILU factorization + // for the lower and upper solve we have some if's that determine which underlying implementation to use + int levelStartIdx = 0; for (int level = 0; level < m_levelSets.size(); ++level) { const int numOfRowsInLevel = m_levelSets[level].size(); if (m_splitMatrix) { - detail::ILU0::solveLowerLevelSetSplit( - m_gpuMatrixReorderedLower->getNonZeroValues().data(), - m_gpuMatrixReorderedLower->getRowIndices().data(), - m_gpuMatrixReorderedLower->getColumnIndices().data(), - m_gpuReorderToNatural.data(), - levelStartIdx, - numOfRowsInLevel, - d.data(), - v.data(), - lowerSolveThreadBlockSize); + if (m_storeFactorizationAsFloat){ + detail::ILU0::solveLowerLevelSetSplit( + m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), + m_gpuMatrixReorderedLowerFloat->getRowIndices().data(), + m_gpuMatrixReorderedLowerFloat->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + d.data(), + v.data(), + lowerSolveThreadBlockSize); + } + else{ + detail::ILU0::solveLowerLevelSetSplit( + m_gpuMatrixReorderedLower->getNonZeroValues().data(), + m_gpuMatrixReorderedLower->getRowIndices().data(), + m_gpuMatrixReorderedLower->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + d.data(), + v.data(), + lowerSolveThreadBlockSize); + } } else { detail::ILU0::solveLowerLevelSet(m_gpuReorderedLU->getNonZeroValues().data(), m_gpuReorderedLU->getRowIndices().data(), @@ -140,16 +170,30 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i const int numOfRowsInLevel = m_levelSets[level].size(); levelStartIdx -= numOfRowsInLevel; if (m_splitMatrix) { - detail::ILU0::solveUpperLevelSetSplit( - m_gpuMatrixReorderedUpper->getNonZeroValues().data(), - m_gpuMatrixReorderedUpper->getRowIndices().data(), - m_gpuMatrixReorderedUpper->getColumnIndices().data(), - m_gpuReorderToNatural.data(), - levelStartIdx, - numOfRowsInLevel, - m_gpuMatrixReorderedDiag.value().data(), - v.data(), - upperSolveThreadBlockSize); + if (m_storeFactorizationAsFloat) { + detail::ILU0::solveUpperLevelSetSplit( + 
m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), + m_gpuMatrixReorderedUpperFloat->getRowIndices().data(), + m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuMatrixReorderedDiagFloat.value().data(), + v.data(), + upperSolveThreadBlockSize); + } + else{ + detail::ILU0::solveUpperLevelSetSplit( + m_gpuMatrixReorderedUpper->getNonZeroValues().data(), + m_gpuMatrixReorderedUpper->getRowIndices().data(), + m_gpuMatrixReorderedUpper->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuMatrixReorderedDiag.value().data(), + v.data(), + upperSolveThreadBlockSize); + } } else { detail::ILU0::solveUpperLevelSet(m_gpuReorderedLU->getNonZeroValues().data(), m_gpuReorderedLU->getRowIndices().data(), @@ -227,7 +271,7 @@ OpmGpuILU0::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact const int numOfRowsInLevel = m_levelSets[level].size(); if (m_splitMatrix) { - detail::ILU0::LUFactorizationSplit( + detail::ILU0::LUFactorizationSplit( m_gpuMatrixReorderedLower->getNonZeroValues().data(), m_gpuMatrixReorderedLower->getRowIndices().data(), m_gpuMatrixReorderedLower->getColumnIndices().data(), @@ -235,10 +279,14 @@ OpmGpuILU0::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact m_gpuMatrixReorderedUpper->getRowIndices().data(), m_gpuMatrixReorderedUpper->getColumnIndices().data(), m_gpuMatrixReorderedDiag.value().data(), + m_storeFactorizationAsFloat ? m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data() : nullptr, + m_storeFactorizationAsFloat ? m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data() : nullptr, + m_storeFactorizationAsFloat ? m_gpuMatrixReorderedDiagFloat.value().data() : nullptr, m_gpuReorderToNatural.data(), m_gpuNaturalToReorder.data(), levelStartIdx, numOfRowsInLevel, + m_storeFactorizationAsFloat, factorizationThreadBlockSize); } else { diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp index ec29fb566..025be1f5b 100644 --- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp +++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp @@ -54,7 +54,9 @@ public: //! \brief The field type of the preconditioner. using field_type = typename X::field_type; //! \brief The GPU matrix type - using CuMat = GpuSparseMatrix; + using GpuMat = GpuSparseMatrix; + //! \brief The Float matrix type for mixed precision + using FloatMat = GpuSparseMatrix; //! \brief Constructor. //! @@ -62,7 +64,7 @@ public: //! \param A The matrix to operate on. //! \param w The relaxation factor. //! - explicit OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels); + explicit OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat); //! \brief Prepare the preconditioner. //! \note Does nothing at the time being. @@ -120,11 +122,15 @@ private: //! \brief converts from index in natural ordered structure to index reordered strucutre std::vector m_naturalToReordered; //! \brief The A matrix stored on the gpu, and its reordred version - CuMat m_gpuMatrix; - std::unique_ptr m_gpuReorderedLU; + GpuMat m_gpuMatrix; + std::unique_ptr m_gpuReorderedLU; //! \brief If matrix splitting is enabled, then we store the lower and upper part separately - std::unique_ptr m_gpuMatrixReorderedLower; - std::unique_ptr m_gpuMatrixReorderedUpper; + std::unique_ptr m_gpuMatrixReorderedLower; + std::unique_ptr m_gpuMatrixReorderedUpper; + //! 
\brief If mixed precision is enabled, store a single precision copy of the factorization
+    std::unique_ptr<FloatMat> m_gpuMatrixReorderedLowerFloat;
+    std::unique_ptr<FloatMat> m_gpuMatrixReorderedUpperFloat;
+    std::optional<GpuVector<float>> m_gpuMatrixReorderedDiagFloat;
     //! \brief If matrix splitting is enabled, we also store the diagonal separately
     std::optional<GpuVector<field_type>> m_gpuMatrixReorderedDiag;
     //! row conversion from natural to reordered matrix indices stored on the GPU
@@ -137,6 +143,9 @@
     bool m_splitMatrix;
     //! \brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards
     bool m_tuneThreadBlockSizes;
+    //! \brief Bool storing whether or not we should store the ILU factorization in a float data structure.
+    //! This uses a mixed precision preconditioner to trade numerical accuracy for memory transfer speed.
+    bool m_storeFactorizationAsFloat;
     //! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards
     //! The default value of -1 indicates that we have not calibrated and selected a value yet
     int m_upperSolveThreadBlockSize = -1;
diff --git a/opm/simulators/linalg/gpuistl/detail/deviceBlockOperations.hpp b/opm/simulators/linalg/gpuistl/detail/deviceBlockOperations.hpp
index b3475591a..d590ca365 100644
--- a/opm/simulators/linalg/gpuistl/detail/deviceBlockOperations.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/deviceBlockOperations.hpp
@@ -37,9 +37,9 @@ __device__ __forceinline__ void
 invBlockOutOfPlace(const T* __restrict__ srcBlock, T* __restrict__ dstBlock)
 {
     if (blocksize == 1) {
-        dstBlock[0] = 1.0 / (srcBlock[0]);
+        dstBlock[0] = T(1.0) / (srcBlock[0]);
     } else if (blocksize == 2) {
-        T detInv = 1.0 / (srcBlock[0] * srcBlock[3] - srcBlock[1] * srcBlock[2]);
+        T detInv = T(1.0) / (srcBlock[0] * srcBlock[3] - srcBlock[1] * srcBlock[2]);
         dstBlock[0] = srcBlock[3] * detInv;
         dstBlock[1] = -srcBlock[1] * detInv;
         dstBlock[2] = -srcBlock[2] * detInv;
@@ -53,7 +53,7 @@ invBlockOutOfPlace(const T* __restrict__ srcBlock, T* __restrict__ dstBlock)
         T t12 = srcBlock[1] * srcBlock[6];
         T t14 = srcBlock[2] * srcBlock[6];

-        T t17 = 1.0
+        T t17 = T(1.0)
             / (t4 * srcBlock[8] - t6 * srcBlock[7] - t8 * srcBlock[8] + t10 * srcBlock[7] + t12 * srcBlock[5]
                - t14 * srcBlock[4]); // t17 is 1/determinant
@@ -75,9 +75,9 @@ __device__ __forceinline__ void
 invBlockInPlace(T* __restrict__ block)
 {
     if (blocksize == 1) {
-        block[0] = 1.0 / (block[0]);
+        block[0] = T(1.0) / (block[0]);
     } else if (blocksize == 2) {
-        T detInv = 1.0 / (block[0] * block[3] - block[1] * block[2]);
+        T detInv = T(1.0) / (block[0] * block[3] - block[1] * block[2]);

         T temp = block[0];
         block[0] = block[3] * detInv;
@@ -93,7 +93,7 @@ invBlockInPlace(T* __restrict__ block)
         T t14 = block[2] * block[6];

         T det = (t4 * block[8] - t6 * block[7] - t8 * block[8] + t10 * block[7] + t12 * block[5] - t14 * block[4]);
-        T t17 = 1.0 / det;
+        T t17 = T(1.0) / det;

         T matrix01 = block[1];
         T matrix00 = block[0];
@@ -229,6 +229,46 @@ matrixSubtraction(T* A, T* B)
         }
     }
 }
+
+// TODO: possibly place somewhere else in this file
+// TODO: consider merging with existing block operations
+template
+__device__ __forceinline__ void
+moveBlock(const ScalarInputType* __restrict__ A, ScalarOutputType* __restrict__ B)
+{
+    for (int i = 0; i < blocksize; ++i) {
+        for (int j = 0; j < blocksize; ++j) {
+            B[i * blocksize + j] = ScalarOutputType(A[i * blocksize + j]);
+        }
+    }
+}
+
+// TODO: possibly place somewhere else in this file
+// TODO: consider merging with existing block operations
+template
+__device__ __forceinline__ void
+mvMixedGeneral(const MatrixScalar* A,
const VectorScalar* b, ResultScalar* c) +{ + for (int i = 0; i < blocksize; ++i) { + c[i] = 0; + + for (int j = 0; j < blocksize; ++j) { + c[i] += ResultScalar(ComputeScalar(A[i * blocksize + j]) * ComputeScalar(b[j])); + } + } +} + +// TODO: possibly place somewhere else in this file +// TODO: consider merging with existing block operations +template +__device__ __forceinline__ void +mmvMixedGeneral(const MatrixScalar* A, const VectorScalar* b, ResultScalar* c) +{ + for (int i = 0; i < blocksize; ++i) { + for (int j = 0; j < blocksize; ++j) { + c[i] -= ResultScalar(ComputeScalar(A[i * blocksize + j]) * ComputeScalar(b[j])); + } + } +} } // namespace #endif diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu index e58fbb18e..8d1dd642f 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu @@ -120,17 +120,21 @@ namespace } } - template - __global__ void cuLUFactorizationSplit(T* reorderedLowerMat, + template + __global__ void cuLUFactorizationSplit(InputScalar* srcReorderedLowerMat, int* lowerRowIndices, int* lowerColIndices, - T* reorderedUpperMat, + InputScalar* srcReorderedUpperMat, int* upperRowIndices, int* upperColIndices, - T* diagonal, + InputScalar* srcDiagonal, + OutputScalar* dstReorderedLowerMat, + OutputScalar* dstReorderedUpperMat, + OutputScalar* dstDiagonal, int* reorderedToNatural, int* naturalToReordered, const int startIdx, + const bool copyResultToOtherMatrix, int rowsInLevelSet) { auto reorderedIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x; @@ -155,9 +159,12 @@ namespace // the DUNE implementation is inplace, and I need to take care to make sure // this out of place implementation is equivalent - mmOverlap(&reorderedLowerMat[ij * scalarsInBlock], - &diagonal[j * scalarsInBlock], - &reorderedLowerMat[ij * scalarsInBlock]); + mmOverlap(&srcReorderedLowerMat[ij * scalarsInBlock], + &srcDiagonal[j * scalarsInBlock], + &srcReorderedLowerMat[ij * scalarsInBlock]); + if (copyResultToOtherMatrix){ + moveBlock(&srcReorderedLowerMat[ij * scalarsInBlock], &dstReorderedLowerMat[ij * scalarsInBlock]); + } // we have now accounted for an element under the diagonal and the diagonal element above it // now iterate over the blocks that align in the @@ -172,40 +179,53 @@ namespace // the same block index might be found in the lower part of the matrix while (!(ik == endOfRowIUpper && ikState == POSITION_TYPE::ABOVE_DIAG) && jk != endOfRowJ) { - T* ikBlockPtr; + InputScalar* ikBlockPtr; + OutputScalar* dstIkBlockPtr; int ikColumn; if (ikState == POSITION_TYPE::UNDER_DIAG) { - ikBlockPtr = &reorderedLowerMat[ik * scalarsInBlock]; + ikBlockPtr = &srcReorderedLowerMat[ik * scalarsInBlock]; + if (copyResultToOtherMatrix) + dstIkBlockPtr = &dstReorderedLowerMat[ik * scalarsInBlock]; ikColumn = lowerColIndices[ik]; } else if (ikState == POSITION_TYPE::ON_DIAG) { - ikBlockPtr = &diagonal[reorderedIdx * scalarsInBlock]; + ikBlockPtr = &srcDiagonal[reorderedIdx * scalarsInBlock]; + if (copyResultToOtherMatrix) + dstIkBlockPtr = &dstDiagonal[reorderedIdx * scalarsInBlock]; ikColumn = naturalIdx; } else { // ikState == POSITION_TYPE::ABOVE_DIAG - ikBlockPtr = &reorderedUpperMat[ik * scalarsInBlock]; + ikBlockPtr = &srcReorderedUpperMat[ik * scalarsInBlock]; + if (copyResultToOtherMatrix) + dstIkBlockPtr = &dstReorderedUpperMat[ik * scalarsInBlock]; ikColumn = 
upperColIndices[ik]; } if (ikColumn == upperColIndices[jk]) { // A_jk = A_ij * A_jk - T tmp[scalarsInBlock] = {0}; + InputScalar tmp[scalarsInBlock] = {0}; - mmNoOverlap( - &reorderedLowerMat[ij * scalarsInBlock], &reorderedUpperMat[jk * scalarsInBlock], tmp); - matrixSubtraction(ikBlockPtr, tmp); + mmNoOverlap( + &srcReorderedLowerMat[ij * scalarsInBlock], &srcReorderedUpperMat[jk * scalarsInBlock], tmp); + matrixSubtraction(ikBlockPtr, tmp); incrementAcrossSplitStructure(ik, ikState, endOfRowILower, startOfRowIUpper); - ; ++jk; } else { if (ikColumn < upperColIndices[jk]) { incrementAcrossSplitStructure(ik, ikState, endOfRowILower, startOfRowIUpper); - ; } else { ++jk; } } } } - invBlockInPlace(&diagonal[reorderedIdx * scalarsInBlock]); + invBlockInPlace(&srcDiagonal[reorderedIdx * scalarsInBlock]); + if (copyResultToOtherMatrix){ + moveBlock(&srcDiagonal[reorderedIdx * scalarsInBlock], &dstDiagonal[reorderedIdx * scalarsInBlock]); + + // also move all values above the diagonal on this row + for (int block = startOfRowIUpper; block < endOfRowIUpper; ++block) { + moveBlock(&srcReorderedUpperMat[block * scalarsInBlock], &dstReorderedUpperMat[block * scalarsInBlock]); + } + } } } @@ -266,15 +286,15 @@ namespace } } - template - __global__ void cuSolveLowerLevelSetSplit(T* mat, + template + __global__ void cuSolveLowerLevelSetSplit(MatrixScalar* mat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const T* d, - T* v) + const LinearSolverScalar* d, + LinearSolverScalar* v) { auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); if (reorderedIdx < rowsInLevelSet + startIdx) { @@ -282,30 +302,31 @@ namespace const size_t nnzIdxLim = rowIndices[reorderedIdx + 1]; const int naturalRowIdx = indexConversion[reorderedIdx]; - T rhs[blocksize]; + LinearSolverScalar rhs[blocksize]; for (int i = 0; i < blocksize; i++) { rhs[i] = d[naturalRowIdx * blocksize + i]; } for (int block = nnzIdx; block < nnzIdxLim; ++block) { const int col = colIndices[block]; - mmv(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); + mmvMixedGeneral( + &mat[block * blocksize * blocksize], &v[col * blocksize], rhs); } for (int i = 0; i < blocksize; ++i) { - v[naturalRowIdx * blocksize + i] = rhs[i]; + v[naturalRowIdx * blocksize + i] = LinearSolverScalar(rhs[i]); } } } - template - __global__ void cuSolveUpperLevelSetSplit(T* mat, + template + __global__ void cuSolveUpperLevelSetSplit(MatrixScalar* mat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const T* dInv, - T* v) + const MatrixScalar* dInv, + LinearSolverScalar* v) { auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); if (reorderedIdx < rowsInLevelSet + startIdx) { @@ -313,17 +334,17 @@ namespace const size_t nnzIdxLim = rowIndices[reorderedIdx + 1]; const int naturalRowIdx = indexConversion[reorderedIdx]; - T rhs[blocksize]; + LinearSolverScalar rhs[blocksize]; for (int i = 0; i < blocksize; i++) { rhs[i] = v[naturalRowIdx * blocksize + i]; } for (int block = nnzIdx; block < nnzIdxLim; ++block) { const int col = colIndices[block]; - mmv(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); + mmvMixedGeneral(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); } - mv(&dInv[reorderedIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); + mvMixedGeneral(&dInv[reorderedIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); } } } // namespace @@ -365,41 +386,41 @@ solveUpperLevelSet(T* 
reorderedMat, reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, v); } -template +template void -solveLowerLevelSetSplit(T* reorderedMat, +solveLowerLevelSetSplit(MatrixScalar* reorderedMat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const T* d, - T* v, + const LinearSolverScalar* d, + LinearSolverScalar* v, int thrBlockSize) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( - cuSolveLowerLevelSetSplit, thrBlockSize); + cuSolveLowerLevelSetSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuSolveLowerLevelSetSplit<<>>( + cuSolveLowerLevelSetSplit<<>>( reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v); } // perform the upper solve for all rows in the same level set -template +template void -solveUpperLevelSetSplit(T* reorderedMat, +solveUpperLevelSetSplit(MatrixScalar* reorderedMat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const T* dInv, - T* v, + const MatrixScalar* dInv, + LinearSolverScalar* v, int thrBlockSize) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( - cuSolveUpperLevelSetSplit, thrBlockSize); + cuSolveUpperLevelSetSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuSolveUpperLevelSetSplit<<>>( + cuSolveUpperLevelSetSplit<<>>( reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); } @@ -421,34 +442,42 @@ LUFactorization(T* srcMatrix, srcMatrix, srcRowIndices, srcColumnIndices, naturalToReordered, reorderedToNatual, rowsInLevelSet, startIdx); } -template +template void -LUFactorizationSplit(T* reorderedLowerMat, +LUFactorizationSplit(InputScalar* srcReorderedLowerMat, int* lowerRowIndices, int* lowerColIndices, - T* reorderedUpperMat, + InputScalar* reorderedUpperMat, int* upperRowIndices, int* upperColIndices, - T* diagonal, + InputScalar* srcDiagonal, + OutputScalar* dstReorderedLowerMat, + OutputScalar* dstReorderedUpperMat, + OutputScalar* dstDiagonal, int* reorderedToNatural, int* naturalToReordered, const int startIdx, int rowsInLevelSet, + bool copyResultToOtherMatrix, int thrBlockSize) { int threadBlockSize - = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorizationSplit, thrBlockSize); + = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorizationSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuLUFactorizationSplit<<>>(reorderedLowerMat, + cuLUFactorizationSplit<<>>(srcReorderedLowerMat, lowerRowIndices, lowerColIndices, reorderedUpperMat, upperRowIndices, upperColIndices, - diagonal, + srcDiagonal, + dstReorderedLowerMat, + dstReorderedUpperMat, + dstDiagonal, reorderedToNatural, naturalToReordered, startIdx, + copyResultToOtherMatrix, rowsInLevelSet); } @@ -456,10 +485,12 @@ LUFactorizationSplit(T* reorderedLowerMat, template void solveUpperLevelSet(T*, int*, int*, int*, int, int, T*, int); \ template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int); \ template void LUFactorization(T*, int*, int*, int*, int*, size_t, int, int); \ - template void LUFactorizationSplit( \ - T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, int); \ - template void solveUpperLevelSetSplit(T*, int*, int*, int*, int, int, const T*, T*, int); \ - 
template void solveLowerLevelSetSplit(T*, int*, int*, int*, int, int, const T*, T*, int);
+    template void LUFactorizationSplit(                                                                       \
+        T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, bool, int);   \
+    template void LUFactorizationSplit(                                                                       \
+        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, bool, int);

 INSTANTIATE_KERNEL_WRAPPERS(float, 1);
 INSTANTIATE_KERNEL_WRAPPERS(float, 2);
@@ -473,4 +504,26 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 3);
 INSTANTIATE_KERNEL_WRAPPERS(double, 4);
 INSTANTIATE_KERNEL_WRAPPERS(double, 5);
 INSTANTIATE_KERNEL_WRAPPERS(double, 6);
+
+#define INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(blocksize)                                                \
+    /* double preconditioner */                                                                               \
+    template void solveLowerLevelSetSplit(double*, int*, int*, int*, int, int, const double*, double*, int);  \
+    /* float matrix, double compute preconditioner */                                                         \
+    template void solveLowerLevelSetSplit(float*, int*, int*, int*, int, int, const double*, double*, int);   \
+    /* float preconditioner */                                                                                \
+    template void solveLowerLevelSetSplit(float*, int*, int*, int*, int, int, const float*, float*, int);     \
+                                                                                                              \
+    /* double preconditioner */                                                                               \
+    template void solveUpperLevelSetSplit(double*, int*, int*, int*, int, int, const double*, double*, int);  \
+    /* float matrix, double compute preconditioner */                                                         \
+    template void solveUpperLevelSetSplit(float*, int*, int*, int*, int, int, const float*, double*, int);    \
+    /* float preconditioner */                                                                                \
+    template void solveUpperLevelSetSplit(float*, int*, int*, int*, int, int, const float*, float*, int);
+
+INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(1);
+INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(2);
+INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(3);
+INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(4);
+INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(5);
+INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(6);
 } // namespace Opm::gpuistl::detail::ILU0
diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp
index 99e59ba13..42e5c2fbb 100644
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp
@@ -89,15 +89,15 @@ void solveLowerLevelSet(T* reorderedMat,
 * solve
 * @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
 */
-template
-void solveUpperLevelSetSplit(T* reorderedMat,
+template
+void solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                             int* rowIndices,
                             int* colIndices,
                             int* indexConversion,
                             int startIdx,
                             int rowsInLevelSet,
-                            const T* dInv,
-                            T* v,
+                            const MatrixScalar* dInv,
+                            LinearSolverScalar* v,
                             int threadBlockSize);

 /**
@@ -117,15 +117,15 @@ void solveUpperLevelSetSplit(T* reorderedMat,
 * solve
 * @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
 */
-template
-void solveLowerLevelSetSplit(T* reorderedLowerMat,
+template
+void solveLowerLevelSetSplit(MatrixScalar* reorderedLowerMat,
                             int* rowIndices,
                             int* colIndices,
                             int* indexConversion,
                             int startIdx,
                             int rowsInLevelSet,
-                            const T* d,
-                            T* v,
+                            const LinearSolverScalar* d,
+                            LinearSolverScalar* v,
                             int threadBlockSize);

 /**
@@ -155,6 +155,7 @@ void LUFactorization(T* reorderedMat,
                      int threadBlockSize);

 /**
+ * TODO: update this documentation to describe the new mixed precision (src/dst) parameters
  * @brief Computes the ILU0 factorization in-place of a bcsr matrix stored in a split format (lower, diagonal and upper
  * triangular part)
  * @param [out] reorderedLowerMat pointer to GPU memory containing nonzerovalues of the strictly lower triangular sparse
@@ -175,17 +176,21 @@
  * function
  * @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
  */
-template
-void LUFactorizationSplit(T* reorderedLowerMat,
+template
+void LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
                          int* lowerRowIndices,
                          int* lowerColIndices,
-                         T* reorderedUpperMat,
+                         InputScalar* srcReorderedUpperMat,
                          int* upperRowIndices,
                          int* upperColIndices,
-                         T* diagonal,
+                         InputScalar* srcDiagonal,
+                         OutputScalar* dstReorderedLowerMat,
+                         OutputScalar* dstReorderedUpperMat,
+                         OutputScalar* dstDiagonal,
                          int* reorderedToNatural,
                          int* naturalToReordered,
                          int startIdx,
                          int rowsInLevelSet,
+                         bool copyResultToOtherMatrix,
                          int threadBlockSize);
 } // namespace Opm::gpuistl::detail::ILU0
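
Note (editorial addition, not part of the patch): the new behaviour is opt-in. Through the
PreconditionerFactory it is selected with the property-tree key "store_factorization_as_float"
(default false), alongside the existing "split_matrix" and "tune_gpu_kernels" keys, and it is
only supported together with the split-matrix storage path. The sketch below shows direct
construction of the preconditioner with the option enabled. It is illustrative only: the
template arguments chosen for OpmGpuILU0 and GpuVector are assumptions (the factory code above
elides them), and the surrounding function is hypothetical.

    #include <dune/common/fmatrix.hh>
    #include <dune/istl/bcrsmatrix.hh>
    #include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
    #include <opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp>
    #include <memory>

    using Matrix = Dune::BCRSMatrix<Dune::FieldMatrix<double, 3, 3>>;
    using GpuVec = Opm::gpuistl::GpuVector<double>;
    using GpuILU0 = Opm::gpuistl::OpmGpuILU0<Matrix, GpuVec, GpuVec>;

    // Hypothetical helper: A is the assembled system matrix.
    std::shared_ptr<GpuILU0> makeMixedPrecisionIlu0(const Matrix& A)
    {
        // storeFactorizationAsFloat=true keeps the LU factors and the inverted diagonal blocks
        // in single precision to reduce memory traffic in the triangular solves; the
        // factorization itself is still computed in the matrix' native (double) precision
        // and then copied to the float buffers.
        return std::make_shared<GpuILU0>(A,
                                         /*splitMatrix=*/true, // required for the mixed precision path
                                         /*tuneKernels=*/true,
                                         /*storeFactorizationAsFloat=*/true);
    }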