From d1e5a694766848c45377df4885e67b96a7b0ba5a Mon Sep 17 00:00:00 2001 From: Tobias Meyer Andersen Date: Wed, 16 Oct 2024 15:37:16 +0200 Subject: [PATCH] Add new MP scheme to GPU ILU and DILU --- CMakeLists_files.cmake | 1 + .../linalg/PreconditionerFactory_impl.hpp | 20 ++-- opm/simulators/linalg/gpuistl/GpuDILU.cpp | 68 ++++++++--- opm/simulators/linalg/gpuistl/GpuDILU.hpp | 7 +- opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp | 56 +++++++-- opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp | 8 +- .../linalg/gpuistl/detail/kernel_enums.hpp | 109 ++++++++++++++++++ .../preconditionerKernels/DILUKernels.cu | 102 ++++++++-------- .../preconditionerKernels/DILUKernels.hpp | 11 +- .../preconditionerKernels/ILU0Kernels.cu | 96 ++++++++------- .../preconditionerKernels/ILU0Kernels.hpp | 7 +- 11 files changed, 340 insertions(+), 145 deletions(-) create mode 100644 opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake index e91d5729a..8978979d0 100644 --- a/CMakeLists_files.cmake +++ b/CMakeLists_files.cmake @@ -254,6 +254,7 @@ if (HAVE_CUDA) ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/DILUKernels.cu) ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/ILU0Kernels.cu) ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/JacKernels.cu) + ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/kernel_enums.hpp) ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg GpuVector.cpp) ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg GpuView.cpp) ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/vector_operations.cu) diff --git a/opm/simulators/linalg/PreconditionerFactory_impl.hpp b/opm/simulators/linalg/PreconditionerFactory_impl.hpp index f5e3f2fdb..58e863928 100644 --- a/opm/simulators/linalg/PreconditionerFactory_impl.hpp +++ b/opm/simulators/linalg/PreconditionerFactory_impl.hpp @@ -357,10 +357,10 @@ struct StandardPreconditioners { F::addCreator("GPUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t, const C& comm) { const bool split_matrix = prm.get("split_matrix", true); const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); - const bool store_factorization_as_float = prm.get("store_factorization_as_float", false); + const int mixed_precision_scheme = prm.get("mixed_precision_scheme", 0); using field_type = typename V::field_type; using GpuDILU = typename gpuistl::GpuDILU, gpuistl::GpuVector>; - auto gpuDILU = std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float); + auto gpuDILU = std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme); auto adapted = std::make_shared>(gpuDILU); auto wrapped = std::make_shared>(adapted, comm); @@ -370,10 +370,10 @@ struct StandardPreconditioners { F::addCreator("OPMGPUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t, const C& comm) { const bool split_matrix = prm.get("split_matrix", true); const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); - const bool store_factorization_as_float = prm.get("store_factorization_as_float", false); + const int mixed_precision_scheme = prm.get("mixed_precision_scheme", 0); using field_type = typename V::field_type; using OpmGpuILU0 = typename gpuistl::OpmGpuILU0, gpuistl::GpuVector>; - auto gpuilu0 = std::make_shared(op.getmat(), split_matrix, 
tune_gpu_kernels, store_factorization_as_float); + auto gpuilu0 = std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme); auto adapted = std::make_shared>(gpuilu0); auto wrapped = std::make_shared>(adapted, comm); @@ -650,27 +650,27 @@ struct StandardPreconditioners { F::addCreator("OPMGPUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t) { const bool split_matrix = prm.get("split_matrix", true); const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); - const bool store_factorization_as_float = prm.get("store_factorization_as_float", false); + const int mixed_precision_scheme = prm.get("mixed_precision_scheme", 0); using field_type = typename V::field_type; using GPUILU0 = typename gpuistl::OpmGpuILU0, gpuistl::GpuVector>; - return std::make_shared>(std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float)); + return std::make_shared>(std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme)); }); F::addCreator("GPUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t) { const bool split_matrix = prm.get("split_matrix", true); const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); - const bool store_factorization_as_float = prm.get("store_factorization_as_float", false); + const int mixed_precision_scheme = prm.get("mixed_precision_scheme", 0); using field_type = typename V::field_type; using GPUDILU = typename gpuistl::GpuDILU, gpuistl::GpuVector>; - return std::make_shared>(std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float)); + return std::make_shared>(std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme)); }); F::addCreator("GPUDILUFloat", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t) { const bool split_matrix = prm.get("split_matrix", true); const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); - const bool store_factorization_as_float = prm.get("store_factorization_as_float", false); + const int mixed_precision_scheme = prm.get("mixed_precision_scheme", 0); using block_type = typename V::block_type; using VTo = Dune::BlockVector>; @@ -679,7 +679,7 @@ struct StandardPreconditioners { using Adapter = typename gpuistl::PreconditionerAdapter; using Converter = typename gpuistl::PreconditionerConvertFieldTypeAdapter; auto converted = std::make_shared(op.getmat()); - auto adapted = std::make_shared(std::make_shared(converted->getConvertedMatrix(), split_matrix, tune_gpu_kernels, store_factorization_as_float)); + auto adapted = std::make_shared(std::make_shared(converted->getConvertedMatrix(), split_matrix, tune_gpu_kernels, mixed_precision_scheme)); converted->setUnderlyingPreconditioner(adapted); return converted; }); diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.cpp b/opm/simulators/linalg/gpuistl/GpuDILU.cpp index 2097f1cc7..d311386b0 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.cpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.cpp @@ -41,7 +41,7 @@ namespace Opm::gpuistl { template -GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat) +GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme) : m_cpuMatrix(A) , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER)) , m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets)) @@ -52,8 +52,7 @@ GpuDILU::GpuDILU(const M& A, 
bool splitMatrix, bool tuneKernels, boo , m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()) , m_splitMatrix(splitMatrix) , m_tuneThreadBlockSizes(tuneKernels) - , m_storeFactorizationAsFloat(storeFactorizationAsFloat) - + , m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme)) { // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode? // Some sanity check @@ -82,14 +81,16 @@ GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, boo m_cpuMatrix, m_reorderedToNatural); } - if (m_storeFactorizationAsFloat) { + if (m_mixedPrecisionScheme != MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG) { if (!m_splitMatrix){ OPM_THROW(std::runtime_error, "Matrix must be split when storing as float."); } m_gpuMatrixReorderedLowerFloat = std::make_unique(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_); m_gpuMatrixReorderedUpperFloat = std::make_unique(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_); - m_gpuMatrixReorderedDiagFloat = std::make_unique(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()); - m_gpuDInvFloat = std::make_unique(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()); + + if (m_mixedPrecisionScheme == MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG) { + m_gpuDInvFloat = std::make_unique(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()); + } } computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize); @@ -123,8 +124,8 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int for (int level = 0; level < m_levelSets.size(); ++level) { const int numOfRowsInLevel = m_levelSets[level].size(); if (m_splitMatrix) { - if (m_storeFactorizationAsFloat) { - detail::DILU::solveLowerLevelSetSplit( + if (m_mixedPrecisionScheme == MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG) { + detail::DILU::solveLowerLevelSetSplit( m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), m_gpuMatrixReorderedLowerFloat->getRowIndices().data(), m_gpuMatrixReorderedLowerFloat->getColumnIndices().data(), @@ -135,8 +136,20 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int d.data(), v.data(), lowerSolveThreadBlockSize); + } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) { + detail::DILU::solveLowerLevelSetSplit( + m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), + m_gpuMatrixReorderedLowerFloat->getRowIndices().data(), + m_gpuMatrixReorderedLowerFloat->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInv.data(), + d.data(), + v.data(), + lowerSolveThreadBlockSize); } else { - detail::DILU::solveLowerLevelSetSplit( + detail::DILU::solveLowerLevelSetSplit( m_gpuMatrixReorderedLower->getNonZeroValues().data(), m_gpuMatrixReorderedLower->getRowIndices().data(), m_gpuMatrixReorderedLower->getColumnIndices().data(), @@ -170,7 +183,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int const int numOfRowsInLevel = m_levelSets[level].size(); levelStartIdx -= numOfRowsInLevel; if (m_splitMatrix) { - if (m_storeFactorizationAsFloat){ + if (m_mixedPrecisionScheme == MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG){ detail::DILU::solveUpperLevelSetSplit( m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), m_gpuMatrixReorderedUpperFloat->getRowIndices().data(), @@ -181,6 +194,17 @@ 
GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int m_gpuDInvFloat->data(), v.data(), upperSolveThreadBlockSize); + } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG){ + detail::DILU::solveUpperLevelSetSplit( + m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), + m_gpuMatrixReorderedUpperFloat->getRowIndices().data(), + m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInv.data(), + v.data(), + upperSolveThreadBlockSize); } else { detail::DILU::solveUpperLevelSetSplit( m_gpuMatrixReorderedUpper->getNonZeroValues().data(), @@ -271,8 +295,8 @@ GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in for (int level = 0; level < m_levelSets.size(); ++level) { const int numOfRowsInLevel = m_levelSets[level].size(); if (m_splitMatrix) { - if (m_storeFactorizationAsFloat) { - detail::DILU::computeDiluDiagonalSplit( + if (m_mixedPrecisionScheme == MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG) { + detail::DILU::computeDiluDiagonalSplit( m_gpuMatrixReorderedLower->getNonZeroValues().data(), m_gpuMatrixReorderedLower->getRowIndices().data(), m_gpuMatrixReorderedLower->getColumnIndices().data(), @@ -289,9 +313,27 @@ GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), factorizationBlockSize); + } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) { + detail::DILU::computeDiluDiagonalSplit( + m_gpuMatrixReorderedLower->getNonZeroValues().data(), + m_gpuMatrixReorderedLower->getRowIndices().data(), + m_gpuMatrixReorderedLower->getColumnIndices().data(), + m_gpuMatrixReorderedUpper->getNonZeroValues().data(), + m_gpuMatrixReorderedUpper->getRowIndices().data(), + m_gpuMatrixReorderedUpper->getColumnIndices().data(), + m_gpuMatrixReorderedDiag->data(), + m_gpuReorderToNatural.data(), + m_gpuNaturalToReorder.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInv.data(), + nullptr, + m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), + m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), + factorizationBlockSize); } else { // TODO: should this be field type twice or field type then float in the template? - detail::DILU::computeDiluDiagonalSplit( + detail::DILU::computeDiluDiagonalSplit( m_gpuMatrixReorderedLower->getNonZeroValues().data(), m_gpuMatrixReorderedLower->getRowIndices().data(), m_gpuMatrixReorderedLower->getColumnIndices().data(), diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.hpp b/opm/simulators/linalg/gpuistl/GpuDILU.hpp index 623e15132..241eef3e9 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.hpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.hpp @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -62,7 +63,7 @@ public: //! \param A The matrix to operate on. //! \param w The relaxation factor. //! - explicit GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat); + explicit GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme); //! \brief Prepare the preconditioner. //! \note Does nothing at the time being. @@ -144,8 +145,8 @@ private: bool m_splitMatrix; //! \brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards bool m_tuneThreadBlockSizes; - //! \brief Bool storing whether or not we will store the factorization as float. 
Only used for mixed precision - bool m_storeFactorizationAsFloat; + //! \brief Enum describing how we should store the factorized matrix + const MatrixStorageMPScheme m_mixedPrecisionScheme; //! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards //! The default value of -1 indicates that we have not calibrated and selected a value yet int m_upperSolveThreadBlockSize = -1; diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp index 5bd663aed..686b8f09f 100644 --- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp +++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp @@ -41,7 +41,7 @@ namespace Opm::gpuistl { template -OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat) +OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme) : m_cpuMatrix(A) , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER)) , m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets)) @@ -56,8 +56,9 @@ OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel , m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()) , m_splitMatrix(splitMatrix) , m_tuneThreadBlockSizes(tuneKernels) - , m_storeFactorizationAsFloat(storeFactorizationAsFloat) + , m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme)) { + // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode? // Some sanity check OPM_ERROR_IF(A.N() != m_gpuMatrix.N(), @@ -85,13 +86,16 @@ OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel m_cpuMatrix, m_reorderedToNatural); } - if (m_storeFactorizationAsFloat){ + if (m_mixedPrecisionScheme != MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG){ OPM_ERROR_IF(!m_splitMatrix, "Mixed precision GpuILU0 is currently only supported when using split_matrix=true"); // initialize mixed precision datastructures m_gpuMatrixReorderedLowerFloat = std::make_unique(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_); m_gpuMatrixReorderedUpperFloat = std::make_unique(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_); - m_gpuMatrixReorderedDiagFloat.emplace(GpuVector(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())); + // The MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG does not need to allocate this float vector + if (m_mixedPrecisionScheme == MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG) { + m_gpuMatrixReorderedDiagFloat.emplace(GpuVector(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())); + } } LUFactorizeAndMoveData(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize); @@ -128,7 +132,7 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i for (int level = 0; level < m_levelSets.size(); ++level) { const int numOfRowsInLevel = m_levelSets[level].size(); if (m_splitMatrix) { - if (m_storeFactorizationAsFloat){ + if (m_mixedPrecisionScheme != MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG) { detail::ILU0::solveLowerLevelSetSplit( m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), m_gpuMatrixReorderedLowerFloat->getRowIndices().data(), @@ -171,8 +175,8 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i const int numOfRowsInLevel = m_levelSets[level].size(); levelStartIdx -= numOfRowsInLevel; if (m_splitMatrix) { - if 
(m_storeFactorizationAsFloat) { - detail::ILU0::solveUpperLevelSetSplit( + if (m_mixedPrecisionScheme == MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG) { + detail::ILU0::solveUpperLevelSetSplit( m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), m_gpuMatrixReorderedUpperFloat->getRowIndices().data(), m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(), @@ -183,8 +187,20 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i v.data(), upperSolveThreadBlockSize); } + else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) { + detail::ILU0::solveUpperLevelSetSplit( + m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), + m_gpuMatrixReorderedUpperFloat->getRowIndices().data(), + m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuMatrixReorderedDiag.value().data(), + v.data(), + upperSolveThreadBlockSize); + } else{ - detail::ILU0::solveUpperLevelSetSplit( + detail::ILU0::solveUpperLevelSetSplit( m_gpuMatrixReorderedUpper->getNonZeroValues().data(), m_gpuMatrixReorderedUpper->getRowIndices().data(), m_gpuMatrixReorderedUpper->getColumnIndices().data(), @@ -272,8 +288,8 @@ OpmGpuILU0::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact const int numOfRowsInLevel = m_levelSets[level].size(); if (m_splitMatrix) { - if (m_storeFactorizationAsFloat){ - detail::ILU0::LUFactorizationSplit( + if (m_mixedPrecisionScheme == MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG){ + detail::ILU0::LUFactorizationSplit( m_gpuMatrixReorderedLower->getNonZeroValues().data(), m_gpuMatrixReorderedLower->getRowIndices().data(), m_gpuMatrixReorderedLower->getColumnIndices().data(), @@ -290,8 +306,26 @@ OpmGpuILU0::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact numOfRowsInLevel, factorizationThreadBlockSize); } + else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG){ + detail::ILU0::LUFactorizationSplit( + m_gpuMatrixReorderedLower->getNonZeroValues().data(), + m_gpuMatrixReorderedLower->getRowIndices().data(), + m_gpuMatrixReorderedLower->getColumnIndices().data(), + m_gpuMatrixReorderedUpper->getNonZeroValues().data(), + m_gpuMatrixReorderedUpper->getRowIndices().data(), + m_gpuMatrixReorderedUpper->getColumnIndices().data(), + m_gpuMatrixReorderedDiag.value().data(), + m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), + m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), + nullptr, + m_gpuReorderToNatural.data(), + m_gpuNaturalToReorder.data(), + levelStartIdx, + numOfRowsInLevel, + factorizationThreadBlockSize); + } else{ - detail::ILU0::LUFactorizationSplit( + detail::ILU0::LUFactorizationSplit( m_gpuMatrixReorderedLower->getNonZeroValues().data(), m_gpuMatrixReorderedLower->getRowIndices().data(), m_gpuMatrixReorderedLower->getColumnIndices().data(), diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp index 025be1f5b..c163e0483 100644 --- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp +++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -64,7 +65,7 @@ public: //! \param A The matrix to operate on. //! \param w The relaxation factor. //! - explicit OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat); + explicit OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme); //! \brief Prepare the preconditioner. 
//! \note Does nothing at the time being. @@ -143,9 +144,8 @@ private: bool m_splitMatrix; //! \brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards bool m_tuneThreadBlockSizes; - //! \brief Bool storing whether or not we should store the ILU factorization in a float datastructure. - //! This uses a mixed precision preconditioner to trade numerical accuracy for memory transfer speed. - bool m_storeFactorizationAsFloat; + //! \brief Enum storing how we should store the factorized matrix + const MatrixStorageMPScheme m_mixedPrecisionScheme; //! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards //! The default value of -1 indicates that we have not calibrated and selected a value yet int m_upperSolveThreadBlockSize = -1; diff --git a/opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp b/opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp new file mode 100644 index 000000000..9c9bc1931 --- /dev/null +++ b/opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp @@ -0,0 +1,109 @@ +/* + Copyright 2024 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#ifndef OPM_GPUISTL_KERNEL_ENUMS_HPP +#define OPM_GPUISTL_KERNEL_ENUMS_HPP + +#include + +/* + This file organizes a growing number of mixed precision options for the preconditioners.
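+
+    The scheme is selected with the integer preconditioner parameter "mixed_precision_scheme"
+    (read via prm.get("mixed_precision_scheme", 0) in PreconditionerFactory_impl.hpp above):
+    0 keeps the whole factorization in double, 1 stores both the diagonal and the
+    off-diagonal blocks as float, and 2 keeps the diagonal in double while storing the
+    off-diagonal blocks as float.
+
+    A minimal construction sketch (illustrative only, not part of this patch; the exact
+    template arguments follow the factory code above):
+
+        // scheme 2: DOUBLE_DIAG_FLOAT_OFFDIAG, i.e. double diagonal, float off-diagonals
+        auto dilu = std::make_shared<GpuDILU<M, gpuistl::GpuVector<double>, gpuistl::GpuVector<double>>>(
+            op.getmat(), /*splitMatrix=*/true, /*tuneKernels=*/true, /*mixedPrecisionScheme=*/2);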
+*/ + +namespace Opm::gpuistl { + // Mixed precision schemes used for storing the matrix in GPU memory + enum class MatrixStorageMPScheme { + DOUBLE_DIAG_DOUBLE_OFFDIAG = 0, // full precision should be default + FLOAT_DIAG_FLOAT_OFFDIAG = 1, + DOUBLE_DIAG_FLOAT_OFFDIAG = 2 + }; + + namespace detail { + bool isValidMatrixStorageMPScheme(int scheme); + } + + inline MatrixStorageMPScheme makeMatrixStorageMPScheme(int scheme) { + if (!detail::isValidMatrixStorageMPScheme(scheme)) { + OPM_THROW(std::invalid_argument, + fmt::format("Invalid matrix storage mixed precision scheme chosen: {}.\n" + "Valid Schemes:\n" + "\t0: DOUBLE_DIAG_DOUBLE_OFFDIAG\n" + "\t1: FLOAT_DIAG_FLOAT_OFFDIAG\n" + "\t2: DOUBLE_DIAG_FLOAT_OFFDIAG", + scheme)); + } + return static_cast(scheme); + } + + namespace detail { + + __host__ __device__ constexpr bool storeDiagonalAsFloat(MatrixStorageMPScheme scheme) { + switch (scheme) { + case MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG: + return false; + case MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG: + return true; + case MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG: + return false; + default: + return false; + } + } + + __host__ __device__ constexpr bool storeOffDiagonalAsFloat(MatrixStorageMPScheme scheme) { + switch (scheme) { + case MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG: + return false; + case MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG: + return true; + case MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG: + return true; + default: + return false; + } + } + + // returns true if we use anything other than the default double precision for everything + __host__ __device__ constexpr bool usingMixedPrecision(MatrixStorageMPScheme scheme) { + switch (scheme) { + case MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG: + return false; + case MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG: + return true; + case MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG: + return true; + default: + return false; + } + } + + inline bool isValidMatrixStorageMPScheme(int scheme) { + switch (static_cast(scheme)) { + case MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG: + case MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG: + case MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG: + return true; + default: + return false; + } + } + } +} + +#endif // OPM_GPUISTL_KERNEL_ENUMS_HPP diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu index 050de2989..b8f9f48ca 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu @@ -59,14 +59,14 @@ namespace { } } - template + template __global__ void cuSolveLowerLevelSetSplit(MatrixScalar* mat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const MatrixScalar* dInv, + const DiagonalScalar* dInv, const LinearSolverScalar* d, LinearSolverScalar* v) { @@ -88,7 +88,7 @@ mmvMixedGeneral(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); } - mvMixedGeneral(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); + mvMixedGeneral(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); } } @@ -118,14 +118,14 @@ } } - template + template __global__ void cuSolveUpperLevelSetSplit(MatrixScalar* mat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - 
const MatrixScalar* dInv, + const DiagonalScalar* dInv, LinearSolverScalar* v) { const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); @@ -140,7 +140,7 @@ umvMixedGeneral(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); } - mmvMixedGeneral(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); + mmvMixedGeneral(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); } } @@ -213,7 +213,7 @@ // TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate results // TODO: The important part is to only cast after that is fully computed - template + template __global__ void cuComputeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, int* lowerRowIndices, int* lowerColIndices, @@ -255,7 +255,7 @@ const int symOppositeBlock = symOppositeIdx; - if constexpr (copyResultToOtherMatrix) { + if constexpr (detail::storeOffDiagonalAsFloat(mixedPrecisionScheme)) { // TODO: think long and hard about whether this performs only the wanted memory transfers moveBlock(&srcReorderedLowerMat[block * blocksize * blocksize], &dstLowerMat[block * blocksize * blocksize]); moveBlock(&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize], &dstUpperMat[symOppositeBlock * blocksize * blocksize]); } @@ -268,14 +268,9 @@ } invBlockInPlace(dInvTmp); - - // for (int i = 0; i < blocksize; ++i) { - // for (int j = 0; j < blocksize; ++j) { - // dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j]; - // } - // } moveBlock(dInvTmp, &dInv[reorderedRowIdx * blocksize * blocksize]); + + if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)) { moveBlock(dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important! 
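+            // (only FLOAT_DIAG_FLOAT_OFFDIAG takes this branch: storeDiagonalAsFloat() is false for
+            // DOUBLE_DIAG_FLOAT_OFFDIAG, whose inverted diagonal stays in double precision only)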
} } @@ -304,7 +299,7 @@ solveLowerLevelSet(T* reorderedMat, } -template +template void solveLowerLevelSetSplit(MatrixScalar* reorderedMat, int* rowIndices, @@ -312,15 +307,15 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat, int* indexConversion, int startIdx, int rowsInLevelSet, - const MatrixScalar* dInv, + const DiagonalScalar* dInv, const LinearSolverScalar* d, LinearSolverScalar* v, int thrBlockSize) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( - cuSolveLowerLevelSetSplit, thrBlockSize); + cuSolveLowerLevelSetSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuSolveLowerLevelSetSplit<<>>( + cuSolveLowerLevelSetSplit<<>>( reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v); } // perform the upper solve for all rows in the same level set @@ -343,7 +338,7 @@ solveUpperLevelSet(T* reorderedMat, reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); } -template +template void solveUpperLevelSetSplit(MatrixScalar* reorderedMat, int* rowIndices, @@ -351,14 +346,14 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat, int* indexConversion, int startIdx, int rowsInLevelSet, - const MatrixScalar* dInv, + const DiagonalScalar* dInv, LinearSolverScalar* v, int thrBlockSize) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( - cuSolveUpperLevelSetSplit, thrBlockSize); + cuSolveUpperLevelSetSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuSolveUpperLevelSetSplit<<>>( + cuSolveUpperLevelSetSplit<<>>( reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); } @@ -391,7 +386,7 @@ computeDiluDiagonal(T* reorderedMat, } } -template +template void computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, int* lowerRowIndices, @@ -412,9 +407,9 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, { if (blocksize <= 3) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( - cuComputeDiluDiagonalSplit, thrBlockSize); + cuComputeDiluDiagonalSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuComputeDiluDiagonalSplit<<>>(srcReorderedLowerMat, + cuComputeDiluDiagonalSplit<<>>(srcReorderedLowerMat, lowerRowIndices, lowerColIndices, srcReorderedUpperMat, @@ -437,13 +432,17 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, // TODO: format #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \ template void computeDiluDiagonal(T*, int*, int*, int*, int*, const int, int, T*, int); \ - template void computeDiluDiagonalSplit( \ + template void computeDiluDiagonalSplit( \ const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \ - template void computeDiluDiagonalSplit( \ + template void computeDiluDiagonalSplit( \ const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \ - template void computeDiluDiagonalSplit( \ + template void computeDiluDiagonalSplit( \ const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \ - template void computeDiluDiagonalSplit( \ + template void computeDiluDiagonalSplit( \ + const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, 
int, T*, double*, double*, double*, int); \ + template void computeDiluDiagonalSplit( \ + const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \ + template void computeDiluDiagonalSplit( \ const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \ template void solveUpperLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int); \ template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); @@ -461,29 +460,26 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 4); INSTANTIATE_KERNEL_WRAPPERS(double, 5); INSTANTIATE_KERNEL_WRAPPERS(double, 6); -#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar) \ - template void solveUpperLevelSetSplit( \ - MatrixScalar*, int*, int*, int*, int, int, const MatrixScalar*, LinearSolverScalar*, int); \ - template void solveLowerLevelSetSplit( \ - MatrixScalar*, int*, int*, int*, int, int, const MatrixScalar*, const LinearSolverScalar*, LinearSolverScalar*, int); +#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar) \ + template void solveUpperLevelSetSplit( \ + MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int); \ + template void solveLowerLevelSetSplit( \ + MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, const LinearSolverScalar*, LinearSolverScalar*, int); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, float, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, float, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, float, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, float, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, float, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, float, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, double); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, double); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, double); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, double); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, double); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, double); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, float); +// TODO: be smarter about this... 
Surely this instantiates many more combinations than are actually needed +#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \ + INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, float); \ + INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, float); \ + INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, float); \ + INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, double); \ + INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, double); \ + INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, double); + +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(1); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(2); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(3); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(4); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(5); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(6); } // namespace Opm::gpuistl::detail::DILU diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp index 44b94c2f1..b7141834f 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp @@ -22,6 +22,7 @@ #include #include #include +#include #include namespace Opm::gpuistl::detail::DILU { @@ -71,14 +72,14 @@ void solveLowerLevelSet(T* reorderedMat, * @param d Stores the defect * @param [out] v Will store the results of the lower solve */ -template +template void solveLowerLevelSetSplit(MatrixScalar* reorderedUpperMat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const MatrixScalar* dInv, + const DiagonalScalar* dInv, const LinearSolverScalar* d, LinearSolverScalar* v, int threadBlockSize); @@ -124,14 +125,14 @@ void solveUpperLevelSet(T* reorderedMat, * @param [out] v Will store the results of the lower solve. 
To begin with it should store the output from the lower * solve */ -template +template void solveUpperLevelSetSplit(MatrixScalar* reorderedUpperMat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const MatrixScalar* dInv, + const DiagonalScalar* dInv, LinearSolverScalar* v, int threadBlockSize); @@ -183,7 +184,7 @@ void computeDiluDiagonal(T* reorderedMat, * function * @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner */ -template +template void computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, int* lowerRowIndices, int* lowerColIndices, diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu index 5d304e366..46183efd2 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu @@ -120,7 +120,7 @@ namespace } } - template + template __global__ void cuLUFactorizationSplit(InputScalar* srcReorderedLowerMat, int* lowerRowIndices, int* lowerColIndices, @@ -161,7 +161,7 @@ namespace mmOverlap(&srcReorderedLowerMat[ij * scalarsInBlock], &srcDiagonal[j * scalarsInBlock], &srcReorderedLowerMat[ij * scalarsInBlock]); - if (copyResultToOtherMatrix) { + if constexpr (detail::storeOffDiagonalAsFloat(mixedPrecisionScheme)) { moveBlock(&srcReorderedLowerMat[ij * scalarsInBlock], &dstReorderedLowerMat[ij * scalarsInBlock]); } @@ -180,22 +180,15 @@ namespace while (!(ik == endOfRowIUpper && ikState == POSITION_TYPE::ABOVE_DIAG) && jk != endOfRowJ) { InputScalar* ikBlockPtr; - OutputScalar* dstIkBlockPtr; int ikColumn; if (ikState == POSITION_TYPE::UNDER_DIAG) { ikBlockPtr = &srcReorderedLowerMat[ik * scalarsInBlock]; - if (copyResultToOtherMatrix) - dstIkBlockPtr = &dstReorderedLowerMat[ik * scalarsInBlock]; ikColumn = lowerColIndices[ik]; } else if (ikState == POSITION_TYPE::ON_DIAG) { ikBlockPtr = &srcDiagonal[reorderedIdx * scalarsInBlock]; - if (copyResultToOtherMatrix) - dstIkBlockPtr = &dstDiagonal[reorderedIdx * scalarsInBlock]; ikColumn = naturalIdx; } else { // ikState == POSITION_TYPE::ABOVE_DIAG ikBlockPtr = &srcReorderedUpperMat[ik * scalarsInBlock]; - if (copyResultToOtherMatrix) - dstIkBlockPtr = &dstReorderedUpperMat[ik * scalarsInBlock]; ikColumn = upperColIndices[ik]; } @@ -219,10 +212,15 @@ namespace } } invBlockInPlace(&srcDiagonal[reorderedIdx * scalarsInBlock]); - if (copyResultToOtherMatrix) { - moveBlock(&srcDiagonal[reorderedIdx * scalarsInBlock], - &dstDiagonal[reorderedIdx * scalarsInBlock]); - + // as of now, if we are using mixed precision, then we are always storing the off-diagonals as floats, + // and sometimes also the diagonal. 
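+        // (FLOAT_DIAG_FLOAT_OFFDIAG stores the diagonal as float as well, while DOUBLE_DIAG_FLOAT_OFFDIAG
+        // keeps it in double, so its callers pass nullptr for dstDiagonal)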
+ if constexpr (detail::usingMixedPrecision(mixedPrecisionScheme)) { + // if we want to store the entire matrix as a float then we must also move the diagonal block from double to float + // if not then we just use the double diagonal that is already stored in srcDiagonal + if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)){ + moveBlock(&srcDiagonal[reorderedIdx * scalarsInBlock], + &dstDiagonal[reorderedIdx * scalarsInBlock]); + } // also move all values above the diagonal on this row for (int block = startOfRowIUpper; block < endOfRowIUpper; ++block) { moveBlock(&srcReorderedUpperMat[block * scalarsInBlock], @@ -321,14 +319,14 @@ } } - template + template __global__ void cuSolveUpperLevelSetSplit(MatrixScalar* mat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const MatrixScalar* dInv, + const DiagonalScalar* dInv, LinearSolverScalar* v) { auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); @@ -348,7 +346,7 @@ &mat[block * blocksize * blocksize], &v[col * blocksize], rhs); } - mvMixedGeneral( + mvMixedGeneral( &dInv[reorderedIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); } } @@ -410,7 +408,7 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat, reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v); } // perform the upper solve for all rows in the same level set -template +template void solveUpperLevelSetSplit(MatrixScalar* reorderedMat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const MatrixScalar* dInv, + const DiagonalScalar* dInv, LinearSolverScalar* v, int thrBlockSize) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( - cuSolveUpperLevelSetSplit, thrBlockSize); + cuSolveUpperLevelSetSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuSolveUpperLevelSetSplit<<>>( + cuSolveUpperLevelSetSplit<<>>( reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); } @@ -447,7 +445,7 @@ LUFactorization(T* srcMatrix, srcMatrix, srcRowIndices, srcColumnIndices, naturalToReordered, reorderedToNatual, rowsInLevelSet, startIdx); } -template +template void LUFactorizationSplit(InputScalar* srcReorderedLowerMat, int* lowerRowIndices, int* lowerColIndices, int thrBlockSize) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( - cuLUFactorizationSplit, thrBlockSize); + cuLUFactorizationSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuLUFactorizationSplit + cuLUFactorizationSplit <<>>(srcReorderedLowerMat, lowerRowIndices, lowerColIndices, @@ -489,27 +487,29 @@ template void solveUpperLevelSet(T*, int*, int*, int*, int, int, T*, int); \ template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int); \ template void LUFactorization(T*, int*, int*, int*, int*, size_t, int, int); \ - template void LUFactorizationSplit( \ + template void LUFactorizationSplit( \ T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \ - template void LUFactorizationSplit( \ + template void LUFactorizationSplit( \ T*, int*, int*, T*, int*, int*, T*, double*, 
double*, double*, int*, int*, const int, int, int); \ - template void LUFactorizationSplit( \ + template void LUFactorizationSplit( \ T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \ - template void LUFactorizationSplit( \ - T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); + template void LUFactorizationSplit( \ + T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \ + template void LUFactorizationSplit( \ + T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \ + template void LUFactorizationSplit( \ + T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); -INSTANTIATE_KERNEL_WRAPPERS(float, 1); -INSTANTIATE_KERNEL_WRAPPERS(float, 2); -INSTANTIATE_KERNEL_WRAPPERS(float, 3); -INSTANTIATE_KERNEL_WRAPPERS(float, 4); -INSTANTIATE_KERNEL_WRAPPERS(float, 5); -INSTANTIATE_KERNEL_WRAPPERS(float, 6); -INSTANTIATE_KERNEL_WRAPPERS(double, 1); -INSTANTIATE_KERNEL_WRAPPERS(double, 2); -INSTANTIATE_KERNEL_WRAPPERS(double, 3); -INSTANTIATE_KERNEL_WRAPPERS(double, 4); -INSTANTIATE_KERNEL_WRAPPERS(double, 5); -INSTANTIATE_KERNEL_WRAPPERS(double, 6); +#define INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(T) \ + INSTANTIATE_KERNEL_WRAPPERS(T, 1); \ + INSTANTIATE_KERNEL_WRAPPERS(T, 2); \ + INSTANTIATE_KERNEL_WRAPPERS(T, 3); \ + INSTANTIATE_KERNEL_WRAPPERS(T, 4); \ + INSTANTIATE_KERNEL_WRAPPERS(T, 5); \ + INSTANTIATE_KERNEL_WRAPPERS(T, 6); + +INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(float) +INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(double) #define INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(blocksize) \ /* double preconditioner */ \ @@ -523,15 +523,25 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 6); float*, int*, int*, int*, int, int, const float*, float*, int); \ \ /* double preconditioner */ \ - template void solveUpperLevelSetSplit( \ + template void solveUpperLevelSetSplit( \ double*, int*, int*, int*, int, int, const double*, double*, int); \ /* float matrix, double compute preconditioner */ \ - template void solveUpperLevelSetSplit( \ + template void solveUpperLevelSetSplit( \ + float*, int*, int*, int*, int, int, const double*, double*, int); \ + /* float preconditioner */ \ + template void solveUpperLevelSetSplit( \ + float*, int*, int*, int*, int, int, const double*, float*, int); \ + /* double preconditioner */ \ + template void solveUpperLevelSetSplit( \ + double*, int*, int*, int*, int, int, const float*, double*, int); \ + /* float matrix, double compute preconditioner */ \ + template void solveUpperLevelSetSplit( \ float*, int*, int*, int*, int, int, const float*, double*, int); \ /* float preconditioner */ \ - template void solveUpperLevelSetSplit( \ + template void solveUpperLevelSetSplit( \ float*, int*, int*, int*, int, int, const float*, float*, int); + INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(1); INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(2); INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(3); diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp index 03dbbe0c9..cdd78ec91 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp @@ -20,6 +20,7 @@ #define OPM_ILU0_KERNELS_HPP #include #include +#include namespace Opm::gpuistl::detail::ILU0 { @@ -89,14 +90,14 @@ void 
solveLowerLevelSet(T* reorderedMat, * solve * @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen */ -template +template void solveUpperLevelSetSplit(MatrixScalar* reorderedMat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const MatrixScalar* dInv, + const DiagonalScalar* dInv, LinearSolverScalar* v, int threadBlockSize); @@ -176,7 +177,7 @@ void LUFactorization(T* reorderedMat, * function * @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen */ -template +template void LUFactorizationSplit(InputScalar* srcReorderedLowerMat, int* lowerRowIndices, int* lowerColIndices,