diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake
index e1a7a817c..8cfb50530 100644
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@@ -205,16 +205,21 @@ if (HAVE_CUDA)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/cusparse_matrix_operations.cu)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/CuSparseHandle.cpp)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuBuffer.cpp)
+  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/DILUKernels.cu)
+  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/ILU0Kernels.cu)
+  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/JacKernels.cu)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuVector.cpp)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuView.cpp)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/vector_operations.cu)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuSparseMatrix.cpp)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuDILU.cpp)
+  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg OpmCuILU0.cpp)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuJac.cpp)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuSeqILU0.cpp)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg set_device.cpp)
   # HEADERS
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/coloringAndReorderingUtils.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cuda_safe_call.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cusparse_matrix_operations.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cusparse_safe_call.hpp)
@@ -223,7 +228,11 @@ if (HAVE_CUDA)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/CuBlasHandle.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/CuSparseHandle.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuBuffer.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/preconditionerKernels/DILUKernels.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/preconditionerKernels/ILU0Kernels.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/preconditionerKernels/JacKernels.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuDILU.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg OpmCuILU0.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuJac.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuVector.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuView.hpp)
@@ -238,6 +247,8 @@ if (HAVE_CUDA)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/vector_operations.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/has_function.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/preconditioner_should_call_post_pre.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/deviceBlockOperations.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/gpuThreadUtils.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg PreconditionerAdapter.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg
CuSeqILU0.hpp) ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/fix_zero_diagonal.hpp) diff --git a/opm/simulators/linalg/PreconditionerFactoryGPUIncludeWrapper.hpp b/opm/simulators/linalg/PreconditionerFactoryGPUIncludeWrapper.hpp index 79c650766..82aa313a8 100644 --- a/opm/simulators/linalg/PreconditionerFactoryGPUIncludeWrapper.hpp +++ b/opm/simulators/linalg/PreconditionerFactoryGPUIncludeWrapper.hpp @@ -24,6 +24,7 @@ #if USE_HIP #include #include +#include #include #include #include @@ -32,6 +33,7 @@ #else #include #include +#include #include #include #include diff --git a/opm/simulators/linalg/PreconditionerFactory_impl.hpp b/opm/simulators/linalg/PreconditionerFactory_impl.hpp index c3a28e386..50d6713c8 100644 --- a/opm/simulators/linalg/PreconditionerFactory_impl.hpp +++ b/opm/simulators/linalg/PreconditionerFactory_impl.hpp @@ -349,7 +349,6 @@ struct StandardPreconditioners { F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t, const C& comm) { const bool split_matrix = prm.get("split_matrix", true); const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); - // const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); using field_type = typename V::field_type; using CuDILU = typename cuistl::CuDILU, cuistl::CuVector>; auto cuDILU = std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels); @@ -358,6 +357,18 @@ struct StandardPreconditioners { auto wrapped = std::make_shared>(adapted, comm); return wrapped; }); + + F::addCreator("OPMCUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t, const C& comm) { + const bool split_matrix = prm.get("split_matrix", true); + const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); + using field_type = typename V::field_type; + using OpmCuILU0 = typename cuistl::OpmCuILU0, cuistl::CuVector>; + auto cuilu0 = std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels); + + auto adapted = std::make_shared>(cuilu0); + auto wrapped = std::make_shared>(adapted, comm); + return wrapped; + }); #endif } @@ -605,6 +616,15 @@ struct StandardPreconditioners { std::make_shared(op.getmat(), w)); }); + F::addCreator("OPMCUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t) { + const bool split_matrix = prm.get("split_matrix", true); + const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); + using field_type = typename V::field_type; + using CUILU0 = typename cuistl::OpmCuILU0, cuistl::CuVector>; + + return std::make_shared>(std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels)); + }); + F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t) { const bool split_matrix = prm.get("split_matrix", true); const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); diff --git a/opm/simulators/linalg/cuistl/CuDILU.cpp b/opm/simulators/linalg/cuistl/CuDILU.cpp index 9c6664afd..4626386bb 100644 --- a/opm/simulators/linalg/cuistl/CuDILU.cpp +++ b/opm/simulators/linalg/cuistl/CuDILU.cpp @@ -16,110 +16,23 @@ You should have received a copy of the GNU General Public License along with OPM. If not, see . 
*/ -#include -#include +#include +#include #include #include #include +#include #include +#include +#include #include #include #include -#include +#include #include -#include +#include #include -#include -#include -#include -#include #include - - -namespace -{ -std::vector -createReorderedToNatural(Opm::SparseTable& levelSets) -{ - auto res = std::vector(Opm::cuistl::detail::to_size_t(levelSets.dataSize())); - int globCnt = 0; - for (auto row : levelSets) { - for (auto col : row) { - OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(), - fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size())); - res[globCnt++] = static_cast(col); - } - } - return res; -} - -std::vector -createNaturalToReordered(Opm::SparseTable& levelSets) -{ - auto res = std::vector(Opm::cuistl::detail::to_size_t(levelSets.dataSize())); - int globCnt = 0; - for (auto row : levelSets) { - for (auto col : row) { - OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(), - fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size())); - res[col] = globCnt++; - } - } - return res; -} - -template -void -createReorderedMatrix(const M& naturalMatrix, - std::vector& reorderedToNatural, - std::unique_ptr& reorderedGpuMat) -{ - M reorderedMatrix(naturalMatrix.N(), naturalMatrix.N(), naturalMatrix.nonzeroes(), M::row_wise); - for (auto dstRowIt = reorderedMatrix.createbegin(); dstRowIt != reorderedMatrix.createend(); ++dstRowIt) { - auto srcRow = naturalMatrix.begin() + reorderedToNatural[dstRowIt.index()]; - // For elements in A - for (auto elem = srcRow->begin(); elem != srcRow->end(); elem++) { - dstRowIt.insert(elem.index()); - } - } - - reorderedGpuMat.reset(new auto (GPUM::fromMatrix(reorderedMatrix, true))); -} - -template -void -extractLowerAndUpperMatrices(const M& naturalMatrix, - std::vector& reorderedToNatural, - std::unique_ptr& lower, - std::unique_ptr& upper) -{ - const size_t new_nnz = (naturalMatrix.nonzeroes() - naturalMatrix.N()) / 2; - - M reorderedLower(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise); - M reorderedUpper(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise); - - for (auto lowerIt = reorderedLower.createbegin(), upperIt = reorderedUpper.createbegin(); - lowerIt != reorderedLower.createend(); - ++lowerIt, ++upperIt) { - - auto srcRow = naturalMatrix.begin() + reorderedToNatural[lowerIt.index()]; - - for (auto elem = srcRow->begin(); elem != srcRow->end(); ++elem) { - if (elem.index() < srcRow.index()) { // add index to lower matrix if under the diagonal - lowerIt.insert(elem.index()); - } else if (elem.index() > srcRow.index()) { // add element to upper matrix if above the diagonal - upperIt.insert(elem.index()); - } - } - } - - lower.reset(new auto (GPUM::fromMatrix(reorderedLower, true))); - upper.reset(new auto (GPUM::fromMatrix(reorderedUpper, true))); - return; -} - -} // NAMESPACE - namespace Opm::cuistl { @@ -127,8 +40,8 @@ template CuDILU::CuDILU(const M& A, bool splitMatrix, bool tuneKernels) : m_cpuMatrix(A) , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER)) - , m_reorderedToNatural(createReorderedToNatural(m_levelSets)) - , m_naturalToReordered(createNaturalToReordered(m_levelSets)) + , m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets)) + , m_naturalToReordered(detail::createNaturalToReordered(m_levelSets)) , m_gpuMatrix(CuSparseMatrix::fromMatrix(m_cpuMatrix, true)) , m_gpuNaturalToReorder(m_naturalToReordered) , 
m_gpuReorderToNatural(m_reorderedToNatural) @@ -154,19 +67,19 @@ CuDILU::CuDILU(const M& A, bool splitMatrix, bool tuneKernels) m_gpuMatrix.nonzeroes(), A.nonzeroes())); if (m_splitMatrix) { - m_gpuMatrixReorderedDiag.reset(new auto(CuVector(blocksize_ * blocksize_ * m_cpuMatrix.N()))); - extractLowerAndUpperMatrices>( - m_cpuMatrix, m_reorderedToNatural, m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper); - } else { - createReorderedMatrix>( - m_cpuMatrix, m_reorderedToNatural, m_gpuMatrixReordered); + m_gpuMatrixReorderedDiag = std::make_unique>(blocksize_ * blocksize_ * m_cpuMatrix.N()); + std::tie(m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper) + = detail::extractLowerAndUpperMatrices>(m_cpuMatrix, + m_reorderedToNatural); + m_gpuMatrixReordered = detail::createReorderedMatrix>( + m_cpuMatrix, m_reorderedToNatural); } computeDiagAndMoveReorderedData(); // HIP does currently not support automtically picking thread block sizes as well as CUDA // So only when tuning and using hip should we do our own manual tuning #ifdef USE_HIP - if (m_tuneThreadBlockSizes){ + if (m_tuneThreadBlockSizes) { tuneThreadBlockSizes(); } #endif @@ -188,7 +101,7 @@ CuDILU::apply(X& v, const Y& d) for (int level = 0; level < m_levelSets.size(); ++level) { const int numOfRowsInLevel = m_levelSets[level].size(); if (m_splitMatrix) { - detail::computeLowerSolveLevelSetSplit( + detail::DILU::solveLowerLevelSetSplit( m_gpuMatrixReorderedLower->getNonZeroValues().data(), m_gpuMatrixReorderedLower->getRowIndices().data(), m_gpuMatrixReorderedLower->getColumnIndices().data(), @@ -200,7 +113,7 @@ CuDILU::apply(X& v, const Y& d) v.data(), m_applyThreadBlockSize); } else { - detail::computeLowerSolveLevelSet( + detail::DILU::solveLowerLevelSet( m_gpuMatrixReordered->getNonZeroValues().data(), m_gpuMatrixReordered->getRowIndices().data(), m_gpuMatrixReordered->getColumnIndices().data(), @@ -221,7 +134,7 @@ CuDILU::apply(X& v, const Y& d) const int numOfRowsInLevel = m_levelSets[level].size(); levelStartIdx -= numOfRowsInLevel; if (m_splitMatrix) { - detail::computeUpperSolveLevelSetSplit( + detail::DILU::solveUpperLevelSetSplit( m_gpuMatrixReorderedUpper->getNonZeroValues().data(), m_gpuMatrixReorderedUpper->getRowIndices().data(), m_gpuMatrixReorderedUpper->getColumnIndices().data(), @@ -232,7 +145,7 @@ CuDILU::apply(X& v, const Y& d) v.data(), m_applyThreadBlockSize); } else { - detail::computeUpperSolveLevelSet( + detail::DILU::solveUpperLevelSet( m_gpuMatrixReordered->getNonZeroValues().data(), m_gpuMatrixReordered->getRowIndices().data(), m_gpuMatrixReordered->getColumnIndices().data(), @@ -304,7 +217,7 @@ CuDILU::computeDiagAndMoveReorderedData() for (int level = 0; level < m_levelSets.size(); ++level) { const int numOfRowsInLevel = m_levelSets[level].size(); if (m_splitMatrix) { - detail::computeDiluDiagonalSplit( + detail::DILU::computeDiluDiagonalSplit( m_gpuMatrixReorderedLower->getNonZeroValues().data(), m_gpuMatrixReorderedLower->getRowIndices().data(), m_gpuMatrixReorderedLower->getColumnIndices().data(), @@ -319,15 +232,16 @@ CuDILU::computeDiagAndMoveReorderedData() m_gpuDInv.data(), m_updateThreadBlockSize); } else { - detail::computeDiluDiagonal(m_gpuMatrixReordered->getNonZeroValues().data(), - m_gpuMatrixReordered->getRowIndices().data(), - m_gpuMatrixReordered->getColumnIndices().data(), - m_gpuReorderToNatural.data(), - m_gpuNaturalToReorder.data(), - levelStartIdx, - numOfRowsInLevel, - m_gpuDInv.data(), - m_updateThreadBlockSize); + detail::DILU::computeDiluDiagonal( + 
m_gpuMatrixReordered->getNonZeroValues().data(), + m_gpuMatrixReordered->getRowIndices().data(), + m_gpuMatrixReordered->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + m_gpuNaturalToReorder.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInv.data(), + m_updateThreadBlockSize); } levelStartIdx += numOfRowsInLevel; } @@ -345,23 +259,24 @@ CuDILU::tuneThreadBlockSizes() int bestUpdateBlockSize = -1; int interval = 64; - //temporary buffers for the apply + // temporary buffers for the apply CuVector tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize()); CuVector tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize()); tmpD = 1; - for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval){ + for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval) { // sometimes the first kernel launch kan be slower, so take the time twice - for (int i = 0; i < 2; ++i){ + for (int i = 0; i < 2; ++i) { auto beforeUpdate = std::chrono::high_resolution_clock::now(); m_updateThreadBlockSize = thrBlockSize; update(); std::ignore = cudaDeviceSynchronize(); auto afterUpdate = std::chrono::high_resolution_clock::now(); - if (cudaSuccess == cudaGetLastError()){ // kernel launch was valid - long long durationInMicroSec = std::chrono::duration_cast(afterUpdate - beforeUpdate).count(); - if (durationInMicroSec < bestUpdateTime){ + if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid + long long durationInMicroSec + = std::chrono::duration_cast(afterUpdate - beforeUpdate).count(); + if (durationInMicroSec < bestUpdateTime) { bestUpdateTime = durationInMicroSec; bestUpdateBlockSize = thrBlockSize; } @@ -372,9 +287,10 @@ CuDILU::tuneThreadBlockSizes() apply(tmpV, tmpD); std::ignore = cudaDeviceSynchronize(); auto afterApply = std::chrono::high_resolution_clock::now(); - if (cudaSuccess == cudaGetLastError()){ // kernel launch was valid - long long durationInMicroSec = std::chrono::duration_cast(afterApply - beforeApply).count(); - if (durationInMicroSec < bestApplyTime){ + if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid + long long durationInMicroSec + = std::chrono::duration_cast(afterApply - beforeApply).count(); + if (durationInMicroSec < bestApplyTime) { bestApplyTime = durationInMicroSec; bestApplyBlockSize = thrBlockSize; } diff --git a/opm/simulators/linalg/cuistl/CuDILU.hpp b/opm/simulators/linalg/cuistl/CuDILU.hpp index a449293b8..80968b371 100644 --- a/opm/simulators/linalg/cuistl/CuDILU.hpp +++ b/opm/simulators/linalg/cuistl/CuDILU.hpp @@ -19,17 +19,11 @@ #ifndef OPM_CUDILU_HPP #define OPM_CUDILU_HPP -#include +#include #include -#include #include #include -#include -#include -#include -#include #include -#include diff --git a/opm/simulators/linalg/cuistl/CuJac.cpp b/opm/simulators/linalg/cuistl/CuJac.cpp index 1a67640a0..be07bc58b 100644 --- a/opm/simulators/linalg/cuistl/CuJac.cpp +++ b/opm/simulators/linalg/cuistl/CuJac.cpp @@ -16,27 +16,13 @@ You should have received a copy of the GNU General Public License along with OPM. If not, see . 
*/ -#include -#include -#include #include -#include #include -#include #include #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include #include #include @@ -67,11 +53,7 @@ CuJac::CuJac(const M& A, field_type w) A.nonzeroes())); // Compute the inverted diagonal of A and store it in a vector format in m_diagInvFlattened - detail::invertDiagonalAndFlatten(m_gpuMatrix.getNonZeroValues().data(), - m_gpuMatrix.getRowIndices().data(), - m_gpuMatrix.getColumnIndices().data(), - m_gpuMatrix.N(), - m_diagInvFlattened.data()); + invertDiagonalAndFlatten(); } template @@ -111,11 +93,19 @@ void CuJac::update() { m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); - detail::invertDiagonalAndFlatten(m_gpuMatrix.getNonZeroValues().data(), - m_gpuMatrix.getRowIndices().data(), - m_gpuMatrix.getColumnIndices().data(), - m_gpuMatrix.N(), - m_diagInvFlattened.data()); + invertDiagonalAndFlatten(); +} + +template +void +CuJac::invertDiagonalAndFlatten() +{ + detail::JAC::invertDiagonalAndFlatten( + m_gpuMatrix.getNonZeroValues().data(), + m_gpuMatrix.getRowIndices().data(), + m_gpuMatrix.getColumnIndices().data(), + m_gpuMatrix.N(), + m_diagInvFlattened.data()); } } // namespace Opm::cuistl diff --git a/opm/simulators/linalg/cuistl/CuJac.hpp b/opm/simulators/linalg/cuistl/CuJac.hpp index 31990dc11..36015e467 100644 --- a/opm/simulators/linalg/cuistl/CuJac.hpp +++ b/opm/simulators/linalg/cuistl/CuJac.hpp @@ -103,6 +103,8 @@ private: CuSparseMatrix m_gpuMatrix; //! \brief the diagonal of cuMatrix inverted, and then flattened to fit in a vector CuVector m_diagInvFlattened; + + void invertDiagonalAndFlatten(); }; } // end namespace Opm::cuistl diff --git a/opm/simulators/linalg/cuistl/OpmCuILU0.cpp b/opm/simulators/linalg/cuistl/OpmCuILU0.cpp new file mode 100644 index 000000000..c7fcbc708 --- /dev/null +++ b/opm/simulators/linalg/cuistl/OpmCuILU0.cpp @@ -0,0 +1,318 @@ +/* + Copyright 2024 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . 
+*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +namespace Opm::cuistl +{ + +template +OpmCuILU0::OpmCuILU0(const M& A, bool splitMatrix, bool tuneKernels) + : m_cpuMatrix(A) + , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER)) + , m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets)) + , m_naturalToReordered(detail::createNaturalToReordered(m_levelSets)) + , m_gpuMatrix(CuSparseMatrix::fromMatrix(m_cpuMatrix, true)) + , m_gpuMatrixReorderedLower(nullptr) + , m_gpuMatrixReorderedUpper(nullptr) + , m_gpuNaturalToReorder(m_naturalToReordered) + , m_gpuReorderToNatural(m_reorderedToNatural) + , m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()) + , m_splitMatrix(splitMatrix) + , m_tuneThreadBlockSizes(tuneKernels) +{ + // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode? + // Some sanity check + OPM_ERROR_IF(A.N() != m_gpuMatrix.N(), + fmt::format("CuSparse matrix not same size as DUNE matrix. {} vs {}.", m_gpuMatrix.N(), A.N())); + OPM_ERROR_IF(A[0][0].N() != m_gpuMatrix.blockSize(), + fmt::format("CuSparse matrix not same blocksize as DUNE matrix. {} vs {}.", + m_gpuMatrix.blockSize(), + A[0][0].N())); + OPM_ERROR_IF(A.N() * A[0][0].N() != m_gpuMatrix.dim(), + fmt::format("CuSparse matrix not same dimension as DUNE matrix. {} vs {}.", + m_gpuMatrix.dim(), + A.N() * A[0][0].N())); + OPM_ERROR_IF(A.nonzeroes() != m_gpuMatrix.nonzeroes(), + fmt::format("CuSparse matrix not same number of non zeroes as DUNE matrix. {} vs {}. ", + m_gpuMatrix.nonzeroes(), + A.nonzeroes())); + if (m_splitMatrix) { + m_gpuMatrixReorderedDiag.emplace(CuVector(blocksize_ * blocksize_ * m_cpuMatrix.N())); + std::tie(m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper) + = detail::extractLowerAndUpperMatrices>(m_cpuMatrix, + m_reorderedToNatural); + } else { + m_gpuReorderedLU = detail::createReorderedMatrix>( + m_cpuMatrix, m_reorderedToNatural); + } + LUFactorizeAndMoveData(); + +#ifdef USE_HIP + if (m_tuneThreadBlockSizes) { + tuneThreadBlockSizes(); + } +#endif +} + +template +void +OpmCuILU0::pre([[maybe_unused]] X& x, [[maybe_unused]] Y& b) +{ +} + +template +void +OpmCuILU0::apply(X& v, const Y& d) +{ + OPM_TIMEBLOCK(prec_apply); + { + int levelStartIdx = 0; + for (int level = 0; level < m_levelSets.size(); ++level) { + const int numOfRowsInLevel = m_levelSets[level].size(); + if (m_splitMatrix) { + detail::ILU0::solveLowerLevelSetSplit( + m_gpuMatrixReorderedLower->getNonZeroValues().data(), + m_gpuMatrixReorderedLower->getRowIndices().data(), + m_gpuMatrixReorderedLower->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + d.data(), + v.data(), + m_applyThreadBlockSize); + } else { + detail::ILU0::solveLowerLevelSet(m_gpuReorderedLU->getNonZeroValues().data(), + m_gpuReorderedLU->getRowIndices().data(), + m_gpuReorderedLU->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + d.data(), + v.data(), + m_applyThreadBlockSize); + } + levelStartIdx += numOfRowsInLevel; + } + + levelStartIdx = m_cpuMatrix.N(); + for (int level = m_levelSets.size() - 1; level >= 0; --level) { + const int numOfRowsInLevel = m_levelSets[level].size(); + levelStartIdx -= numOfRowsInLevel; + if (m_splitMatrix) { + detail::ILU0::solveUpperLevelSetSplit( + m_gpuMatrixReorderedUpper->getNonZeroValues().data(), + 
m_gpuMatrixReorderedUpper->getRowIndices().data(), + m_gpuMatrixReorderedUpper->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuMatrixReorderedDiag.value().data(), + v.data(), + m_applyThreadBlockSize); + } else { + detail::ILU0::solveUpperLevelSet(m_gpuReorderedLU->getNonZeroValues().data(), + m_gpuReorderedLU->getRowIndices().data(), + m_gpuReorderedLU->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + v.data(), + m_applyThreadBlockSize); + } + } + } +} + +template +void +OpmCuILU0::post([[maybe_unused]] X& x) +{ +} + +template +Dune::SolverCategory::Category +OpmCuILU0::category() const +{ + return Dune::SolverCategory::sequential; +} + +template +void +OpmCuILU0::update() +{ + OPM_TIMEBLOCK(prec_update); + { + m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu + LUFactorizeAndMoveData(); + } +} + +template +void +OpmCuILU0::LUFactorizeAndMoveData() +{ + OPM_TIMEBLOCK(prec_update); + { + if (m_splitMatrix) { + detail::copyMatDataToReorderedSplit( + m_gpuMatrix.getNonZeroValues().data(), + m_gpuMatrix.getRowIndices().data(), + m_gpuMatrix.getColumnIndices().data(), + m_gpuMatrixReorderedLower->getNonZeroValues().data(), + m_gpuMatrixReorderedLower->getRowIndices().data(), + m_gpuMatrixReorderedUpper->getNonZeroValues().data(), + m_gpuMatrixReorderedUpper->getRowIndices().data(), + m_gpuMatrixReorderedDiag.value().data(), + m_gpuNaturalToReorder.data(), + m_gpuMatrixReorderedLower->N(), + m_updateThreadBlockSize); + } else { + detail::copyMatDataToReordered(m_gpuMatrix.getNonZeroValues().data(), + m_gpuMatrix.getRowIndices().data(), + m_gpuReorderedLU->getNonZeroValues().data(), + m_gpuReorderedLU->getRowIndices().data(), + m_gpuNaturalToReorder.data(), + m_gpuReorderedLU->N(), + m_updateThreadBlockSize); + } + int levelStartIdx = 0; + for (int level = 0; level < m_levelSets.size(); ++level) { + const int numOfRowsInLevel = m_levelSets[level].size(); + + if (m_splitMatrix) { + detail::ILU0::LUFactorizationSplit( + m_gpuMatrixReorderedLower->getNonZeroValues().data(), + m_gpuMatrixReorderedLower->getRowIndices().data(), + m_gpuMatrixReorderedLower->getColumnIndices().data(), + m_gpuMatrixReorderedUpper->getNonZeroValues().data(), + m_gpuMatrixReorderedUpper->getRowIndices().data(), + m_gpuMatrixReorderedUpper->getColumnIndices().data(), + m_gpuMatrixReorderedDiag.value().data(), + m_gpuReorderToNatural.data(), + m_gpuNaturalToReorder.data(), + levelStartIdx, + numOfRowsInLevel, + m_updateThreadBlockSize); + + } else { + detail::ILU0::LUFactorization(m_gpuReorderedLU->getNonZeroValues().data(), + m_gpuReorderedLU->getRowIndices().data(), + m_gpuReorderedLU->getColumnIndices().data(), + m_gpuNaturalToReorder.data(), + m_gpuReorderToNatural.data(), + numOfRowsInLevel, + levelStartIdx, + m_updateThreadBlockSize); + } + levelStartIdx += numOfRowsInLevel; + } + } +} + +template +void +OpmCuILU0::tuneThreadBlockSizes() +{ + // TODO generalize this tuning process in a function separate of the class + long long bestApplyTime = std::numeric_limits::max(); + long long bestUpdateTime = std::numeric_limits::max(); + int bestApplyBlockSize = -1; + int bestUpdateBlockSize = -1; + int interval = 64; + + // temporary buffers for the apply + CuVector tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize()); + CuVector tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize()); + tmpD = 1; + + for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval) { + // sometimes 
the first kernel launch kan be slower, so take the time twice + for (int i = 0; i < 2; ++i) { + + auto beforeUpdate = std::chrono::high_resolution_clock::now(); + m_updateThreadBlockSize = thrBlockSize; + update(); + std::ignore = cudaDeviceSynchronize(); + auto afterUpdate = std::chrono::high_resolution_clock::now(); + if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid + long long durationInMicroSec + = std::chrono::duration_cast(afterUpdate - beforeUpdate).count(); + if (durationInMicroSec < bestUpdateTime) { + bestUpdateTime = durationInMicroSec; + bestUpdateBlockSize = thrBlockSize; + } + } + + auto beforeApply = std::chrono::high_resolution_clock::now(); + m_applyThreadBlockSize = thrBlockSize; + apply(tmpV, tmpD); + std::ignore = cudaDeviceSynchronize(); + auto afterApply = std::chrono::high_resolution_clock::now(); + if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid + long long durationInMicroSec + = std::chrono::duration_cast(afterApply - beforeApply).count(); + if (durationInMicroSec < bestApplyTime) { + bestApplyTime = durationInMicroSec; + bestApplyBlockSize = thrBlockSize; + } + } + } + } + + m_applyThreadBlockSize = bestApplyBlockSize; + m_updateThreadBlockSize = bestUpdateBlockSize; +} + +} // namespace Opm::cuistl +#define INSTANTIATE_CUDILU_DUNE(realtype, blockdim) \ + template class ::Opm::cuistl::OpmCuILU0>, \ + ::Opm::cuistl::CuVector, \ + ::Opm::cuistl::CuVector>; \ + template class ::Opm::cuistl::OpmCuILU0>, \ + ::Opm::cuistl::CuVector, \ + ::Opm::cuistl::CuVector> + +INSTANTIATE_CUDILU_DUNE(double, 1); +INSTANTIATE_CUDILU_DUNE(double, 2); +INSTANTIATE_CUDILU_DUNE(double, 3); +INSTANTIATE_CUDILU_DUNE(double, 4); +INSTANTIATE_CUDILU_DUNE(double, 5); +INSTANTIATE_CUDILU_DUNE(double, 6); + +INSTANTIATE_CUDILU_DUNE(float, 1); +INSTANTIATE_CUDILU_DUNE(float, 2); +INSTANTIATE_CUDILU_DUNE(float, 3); +INSTANTIATE_CUDILU_DUNE(float, 4); +INSTANTIATE_CUDILU_DUNE(float, 5); +INSTANTIATE_CUDILU_DUNE(float, 6); diff --git a/opm/simulators/linalg/cuistl/OpmCuILU0.hpp b/opm/simulators/linalg/cuistl/OpmCuILU0.hpp new file mode 100644 index 000000000..d4877178e --- /dev/null +++ b/opm/simulators/linalg/cuistl/OpmCuILU0.hpp @@ -0,0 +1,139 @@ +/* + Copyright 2022-2023 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ +#ifndef OPM_CUILU0_OPM_Impl_HPP +#define OPM_CUILU0_OPM_Impl_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace Opm::cuistl +{ +//! \brief ILU0 preconditioner on the GPU. +//! +//! \tparam M The matrix type to operate on +//! \tparam X Type of the update +//! \tparam Y Type of the defect +//! \tparam l Ignored. Just there to have the same number of template arguments +//! as other preconditioners. +//! +//! \note We assume X and Y are both CuVector, but we leave them as template +//! arguments in case of future additions. 
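A minimal usage sketch for the class declared just below may help orient the reader. This is illustrative only and not part of the patch; the template arguments are assumed to follow the INSTANTIATE_CUDILU_DUNE macro at the end of OpmCuILU0.cpp (a Dune::BCRSMatrix of FieldMatrix blocks and CuVector for both the update and the defect), since the angle-bracket contents are elided in this copy of the diff, and the include paths are assumptions based on the file locations shown elsewhere in the patch.

// Illustrative sketch, not part of the diff.
#include <dune/common/fmatrix.hh>
#include <dune/istl/bcrsmatrix.hh>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/OpmCuILU0.hpp>

using Matrix = Dune::BCRSMatrix<Dune::FieldMatrix<double, 3, 3>>;
using GpuVector = Opm::cuistl::CuVector<double>;

void preconditionOnce(const Matrix& A, GpuVector& v, const GpuVector& d)
{
    // Split storage of the reordered L, U and diagonal, with GPU kernel tuning enabled.
    Opm::cuistl::OpmCuILU0<Matrix, GpuVector, GpuVector> ilu0(A, /*splitMatrix=*/true, /*tuneKernels=*/true);
    ilu0.apply(v, d); // v := (LU)^{-1} d via the level-set triangular solves
    // After the nonzero values of A change, ilu0.update() refactorizes on the GPU.
}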
+template +class OpmCuILU0 : public Dune::PreconditionerWithUpdate +{ +public: + //! \brief The matrix type the preconditioner is for. + using matrix_type = typename std::remove_const::type; + //! \brief The domain type of the preconditioner. + using domain_type = X; + //! \brief The range type of the preconditioner. + using range_type = Y; + //! \brief The field type of the preconditioner. + using field_type = typename X::field_type; + //! \brief The GPU matrix type + using CuMat = CuSparseMatrix; + + //! \brief Constructor. + //! + //! Constructor gets all parameters to operate the prec. + //! \param A The matrix to operate on. + //! \param w The relaxation factor. + //! + explicit OpmCuILU0(const M& A, bool splitMatrix, bool tuneKernels); + + //! \brief Prepare the preconditioner. + //! \note Does nothing at the time being. + void pre(X& x, Y& b) override; + + //! \brief Apply the preconditoner. + void apply(X& v, const Y& d) override; + + //! \brief Post processing + //! \note Does nothing at the moment + void post(X& x) override; + + //! Category of the preconditioner (see SolverCategory::Category) + Dune::SolverCategory::Category category() const override; + + //! \brief Updates the matrix data. + void update() final; + + //! \brief Compute LU factorization, and update the data of the reordered matrix + void LUFactorizeAndMoveData(); + + //! \brief function that will experimentally tune the thread block sizes of the important cuda kernels + void tuneThreadBlockSizes(); + + //! \returns false + static constexpr bool shouldCallPre() + { + return false; + } + + //! \returns false + static constexpr bool shouldCallPost() + { + return false; + } + + +private: + //! \brief Reference to the underlying matrix + const M& m_cpuMatrix; + //! \brief size_t describing the dimensions of the square block elements + static constexpr const size_t blocksize_ = matrix_type::block_type::cols; + //! \brief SparseTable storing each row by level + Opm::SparseTable m_levelSets; + //! \brief converts from index in reordered structure to index natural ordered structure + std::vector m_reorderedToNatural; + //! \brief converts from index in natural ordered structure to index reordered strucutre + std::vector m_naturalToReordered; + //! \brief The A matrix stored on the gpu, and its reordred version + CuMat m_gpuMatrix; + std::unique_ptr m_gpuReorderedLU; + //! \brief If matrix splitting is enabled, then we store the lower and upper part separately + std::unique_ptr m_gpuMatrixReorderedLower; + std::unique_ptr m_gpuMatrixReorderedUpper; + //! \brief If matrix splitting is enabled, we also store the diagonal separately + std::optional> m_gpuMatrixReorderedDiag; + //! row conversion from natural to reordered matrix indices stored on the GPU + CuVector m_gpuNaturalToReorder; + //! row conversion from reordered to natural matrix indices stored on the GPU + CuVector m_gpuReorderToNatural; + //! \brief Stores the inverted diagonal that we use in ILU0 + CuVector m_gpuDInv; + //! \brief Bool storing whether or not we should store matrices in a split format + bool m_splitMatrix; + //! \brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards + bool m_tuneThreadBlockSizes; + //! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards + //! 
The default value of -1 indicates that we have not calibrated and selected a value yet + int m_applyThreadBlockSize = -1; + int m_updateThreadBlockSize = -1; +}; +} // end namespace Opm::cuistl + +#endif diff --git a/opm/simulators/linalg/cuistl/detail/coloringAndReorderingUtils.hpp b/opm/simulators/linalg/cuistl/detail/coloringAndReorderingUtils.hpp new file mode 100644 index 000000000..a45c8824e --- /dev/null +++ b/opm/simulators/linalg/cuistl/detail/coloringAndReorderingUtils.hpp @@ -0,0 +1,110 @@ +/* + Copyright 2024 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ +#ifndef OPM_COLORING_AND_REORDERING_UTILS_HPP +#define OPM_COLORING_AND_REORDERING_UTILS_HPP + +#include +#include +#include +#include +#include +#include +#include +/* +This file contains a collection of utility functions used in the GPU implementation of ILU and DILU +The functions deal with creating the mappings between reordered and natural indices, as well as +extracting sparsity structures from dune matrices and creating cusparsematrix indices +*/ +namespace Opm::cuistl::detail +{ +inline std::vector +createReorderedToNatural(const Opm::SparseTable& levelSets) +{ + auto res = std::vector(Opm::cuistl::detail::to_size_t(levelSets.dataSize())); + int globCnt = 0; + for (auto row : levelSets) { + for (auto col : row) { + OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(), + fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size())); + res[globCnt++] = static_cast(col); + } + } + return res; +} + +inline std::vector +createNaturalToReordered(const Opm::SparseTable& levelSets) +{ + auto res = std::vector(Opm::cuistl::detail::to_size_t(levelSets.dataSize())); + int globCnt = 0; + for (auto row : levelSets) { + for (auto col : row) { + OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(), + fmt::format("Internal error. 
globCnt = {}, res.size() = {}", globCnt, res.size())); + res[col] = globCnt++; + } + } + return res; +} + +template +inline std::unique_ptr +createReorderedMatrix(const M& naturalMatrix, const std::vector& reorderedToNatural) +{ + M reorderedMatrix(naturalMatrix.N(), naturalMatrix.N(), naturalMatrix.nonzeroes(), M::row_wise); + for (auto dstRowIt = reorderedMatrix.createbegin(); dstRowIt != reorderedMatrix.createend(); ++dstRowIt) { + auto srcRow = naturalMatrix.begin() + reorderedToNatural[dstRowIt.index()]; + // For elements in A + for (auto elem = srcRow->begin(); elem != srcRow->end(); elem++) { + dstRowIt.insert(elem.index()); + } + } + return std::unique_ptr(new auto(GPUM::fromMatrix(reorderedMatrix, true))); +} + +template +inline std::tuple, std::unique_ptr> +extractLowerAndUpperMatrices(const M& naturalMatrix, const std::vector& reorderedToNatural) +{ + const size_t new_nnz = (naturalMatrix.nonzeroes() - naturalMatrix.N()) / 2; + + M reorderedLower(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise); + M reorderedUpper(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise); + + for (auto lowerIt = reorderedLower.createbegin(), upperIt = reorderedUpper.createbegin(); + lowerIt != reorderedLower.createend(); + ++lowerIt, ++upperIt) { + + auto srcRow = naturalMatrix.begin() + reorderedToNatural[lowerIt.index()]; + + for (auto elem = srcRow->begin(); elem != srcRow->end(); ++elem) { + if (elem.index() < srcRow.index()) { // add index to lower matrix if under the diagonal + lowerIt.insert(elem.index()); + } else if (elem.index() > srcRow.index()) { // add element to upper matrix if above the diagonal + upperIt.insert(elem.index()); + } + } + } + + return {std::unique_ptr(new auto(GPUM::fromMatrix(reorderedLower, true))), + std::unique_ptr(new auto(GPUM::fromMatrix(reorderedUpper, true)))}; +} +} // namespace Opm::cuistl::detail + +#endif diff --git a/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.cu b/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.cu index 34bccf533..46c249586 100644 --- a/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.cu +++ b/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.cu @@ -16,425 +16,17 @@ You should have received a copy of the GNU General Public License along with OPM. If not, see . 
*/ +#include #include #include +#include +#include #include -#include namespace Opm::cuistl::detail { namespace { - - // TODO: figure out if this can be generalized effectively, this seems excessively verbose - // explicit formulas based on Dune cpu code - template - __device__ __forceinline__ void invBlockOutOfPlace(const T* __restrict__ srcBlock, T* __restrict__ dstBlock) - { - if (blocksize == 1) { - dstBlock[0] = 1.0 / (srcBlock[0]); - } else if (blocksize == 2) { - T detInv = 1.0 / (srcBlock[0] * srcBlock[3] - srcBlock[1] * srcBlock[2]); - dstBlock[0] = srcBlock[3] * detInv; - dstBlock[1] = -srcBlock[1] * detInv; - dstBlock[2] = -srcBlock[2] * detInv; - dstBlock[3] = srcBlock[0] * detInv; - } else if (blocksize == 3) { - // based on Dune implementation - T t4 = srcBlock[0] * srcBlock[4]; - T t6 = srcBlock[0] * srcBlock[5]; - T t8 = srcBlock[1] * srcBlock[3]; - T t10 = srcBlock[2] * srcBlock[3]; - T t12 = srcBlock[1] * srcBlock[6]; - T t14 = srcBlock[2] * srcBlock[6]; - - T t17 = 1.0 - / (t4 * srcBlock[8] - t6 * srcBlock[7] - t8 * srcBlock[8] + t10 * srcBlock[7] + t12 * srcBlock[5] - - t14 * srcBlock[4]); // t17 is 1/determinant - - dstBlock[0] = (srcBlock[4] * srcBlock[8] - srcBlock[5] * srcBlock[7]) * t17; - dstBlock[1] = -(srcBlock[1] * srcBlock[8] - srcBlock[2] * srcBlock[7]) * t17; - dstBlock[2] = (srcBlock[1] * srcBlock[5] - srcBlock[2] * srcBlock[4]) * t17; - dstBlock[3] = -(srcBlock[3] * srcBlock[8] - srcBlock[5] * srcBlock[6]) * t17; - dstBlock[4] = (srcBlock[0] * srcBlock[8] - t14) * t17; - dstBlock[5] = -(t6 - t10) * t17; - dstBlock[6] = (srcBlock[3] * srcBlock[7] - srcBlock[4] * srcBlock[6]) * t17; - dstBlock[7] = -(srcBlock[0] * srcBlock[7] - t12) * t17; - dstBlock[8] = (t4 - t8) * t17; - } - } - - // explicit formulas based on Dune cpu code - template - __device__ __forceinline__ void invBlockInPlace(T* __restrict__ block) - { - if (blocksize == 1) { - block[0] = 1.0 / (block[0]); - } else if (blocksize == 2) { - T detInv = 1.0 / (block[0] * block[3] - block[1] * block[2]); - - T temp = block[0]; - block[0] = block[3] * detInv; - block[1] = -block[1] * detInv; - block[2] = -block[2] * detInv; - block[3] = temp * detInv; - } else if (blocksize == 3) { - T t4 = block[0] * block[4]; - T t6 = block[0] * block[5]; - T t8 = block[1] * block[3]; - T t10 = block[2] * block[3]; - T t12 = block[1] * block[6]; - T t14 = block[2] * block[6]; - - T det = (t4 * block[8] - t6 * block[7] - t8 * block[8] + t10 * block[7] + t12 * block[5] - t14 * block[4]); - T t17 = T(1.0) / det; - - T matrix01 = block[1]; - T matrix00 = block[0]; - T matrix10 = block[3]; - T matrix11 = block[4]; - - block[0] = (block[4] * block[8] - block[5] * block[7]) * t17; - block[1] = -(block[1] * block[8] - block[2] * block[7]) * t17; - block[2] = (matrix01 * block[5] - block[2] * block[4]) * t17; - block[3] = -(block[3] * block[8] - block[5] * block[6]) * t17; - block[4] = (matrix00 * block[8] - t14) * t17; - block[5] = -(t6 - t10) * t17; - block[6] = (matrix10 * block[7] - matrix11 * block[6]) * t17; - block[7] = -(matrix00 * block[7] - t12) * t17; - block[8] = (t4 - t8) * t17; - } - } - - enum class MVType { SET, PLUS, MINUS }; - // SET: c = A*b - // PLS: c += A*b - // MINUS: c -= A*b - template - __device__ __forceinline__ void matrixVectorProductWithAction(const T* A, const T* b, T* c) - { - for (int i = 0; i < blocksize; ++i) { - if (OP == MVType::SET) { - c[i] = 0; - } - - for (int j = 0; j < blocksize; ++j) { - if (OP == MVType::SET || OP == MVType::PLUS) { - c[i] += A[i * blocksize + j] * b[j]; - } else if 
(OP == MVType::MINUS) { - c[i] -= A[i * blocksize + j] * b[j]; - } - } - } - } - - template - __device__ __forceinline__ void mv(const T* a, const T* b, T* c) - { - matrixVectorProductWithAction(a, b, c); - } - - template - __device__ __forceinline__ void umv(const T* a, const T* b, T* c) - { - matrixVectorProductWithAction(a, b, c); - } - - template - __device__ __forceinline__ void mmv(const T* a, const T* b, T* c) - { - matrixVectorProductWithAction(a, b, c); - } - - // dst -= A*B*C - template - __device__ __forceinline__ void mmx2Subtraction(T* A, T* B, T* C, T* dst) - { - - T tmp[blocksize * blocksize] = {0}; - // tmp = A*B - for (int i = 0; i < blocksize; ++i) { - for (int k = 0; k < blocksize; ++k) { - for (int j = 0; j < blocksize; ++j) { - tmp[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j]; - } - } - } - - // dst = tmp*C - for (int i = 0; i < blocksize; ++i) { - for (int k = 0; k < blocksize; ++k) { - for (int j = 0; j < blocksize; ++j) { - dst[i * blocksize + j] -= tmp[i * blocksize + k] * C[k * blocksize + j]; - } - } - } - } - - template - __global__ void - cuInvertDiagonalAndFlatten(T* matNonZeroValues, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec) - { - const auto row = blockDim.x * blockIdx.x + threadIdx.x; - - if (row < numberOfRows) { - size_t nnzIdx = rowIndices[row]; - size_t nnzIdxLim = rowIndices[row + 1]; - - // this loop will cause some extra checks that we are within the limit in the case of the diagonal having a - // zero element - while (colIndices[nnzIdx] != row && nnzIdx <= nnzIdxLim) { - ++nnzIdx; - } - - // diagBlock points to the start of where the diagonal block is stored - T* diagBlock = &matNonZeroValues[blocksize * blocksize * nnzIdx]; - // vecBlock points to the start of the block element in the vector where the inverse of the diagonal block - // element should be stored - T* vecBlock = &vec[blocksize * blocksize * row]; - - invBlockOutOfPlace(diagBlock, vecBlock); - } - } - - template - __global__ void cuComputeLowerSolveLevelSet(T* mat, - int* rowIndices, - int* colIndices, - int* indexConversion, - int startIdx, - int rowsInLevelSet, - const T* dInv, - const T* d, - T* v) - { - const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); - if (reorderedRowIdx < rowsInLevelSet + startIdx) { - - const size_t nnzIdx = rowIndices[reorderedRowIdx]; - const int naturalRowIdx = indexConversion[reorderedRowIdx]; - - T rhs[blocksize]; - for (int i = 0; i < blocksize; i++) { - rhs[i] = d[naturalRowIdx * blocksize + i]; - } - - for (int block = nnzIdx; colIndices[block] < naturalRowIdx; ++block) { - const int col = colIndices[block]; - mmv(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); - } - - mv(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); - } - } - - template - __global__ void cuComputeLowerSolveLevelSetSplit(T* mat, - int* rowIndices, - int* colIndices, - int* indexConversion, - int startIdx, - int rowsInLevelSet, - const T* dInv, - const T* d, - T* v) - { - const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); - if (reorderedRowIdx < rowsInLevelSet + startIdx) { - - const size_t nnzIdx = rowIndices[reorderedRowIdx]; - const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1]; - const int naturalRowIdx = indexConversion[reorderedRowIdx]; - - T rhs[blocksize]; - for (int i = 0; i < blocksize; i++) { - rhs[i] = d[naturalRowIdx * blocksize + i]; - } - - // TODO: removce the first condition in the for loop - for (int block = 
nnzIdx; block < nnzIdxLim; ++block) { - const int col = colIndices[block]; - mmv(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); - } - - mv(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); - } - } - - - template - __global__ void cuComputeUpperSolveLevelSet(T* mat, - int* rowIndices, - int* colIndices, - int* indexConversion, - int startIdx, - int rowsInLevelSet, - const T* dInv, - T* v) - { - const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); - if (reorderedRowIdx < rowsInLevelSet + startIdx) { - const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1]; - const int naturalRowIdx = indexConversion[reorderedRowIdx]; - - T rhs[blocksize] = {0}; - for (int block = nnzIdxLim - 1; colIndices[block] > naturalRowIdx; --block) { - const int col = colIndices[block]; - umv(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); - } - - mmv(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); - } - } - - template - __global__ void cuComputeUpperSolveLevelSetSplit(T* mat, - int* rowIndices, - int* colIndices, - int* indexConversion, - int startIdx, - int rowsInLevelSet, - const T* dInv, - T* v) - { - const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); - if (reorderedRowIdx < rowsInLevelSet + startIdx) { - const size_t nnzIdx = rowIndices[reorderedRowIdx]; - const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1]; - const int naturalRowIdx = indexConversion[reorderedRowIdx]; - - T rhs[blocksize] = {0}; - for (int block = nnzIdx; block < nnzIdxLim; ++block) { - const int col = colIndices[block]; - umv(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); - } - - mmv(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); - } - } - - template - __global__ void cuComputeDiluDiagonal(T* mat, - int* rowIndices, - int* colIndices, - int* reorderedToNatural, - int* naturalToReordered, - const int startIdx, - int rowsInLevelSet, - T* dInv) - { - const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x; - if (reorderedRowIdx < rowsInLevelSet + startIdx) { - const int naturalRowIdx = reorderedToNatural[reorderedRowIdx]; - const size_t nnzIdx = rowIndices[reorderedRowIdx]; - - int diagIdx = nnzIdx; - while (colIndices[diagIdx] != naturalRowIdx) { - ++diagIdx; - } - - T dInvTmp[blocksize * blocksize]; - for (int i = 0; i < blocksize; ++i) { - for (int j = 0; j < blocksize; ++j) { - dInvTmp[i * blocksize + j] = mat[diagIdx * blocksize * blocksize + i * blocksize + j]; - } - } - - for (int block = nnzIdx; colIndices[block] < naturalRowIdx; ++block) { - const int col = naturalToReordered[colIndices[block]]; - // find element with indices swapped - // Binary search over block in the right row, [rowIndices[col], rowindices[col+1]-1] defines the range - // we binary search over - int left = rowIndices[col]; - int right = rowIndices[col + 1] - 1; - int mid = left + (right - left) / 2; // overflow-safe average; - - while (left <= right) { - const int col = colIndices[mid]; - - if (col == naturalRowIdx) { - break; - } else if (col < naturalRowIdx) { - left = mid + 1; - } else { - right = mid - 1; - } - mid = left + (right - left) / 2; // overflow-safe average - } - - const int symOpposite = mid; - - mmx2Subtraction(&mat[block * blocksize * blocksize], - &dInv[col * blocksize * blocksize], - &mat[symOpposite * blocksize * blocksize], - dInvTmp); - } - - invBlockInPlace(dInvTmp); - - for (int i = 0; i < blocksize; 
++i) { - for (int j = 0; j < blocksize; ++j) { - dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j]; - } - } - } - } - - template - __global__ void cuComputeDiluDiagonalSplit(T* reorderedLowerMat, - int* lowerRowIndices, - int* lowerColIndices, - T* reorderedUpperMat, - int* upperRowIndices, - int* upperColIndices, - T* diagonal, - int* reorderedToNatural, - int* naturalToReordered, - const int startIdx, - int rowsInLevelSet, - T* dInv) - { - const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x; - if (reorderedRowIdx < rowsInLevelSet + startIdx) { - const int naturalRowIdx = reorderedToNatural[reorderedRowIdx]; - const size_t lowerRowStart = lowerRowIndices[reorderedRowIdx]; - const size_t lowerRowEnd = lowerRowIndices[reorderedRowIdx + 1]; - - T dInvTmp[blocksize * blocksize]; - for (int i = 0; i < blocksize; ++i) { - for (int j = 0; j < blocksize; ++j) { - dInvTmp[i * blocksize + j] = diagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j]; - } - } - - for (int block = lowerRowStart; block < lowerRowEnd; ++block) { - const int col = naturalToReordered[lowerColIndices[block]]; - - int symOppositeIdx = upperRowIndices[col]; - for (; symOppositeIdx < upperRowIndices[col + 1]; ++symOppositeIdx) { - if (naturalRowIdx == upperColIndices[symOppositeIdx]) { - break; - } - } - - const int symOppositeBlock = symOppositeIdx; - - mmx2Subtraction(&reorderedLowerMat[block * blocksize * blocksize], - &dInv[col * blocksize * blocksize], - &reorderedUpperMat[symOppositeBlock * blocksize * blocksize], - dInvTmp); - } - - invBlockInPlace(dInvTmp); - - for (int i = 0; i < blocksize; ++i) { - for (int j = 0; j < blocksize; ++j) { - dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j]; - } - } - } - } - template __global__ void cuMoveDataToReordered( T* srcMatrix, int* srcRowIndices, T* dstMatrix, int* dstRowIndices, int* indexConversion, size_t numberOfRows) @@ -505,195 +97,21 @@ namespace } } } - - // Kernel here is the function object of the cuda kernel - template - inline int getCudaRecomendedThreadBlockSize(Kernel k, int suggestedThrBlockSize=-1) - { - if (suggestedThrBlockSize != -1){ - return suggestedThrBlockSize; - } -// Use cuda API to maximize occupancy, otherwise we just pick a thread block size if it is not tuned -#if USE_HIP - return 512; -#else - int blockSize = 1024; - int tmpGridSize; - std::ignore = cudaOccupancyMaxPotentialBlockSize(&tmpGridSize, &blockSize, k, 0, 0); - return blockSize; -#endif - } - - inline int getNumberOfBlocks(int wantedThreads, int threadBlockSize) - { - return (wantedThreads + threadBlockSize - 1) / threadBlockSize; - } - } // namespace template void -invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec) +copyMatDataToReordered(T* srcMatrix, + int* srcRowIndices, + T* dstMatrix, + int* dstRowIndices, + int* naturalToReordered, + size_t numberOfRows, + int thrBlockSize) { - if (blocksize <= 3) { - int threadBlockSize = getCudaRecomendedThreadBlockSize(cuInvertDiagonalAndFlatten); - int nThreadBlocks = getNumberOfBlocks(numberOfRows, threadBlockSize); - cuInvertDiagonalAndFlatten - <<>>(mat, rowIndices, colIndices, numberOfRows, vec); - } else { - OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3"); - } -} - -// perform the lower solve for all rows in the same level set -template -void -computeLowerSolveLevelSet(T* reorderedMat, - int* rowIndices, - int* 
colIndices, - int* indexConversion, - int startIdx, - int rowsInLevelSet, - const T* dInv, - const T* d, - T* v, - int thrBlockSize) -{ - int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSet, thrBlockSize); - int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuComputeLowerSolveLevelSet<<>>( - reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v); -} - - -template -void -computeLowerSolveLevelSetSplit(T* reorderedMat, - int* rowIndices, - int* colIndices, - int* indexConversion, - int startIdx, - int rowsInLevelSet, - const T* dInv, - const T* d, - T* v, - int thrBlockSize) -{ - int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit, thrBlockSize); - int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuComputeLowerSolveLevelSetSplit<<>>( - reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v); -} -// perform the upper solve for all rows in the same level set -template -void -computeUpperSolveLevelSet(T* reorderedMat, - int* rowIndices, - int* colIndices, - int* indexConversion, - int startIdx, - int rowsInLevelSet, - const T* dInv, - T* v, - int thrBlockSize) -{ - int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeUpperSolveLevelSet, thrBlockSize); - int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuComputeUpperSolveLevelSet<<>>( - reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); -} - -template -void -computeUpperSolveLevelSetSplit(T* reorderedMat, - int* rowIndices, - int* colIndices, - int* indexConversion, - int startIdx, - int rowsInLevelSet, - const T* dInv, - T* v, - int thrBlockSize) -{ - int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeUpperSolveLevelSetSplit, thrBlockSize); - int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuComputeUpperSolveLevelSetSplit<<>>( - reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); -} - -template -void -computeDiluDiagonal(T* reorderedMat, - int* rowIndices, - int* colIndices, - int* reorderedToNatural, - int* naturalToReordered, - const int startIdx, - int rowsInLevelSet, - T* dInv, - int thrBlockSize) -{ - if (blocksize <= 3) { - int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeDiluDiagonal, thrBlockSize); - int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuComputeDiluDiagonal - <<>>(reorderedMat, - rowIndices, - colIndices, - reorderedToNatural, - naturalToReordered, - startIdx, - rowsInLevelSet, - dInv); - } else { - OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3"); - } -} - -template -void -computeDiluDiagonalSplit(T* reorderedLowerMat, - int* lowerRowIndices, - int* lowerColIndices, - T* reorderedUpperMat, - int* upperRowIndices, - int* upperColIndices, - T* diagonal, - int* reorderedToNatural, - int* naturalToReordered, - const int startIdx, - int rowsInLevelSet, - T* dInv, - int thrBlockSize) -{ - if (blocksize <= 3) { - int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeDiluDiagonalSplit, thrBlockSize); - int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuComputeDiluDiagonalSplit<<>>(reorderedLowerMat, - lowerRowIndices, - lowerColIndices, - reorderedUpperMat, - upperRowIndices, - upperColIndices, - diagonal, - reorderedToNatural, - naturalToReordered, - startIdx, - 
rowsInLevelSet, - dInv); - } else { - OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3"); - } -} - -template -void -copyMatDataToReordered( - T* srcMatrix, int* srcRowIndices, T* dstMatrix, int* dstRowIndices, int* naturalToReordered, size_t numberOfRows, - int thrBlockSize) -{ - int threadBlockSize = getCudaRecomendedThreadBlockSize(cuMoveDataToReordered, thrBlockSize); - int nThreadBlocks = getNumberOfBlocks(numberOfRows, threadBlockSize); + int threadBlockSize + = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuMoveDataToReordered, thrBlockSize); + int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize); cuMoveDataToReordered<<>>( srcMatrix, srcRowIndices, dstMatrix, dstRowIndices, naturalToReordered, numberOfRows); } @@ -712,8 +130,9 @@ copyMatDataToReorderedSplit(T* srcMatrix, size_t numberOfRows, int thrBlockSize) { - int threadBlockSize = getCudaRecomendedThreadBlockSize(cuMoveDataToReorderedSplit, thrBlockSize); - int nThreadBlocks = getNumberOfBlocks(numberOfRows, threadBlockSize); + int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize( + cuMoveDataToReorderedSplit, thrBlockSize); + int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize); cuMoveDataToReorderedSplit<<>>(srcMatrix, srcRowIndices, srcColumnIndices, @@ -727,16 +146,8 @@ copyMatDataToReorderedSplit(T* srcMatrix, } #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \ - template void invertDiagonalAndFlatten(T*, int*, int*, size_t, T*); \ - template void copyMatDataToReordered(T*, int*, T*, int*, int*, size_t, int); \ - template void copyMatDataToReorderedSplit(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t, int); \ - template void computeDiluDiagonal(T*, int*, int*, int*, int*, const int, int, T*, int); \ - template void computeDiluDiagonalSplit( \ - T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, T*, int); \ - template void computeUpperSolveLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int); \ - template void computeLowerSolveLevelSet(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); \ - template void computeUpperSolveLevelSetSplit(T*, int*, int*, int*, int, int, const T*, T*, int); \ - template void computeLowerSolveLevelSetSplit(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); + template void copyMatDataToReordered(T*, int*, T*, int*, int*, size_t, int); \ + template void copyMatDataToReorderedSplit(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t, int); INSTANTIATE_KERNEL_WRAPPERS(float, 1); INSTANTIATE_KERNEL_WRAPPERS(float, 2); diff --git a/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp b/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp index 5e48539d1..e2c161092 100644 --- a/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp +++ b/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp @@ -22,189 +22,6 @@ #include namespace Opm::cuistl::detail { - -/** - * @brief This function receives a matrix, and the inverse of the matrix containing only its diagonal is stored in d_vec - * @param mat pointer to GPU memory containing nonzerovalues of the sparse matrix - * @param rowIndices Pointer to vector on GPU containing row indices compliant wiht bsr format - * @param colIndices Pointer to vector on GPU containing col indices compliant wiht bsr format - * @param numberOfRows Integer describing the number of rows in the matrix - * @param[out] vec Pointer 
to the vector where the inverse of the diagonal matrix should be stored - */ -template -void invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec); - -/** - * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel - * @param reorderedMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered such - * that rows in the same level sets are contiguous - * @param rowIndices Pointer to vector on GPU containing row indices compliant wiht bsr format - * @param colIndices Pointer to vector on GPU containing col indices compliant wiht bsr format - * @param indexConversion Integer array containing mapping an index in the reordered matrix to its corresponding index - * in the natural ordered matrix - * @param startIdx Index of the first row of the matrix to be solve - * @param rowsInLevelSet Number of rows in this level set, which number the amount of rows solved in parallel by this - * function - * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner. Must be reordered in the same way as - * reorderedMat - * @param d Stores the defect - * @param [out] v Will store the results of the lower solve - */ -template -void computeLowerSolveLevelSet(T* reorderedMat, - int* rowIndices, - int* colIndices, - int* indexConversion, - int startIdx, - int rowsInLevelSet, - const T* dInv, - const T* d, - T* v, - int threadBlockSize); - -/** - * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel - * @param reorderedUpperMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered such - * that rows in the same level sets are contiguous. Thismatrix is assumed to be strictly lower triangular - * @param rowIndices Pointer to vector on GPU containing row indices compliant wiht bsr format - * @param colIndices Pointer to vector on GPU containing col indices compliant wiht bsr format - * @param indexConversion Integer array containing mapping an index in the reordered matrix to its corresponding index - * in the natural ordered matrix - * @param startIdx Index of the first row of the matrix to be solve - * @param rowsInLevelSet Number of rows in this level set, which number the amount of rows solved in parallel by this - * function - * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner. Must be reordered in the same way as - * reorderedUpperMat - * @param d Stores the defect - * @param [out] v Will store the results of the lower solve - */ -template -void computeLowerSolveLevelSetSplit(T* reorderedUpperMat, - int* rowIndices, - int* colIndices, - int* indexConversion, - int startIdx, - int rowsInLevelSet, - const T* dInv, - const T* d, - T* v, - int threadBlockSize); - -/** - * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel - * @param reorderedMat pointer to GPU memory containing nonzerovalues of the sparse matrix. 
The matrix reordered such - * that rows in the same level sets are contiguous - * @param rowIndices Pointer to vector on GPU containing row indices compliant wiht bsr format - * @param colIndices Pointer to vector on GPU containing col indices compliant wiht bsr format - * @param indexConversion Integer array containing mapping an index in the reordered matrix to its corresponding index - * in the natural ordered matrix - * @param startIdx Index of the first row of the matrix to be solve - * @param rowsInLevelSet Number of rows in this level set, which number the amount of rows solved in parallel by this - * function - * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner - * @param [out] v Will store the results of the lower solve. To begin with it should store the output from the lower - * solve - */ -template -void computeUpperSolveLevelSet(T* reorderedMat, - int* rowIndices, - int* colIndices, - int* indexConversion, - int startIdx, - int rowsInLevelSet, - const T* dInv, - T* v, - int threadBlockSize); -template - -/** - * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel - * @param reorderedUpperMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered such - * that rows in the same level sets are contiguous. This matrix is assumed to be strictly upper triangular - * @param rowIndices Pointer to vector on GPU containing row indices compliant wiht bsr format - * @param colIndices Pointer to vector on GPU containing col indices compliant wiht bsr format - * @param indexConversion Integer array containing mapping an index in the reordered matrix to its corresponding index - * in the natural ordered matrix - * @param startIdx Index of the first row of the matrix to be solve - * @param rowsInLevelSet Number of rows in this level set, which number the amount of rows solved in parallel by this - * function - * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner - * @param [out] v Will store the results of the lower solve. To begin with it should store the output from the lower - * solve - */ -void computeUpperSolveLevelSetSplit(T* reorderedUpperMat, - int* rowIndices, - int* colIndices, - int* indexConversion, - int startIdx, - int rowsInLevelSet, - const T* dInv, - T* v, - int threadBlockSize); - -/** - * @brief Computes the ILU0 of the diagonal elements of the reordered matrix and stores it in a reordered vector - * containing the diagonal blocks - * @param reorderedMat pointer to GPU memory containing nonzerovalues of the sparse matrix. 
The matrix reordered such - * that rows in the same level sets are contiguous - * @param rowIndices Pointer to vector on GPU containing row indices compliant wiht bsr format - * @param colIndices Pointer to vector on GPU containing col indices compliant wiht bsr format - * @param reorderedToNatural Integer array containing mapping an index in the reordered matrix to its corresponding - * index in the natural ordered matrix - * @param naturalToreordered Integer array containing mapping an index in the reordered matrix to its corresponding - * index in the natural ordered matrix - * @param startIdx Index of the first row of the matrix to be solve - * @param rowsInLevelSet Number of rows in this level set, which number the amount of rows solved in parallel by this - * function - * @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner - */ -template -void computeDiluDiagonal(T* reorderedMat, - int* rowIndices, - int* colIndices, - int* reorderedToNatural, - int* naturalToReordered, - int startIdx, - int rowsInLevelSet, - T* dInv, - int threadBlockSize); -template - -/** - * @brief Computes the ILU0 of the diagonal elements of the split reordered matrix and stores it in a reordered vector - * containing the diagonal blocks - * @param reorderedLowerMat pointer to GPU memory containing nonzerovalues of the strictly lower triangular sparse matrix. The matrix reordered such - * that rows in the same level sets are contiguous - * @param lowerRowIndices Pointer to vector on GPU containing row indices of the lower matrix compliant wiht bsr format - * @param lowerColIndices Pointer to vector on GPU containing col indices of the lower matrix compliant wiht bsr format - * @param reorderedUpperMat pointer to GPU memory containing nonzerovalues of the strictly upper triangular sparse matrix. 
The matrix reordered such - * that rows in the same level sets are contiguous - * @param upperRowIndices Pointer to vector on GPU containing row indices of the upper matrix compliant wiht bsr format - * @param upperColIndices Pointer to vector on GPU containing col indices of the upper matrix compliant wiht bsr format - * @param reorderedToNatural Integer array containing mapping an index in the reordered matrix to its corresponding - * index in the natural ordered matrix - * @param diagonal The diagonal elements of the reordered matrix - * @param naturalToreordered Integer array containing mapping an index in the reordered matrix to its corresponding - * index in the natural ordered matrix - * @param startIdx Index of the first row of the matrix to be solve - * @param rowsInLevelSet Number of rows in this level set, which number the amount of rows solved in parallel by this - * function - * @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner - */ -void computeDiluDiagonalSplit(T* reorderedLowerMat, - int* lowerRowIndices, - int* lowerColIndices, - T* reorderedUpperMat, - int* upperRowIndices, - int* upperColIndices, - T* diagonal, - int* reorderedToNatural, - int* naturalToReordered, - int startIdx, - int rowsInLevelSet, - T* dInv, - int threadBlockSize); - /** * @brief Reorders the elements of a matrix by copying them from one matrix to another using a permutation list * @param srcMatrix The source matrix we will copy data from @@ -216,26 +33,40 @@ void computeDiluDiagonalSplit(T* reorderedLowerMat, * @param numberOfRows The number of rows in the matrices */ template -void copyMatDataToReordered( - T* srcMatrix, int* srcRowIndices, T* dstMatrix, int* dstRowIndices, int* naturalToReordered, size_t numberOfRows, int threadBlockSize); +void copyMatDataToReordered(T* srcMatrix, + int* srcRowIndices, + T* dstMatrix, + int* dstRowIndices, + int* naturalToReordered, + size_t numberOfRows, + int threadBlockSize); /** * @brief Reorders the elements of a matrix by copying them from one matrix to a split matrix using a permutation list * @param srcMatrix The source matrix we will copy data from * @param srcRowIndices Pointer to vector on GPU containing row indices for the source matrix compliant wiht bsr format * @param [out] dstLowerMatrix The destination of entries that originates from the strictly lower triangular matrix - * @param dstRowIndices Pointer to vector on GPU containing rww indices for the destination lower matrix compliant wiht bsr - * format + * @param dstRowIndices Pointer to vector on GPU containing rww indices for the destination lower matrix compliant wiht + * bsr format * @param [out] dstUpperMatrix The destination of entries that originates from the strictly upper triangular matrix - * @param dstRowIndices Pointer to vector on GPU containing riw indices for the destination upper matrix compliant wiht bsr - * format + * @param dstRowIndices Pointer to vector on GPU containing riw indices for the destination upper matrix compliant wiht + * bsr format * @param [out] dstDiag The destination buffer for the diagonal part of the matrix * @param naturalToReordered Permuation list that converts indices in the src matrix to the indices in the dst matrix * @param numberOfRows The number of rows in the matrices */ template -void copyMatDataToReorderedSplit( - T* srcMatrix, int* srcRowIndices, int* srcColumnIndices, T* dstLowerMatrix, int* dstLowerRowIndices, T* dstUpperMatrix, int* dstUpperRowIndices, T* dstDiag, int* naturalToReordered, size_t numberOfRows, int 
threadBlockSize); +void copyMatDataToReorderedSplit(T* srcMatrix, + int* srcRowIndices, + int* srcColumnIndices, + T* dstLowerMatrix, + int* dstLowerRowIndices, + T* dstUpperMatrix, + int* dstUpperRowIndices, + T* dstDiag, + int* naturalToReordered, + size_t numberOfRows, + int threadBlockSize); } // namespace Opm::cuistl::detail #endif diff --git a/opm/simulators/linalg/cuistl/detail/deviceBlockOperations.hpp b/opm/simulators/linalg/cuistl/detail/deviceBlockOperations.hpp new file mode 100644 index 000000000..b3475591a --- /dev/null +++ b/opm/simulators/linalg/cuistl/detail/deviceBlockOperations.hpp @@ -0,0 +1,234 @@ +/* + Copyright 2024 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#ifndef OPM_CUISTL_DEVICE_BLOCK_OPERATIONS_HPP +#define OPM_CUISTL_DEVICE_BLOCK_OPERATIONS_HPP + +#include +#include + +/* + This file provides inlineable functions intended for CUDA kernels operating on block matrix elements + The functions provides various matrix operations that are used by the preconditioners. +*/ + +namespace +{ +// TODO: figure out if this can be generalized effectively, this seems excessively verbose +// explicit formulas based on Dune cpu code +template +__device__ __forceinline__ void +invBlockOutOfPlace(const T* __restrict__ srcBlock, T* __restrict__ dstBlock) +{ + if (blocksize == 1) { + dstBlock[0] = 1.0 / (srcBlock[0]); + } else if (blocksize == 2) { + T detInv = 1.0 / (srcBlock[0] * srcBlock[3] - srcBlock[1] * srcBlock[2]); + dstBlock[0] = srcBlock[3] * detInv; + dstBlock[1] = -srcBlock[1] * detInv; + dstBlock[2] = -srcBlock[2] * detInv; + dstBlock[3] = srcBlock[0] * detInv; + } else if (blocksize == 3) { + // based on Dune implementation + T t4 = srcBlock[0] * srcBlock[4]; + T t6 = srcBlock[0] * srcBlock[5]; + T t8 = srcBlock[1] * srcBlock[3]; + T t10 = srcBlock[2] * srcBlock[3]; + T t12 = srcBlock[1] * srcBlock[6]; + T t14 = srcBlock[2] * srcBlock[6]; + + T t17 = 1.0 + / (t4 * srcBlock[8] - t6 * srcBlock[7] - t8 * srcBlock[8] + t10 * srcBlock[7] + t12 * srcBlock[5] + - t14 * srcBlock[4]); // t17 is 1/determinant + + dstBlock[0] = (srcBlock[4] * srcBlock[8] - srcBlock[5] * srcBlock[7]) * t17; + dstBlock[1] = -(srcBlock[1] * srcBlock[8] - srcBlock[2] * srcBlock[7]) * t17; + dstBlock[2] = (srcBlock[1] * srcBlock[5] - srcBlock[2] * srcBlock[4]) * t17; + dstBlock[3] = -(srcBlock[3] * srcBlock[8] - srcBlock[5] * srcBlock[6]) * t17; + dstBlock[4] = (srcBlock[0] * srcBlock[8] - t14) * t17; + dstBlock[5] = -(t6 - t10) * t17; + dstBlock[6] = (srcBlock[3] * srcBlock[7] - srcBlock[4] * srcBlock[6]) * t17; + dstBlock[7] = -(srcBlock[0] * srcBlock[7] - t12) * t17; + dstBlock[8] = (t4 - t8) * t17; + } +} + +// explicit formulas based on Dune cpu code +template +__device__ __forceinline__ void +invBlockInPlace(T* __restrict__ block) +{ + if (blocksize == 1) { + block[0] = 1.0 / (block[0]); + } else if (blocksize == 2) { + T detInv = 1.0 / (block[0] * block[3] - block[1] * block[2]); + + T temp = 
block[0]; + block[0] = block[3] * detInv; + block[1] = -block[1] * detInv; + block[2] = -block[2] * detInv; + block[3] = temp * detInv; + } else if (blocksize == 3) { + T t4 = block[0] * block[4]; + T t6 = block[0] * block[5]; + T t8 = block[1] * block[3]; + T t10 = block[2] * block[3]; + T t12 = block[1] * block[6]; + T t14 = block[2] * block[6]; + + T det = (t4 * block[8] - t6 * block[7] - t8 * block[8] + t10 * block[7] + t12 * block[5] - t14 * block[4]); + T t17 = T(1.0) / det; + + T matrix01 = block[1]; + T matrix00 = block[0]; + T matrix10 = block[3]; + T matrix11 = block[4]; + + block[0] = (block[4] * block[8] - block[5] * block[7]) * t17; + block[1] = -(block[1] * block[8] - block[2] * block[7]) * t17; + block[2] = (matrix01 * block[5] - block[2] * block[4]) * t17; + block[3] = -(block[3] * block[8] - block[5] * block[6]) * t17; + block[4] = (matrix00 * block[8] - t14) * t17; + block[5] = -(t6 - t10) * t17; + block[6] = (matrix10 * block[7] - matrix11 * block[6]) * t17; + block[7] = -(matrix00 * block[7] - t12) * t17; + block[8] = (t4 - t8) * t17; + } +} + +enum class MVType { SET, PLUS, MINUS }; +// SET: c = A*b +// PLS: c += A*b +// MINUS: c -= A*b +template +__device__ __forceinline__ void +matrixVectorProductWithAction(const T* A, const T* b, T* c) +{ + for (int i = 0; i < blocksize; ++i) { + if (OP == MVType::SET) { + c[i] = 0; + } + + for (int j = 0; j < blocksize; ++j) { + if (OP == MVType::SET || OP == MVType::PLUS) { + c[i] += A[i * blocksize + j] * b[j]; + } else if (OP == MVType::MINUS) { + c[i] -= A[i * blocksize + j] * b[j]; + } + } + } +} + +template +__device__ __forceinline__ void +mv(const T* a, const T* b, T* c) +{ + matrixVectorProductWithAction(a, b, c); +} + +template +__device__ __forceinline__ void +umv(const T* a, const T* b, T* c) +{ + matrixVectorProductWithAction(a, b, c); +} + +template +__device__ __forceinline__ void +mmv(const T* a, const T* b, T* c) +{ + matrixVectorProductWithAction(a, b, c); +} + +// dst -= A*B*C +template +__device__ __forceinline__ void +mmx2Subtraction(T* A, T* B, T* C, T* dst) +{ + + T tmp[blocksize * blocksize] = {0}; + // tmp = A*B + for (int i = 0; i < blocksize; ++i) { + for (int k = 0; k < blocksize; ++k) { + for (int j = 0; j < blocksize; ++j) { + tmp[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j]; + } + } + } + + // dst = tmp*C + for (int i = 0; i < blocksize; ++i) { + for (int k = 0; k < blocksize; ++k) { + for (int j = 0; j < blocksize; ++j) { + dst[i * blocksize + j] -= tmp[i * blocksize + k] * C[k * blocksize + j]; + } + } + } +} + +// C = A*B, assumes the three buffers do not overlap +template +__device__ __forceinline__ void +mmNoOverlap(T* A, T* B, T* C) +{ + for (int i = 0; i < blocksize; ++i) { + for (int k = 0; k < blocksize; ++k) { + for (int j = 0; j < blocksize; ++j) { + C[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j]; + } + } + } +} + +// C = A*B, buffers may overlap +template +__device__ __forceinline__ void +mmOverlap(T* A, T* B, T* C) +{ + T tmp[blocksize * blocksize] = {0}; + for (int i = 0; i < blocksize; ++i) { + for (int k = 0; k < blocksize; ++k) { + for (int j = 0; j < blocksize; ++j) { + tmp[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j]; + } + } + } + + for (int i = 0; i < blocksize; ++i) { + for (int j = 0; j < blocksize; ++j) { + C[i * blocksize + j] = tmp[i * blocksize + j]; + } + } +} + +// A -= B +template +__device__ __forceinline__ void +matrixSubtraction(T* A, T* B) +{ + + for (int i = 0; i < blocksize; ++i) { + for (int j = 0; j < 
blocksize; ++j) { + A[i * blocksize + j] -= B[i * blocksize + j]; + } + } +} +} // namespace + +#endif diff --git a/opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp b/opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp new file mode 100644 index 000000000..28b5a679a --- /dev/null +++ b/opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp @@ -0,0 +1,66 @@ +/* + Copyright 2024 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ +#ifndef OPM_GPU_THREAD_UTILS_HPP +#define OPM_GPU_THREAD_UTILS_HPP +#include +#include +#include +#include + +/* + This file provides some logic for handling how to choose the correct thread-block size +*/ +namespace Opm::cuistl::detail +{ +constexpr inline size_t +getThreads([[maybe_unused]] size_t numberOfRows) +{ + return 1024; +} + +inline size_t +getBlocks(size_t numberOfRows) +{ + const auto threads = getThreads(numberOfRows); + return (numberOfRows + threads - 1) / threads; +} + +// Kernel here is the function object of the cuda kernel +template +inline int +getCudaRecomendedThreadBlockSize(Kernel k, int suggestedThrBlockSize = -1) +{ + if (suggestedThrBlockSize != -1) { + return suggestedThrBlockSize; + } + int blockSize; + int tmpGridSize; + OPM_CUDA_SAFE_CALL(cudaOccupancyMaxPotentialBlockSize(&tmpGridSize, &blockSize, k, 0, 0)); + return blockSize; +} + +inline int +getNumberOfBlocks(int wantedThreads, int threadBlockSize) +{ + return (wantedThreads + threadBlockSize - 1) / threadBlockSize; +} + +} // namespace Opm::cuistl::detail + +#endif diff --git a/opm/simulators/linalg/cuistl/detail/preconditionerKernels/DILUKernels.cu b/opm/simulators/linalg/cuistl/detail/preconditionerKernels/DILUKernels.cu new file mode 100644 index 000000000..800a7c0cd --- /dev/null +++ b/opm/simulators/linalg/cuistl/detail/preconditionerKernels/DILUKernels.cu @@ -0,0 +1,437 @@ +/* + Copyright 2024 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . 
+*/ +#include +#include +#include +#include +#include +#include + +namespace Opm::cuistl::detail::DILU +{ +namespace +{ + + template + __global__ void cuSolveLowerLevelSet(T* mat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* dInv, + const T* d, + T* v) + { + const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); + if (reorderedRowIdx < rowsInLevelSet + startIdx) { + + const size_t nnzIdx = rowIndices[reorderedRowIdx]; + const int naturalRowIdx = indexConversion[reorderedRowIdx]; + + T rhs[blocksize]; + for (int i = 0; i < blocksize; i++) { + rhs[i] = d[naturalRowIdx * blocksize + i]; + } + + for (int block = nnzIdx; colIndices[block] < naturalRowIdx; ++block) { + const int col = colIndices[block]; + mmv(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); + } + + mv(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); + } + } + + template + __global__ void cuSolveLowerLevelSetSplit(T* mat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* dInv, + const T* d, + T* v) + { + const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); + if (reorderedRowIdx < rowsInLevelSet + startIdx) { + + const size_t nnzIdx = rowIndices[reorderedRowIdx]; + const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1]; + const int naturalRowIdx = indexConversion[reorderedRowIdx]; + + T rhs[blocksize]; + for (int i = 0; i < blocksize; i++) { + rhs[i] = d[naturalRowIdx * blocksize + i]; + } + + // TODO: removce the first condition in the for loop + for (int block = nnzIdx; block < nnzIdxLim; ++block) { + const int col = colIndices[block]; + mmv(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); + } + + mv(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); + } + } + + + template + __global__ void cuSolveUpperLevelSet(T* mat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* dInv, + T* v) + { + const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); + if (reorderedRowIdx < rowsInLevelSet + startIdx) { + const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1]; + const int naturalRowIdx = indexConversion[reorderedRowIdx]; + + T rhs[blocksize] = {0}; + for (int block = nnzIdxLim - 1; colIndices[block] > naturalRowIdx; --block) { + const int col = colIndices[block]; + umv(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); + } + + mmv(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); + } + } + + template + __global__ void cuSolveUpperLevelSetSplit(T* mat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* dInv, + T* v) + { + const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); + if (reorderedRowIdx < rowsInLevelSet + startIdx) { + const size_t nnzIdx = rowIndices[reorderedRowIdx]; + const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1]; + const int naturalRowIdx = indexConversion[reorderedRowIdx]; + + T rhs[blocksize] = {0}; + for (int block = nnzIdx; block < nnzIdxLim; ++block) { + const int col = colIndices[block]; + umv(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); + } + + mmv(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); + } + } + + template + __global__ void 
cuComputeDiluDiagonal(T* mat, + int* rowIndices, + int* colIndices, + int* reorderedToNatural, + int* naturalToReordered, + const int startIdx, + int rowsInLevelSet, + T* dInv) + { + const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x; + if (reorderedRowIdx < rowsInLevelSet + startIdx) { + const int naturalRowIdx = reorderedToNatural[reorderedRowIdx]; + const size_t nnzIdx = rowIndices[reorderedRowIdx]; + + int diagIdx = nnzIdx; + while (colIndices[diagIdx] != naturalRowIdx) { + ++diagIdx; + } + + T dInvTmp[blocksize * blocksize]; + for (int i = 0; i < blocksize; ++i) { + for (int j = 0; j < blocksize; ++j) { + dInvTmp[i * blocksize + j] = mat[diagIdx * blocksize * blocksize + i * blocksize + j]; + } + } + + for (int block = nnzIdx; colIndices[block] < naturalRowIdx; ++block) { + const int col = naturalToReordered[colIndices[block]]; + // find element with indices swapped + // Binary search over block in the right row, [rowIndices[col], rowindices[col+1]-1] defines the range + // we binary search over + int left = rowIndices[col]; + int right = rowIndices[col + 1] - 1; + int mid; + + while (left <= right) { + mid = left + (right - left) / 2; // overflow-safe average + const int col = colIndices[mid]; + + if (col == naturalRowIdx) { + break; + } else if (col < naturalRowIdx) { + left = mid + 1; + } else { + right = mid - 1; + } + } + + const int symOpposite = mid; + + mmx2Subtraction(&mat[block * blocksize * blocksize], + &dInv[col * blocksize * blocksize], + &mat[symOpposite * blocksize * blocksize], + dInvTmp); + } + + invBlockInPlace(dInvTmp); + + for (int i = 0; i < blocksize; ++i) { + for (int j = 0; j < blocksize; ++j) { + dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j]; + } + } + } + } + + template + __global__ void cuComputeDiluDiagonalSplit(T* reorderedLowerMat, + int* lowerRowIndices, + int* lowerColIndices, + T* reorderedUpperMat, + int* upperRowIndices, + int* upperColIndices, + T* diagonal, + int* reorderedToNatural, + int* naturalToReordered, + const int startIdx, + int rowsInLevelSet, + T* dInv) + { + const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x; + if (reorderedRowIdx < rowsInLevelSet + startIdx) { + const int naturalRowIdx = reorderedToNatural[reorderedRowIdx]; + const size_t lowerRowStart = lowerRowIndices[reorderedRowIdx]; + const size_t lowerRowEnd = lowerRowIndices[reorderedRowIdx + 1]; + + T dInvTmp[blocksize * blocksize]; + for (int i = 0; i < blocksize; ++i) { + for (int j = 0; j < blocksize; ++j) { + dInvTmp[i * blocksize + j] = diagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j]; + } + } + + for (int block = lowerRowStart; block < lowerRowEnd; ++block) { + const int col = naturalToReordered[lowerColIndices[block]]; + + int symOppositeIdx = upperRowIndices[col]; + for (; symOppositeIdx < upperRowIndices[col + 1]; ++symOppositeIdx) { + if (naturalRowIdx == upperColIndices[symOppositeIdx]) { + break; + } + } + + const int symOppositeBlock = symOppositeIdx; + + mmx2Subtraction(&reorderedLowerMat[block * blocksize * blocksize], + &dInv[col * blocksize * blocksize], + &reorderedUpperMat[symOppositeBlock * blocksize * blocksize], + dInvTmp); + } + + invBlockInPlace(dInvTmp); + + for (int i = 0; i < blocksize; ++i) { + for (int j = 0; j < blocksize; ++j) { + dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j]; + } + } + } + } +} // namespace + +// perform the lower solve for all rows in the same level set 
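// Illustrative sketch (not taken from this patch): the dInv produced by the computeDiluDiagonal*
// wrappers further down holds the inverted DILU pivots, dInv_ii = (A_ii - sum over j<i with
// A_ij != 0 of A_ij * dInv_jj * A_ji)^(-1), and the solve wrappers apply one forward sweep
// v_i = dInv_ii * (d_i - sum_{j<i} A_ij * v_j) followed by one backward sweep
// v_i -= dInv_ii * sum_{j>i} A_ij * v_j, one level set at a time. The host-side driver below is a
// minimal sketch of how these wrappers are meant to be chained; applyDiluSketch, levelSetSizes and
// numLevelSets are made-up names, and the wrapper declarations from DILUKernels.hpp are assumed to
// be in scope where this is compiled. Passing -1 as the last argument lets
// getCudaRecomendedThreadBlockSize pick the block size from the occupancy API.
template <class T, int blocksize>
void
applyDiluSketch(T* reorderedMat, int* rowIndices, int* colIndices, int* reorderedToNatural,
                const int* levelSetSizes, int numLevelSets, const T* dInv, const T* d, T* v)
{
    int start = 0;
    for (int l = 0; l < numLevelSets; ++l) { // forward (lower) sweep over the level sets
        solveLowerLevelSet<T, blocksize>(
            reorderedMat, rowIndices, colIndices, reorderedToNatural, start, levelSetSizes[l], dInv, d, v, -1);
        start += levelSetSizes[l];
    }
    for (int l = numLevelSets - 1; l >= 0; --l) { // backward (upper) sweep over the level sets
        start -= levelSetSizes[l];
        solveUpperLevelSet<T, blocksize>(
            reorderedMat, rowIndices, colIndices, reorderedToNatural, start, levelSetSizes[l], dInv, v, -1);
    }
}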
+template +void +solveLowerLevelSet(T* reorderedMat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* dInv, + const T* d, + T* v, + int thrBlockSize) +{ + int threadBlockSize + = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet, thrBlockSize); + int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); + cuSolveLowerLevelSet<<>>( + reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v); +} + + +template +void +solveLowerLevelSetSplit(T* reorderedMat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* dInv, + const T* d, + T* v, + int thrBlockSize) +{ + int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize( + cuSolveLowerLevelSetSplit, thrBlockSize); + int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); + cuSolveLowerLevelSetSplit<<>>( + reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v); +} +// perform the upper solve for all rows in the same level set +template +void +solveUpperLevelSet(T* reorderedMat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* dInv, + T* v, + int thrBlockSize) +{ + int threadBlockSize + = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet, thrBlockSize); + int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); + cuSolveUpperLevelSet<<>>( + reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); +} + +template +void +solveUpperLevelSetSplit(T* reorderedMat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* dInv, + T* v, + int thrBlockSize) +{ + int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize( + cuSolveUpperLevelSetSplit, thrBlockSize); + int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); + cuSolveUpperLevelSetSplit<<>>( + reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); +} + +template +void +computeDiluDiagonal(T* reorderedMat, + int* rowIndices, + int* colIndices, + int* reorderedToNatural, + int* naturalToReordered, + const int startIdx, + int rowsInLevelSet, + T* dInv, + int thrBlockSize) +{ + if (blocksize <= 3) { + int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize( + cuComputeDiluDiagonal, thrBlockSize); + int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); + cuComputeDiluDiagonal<<>>(reorderedMat, + rowIndices, + colIndices, + reorderedToNatural, + naturalToReordered, + startIdx, + rowsInLevelSet, + dInv); + } else { + OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3"); + } +} + +template +void +computeDiluDiagonalSplit(T* reorderedLowerMat, + int* lowerRowIndices, + int* lowerColIndices, + T* reorderedUpperMat, + int* upperRowIndices, + int* upperColIndices, + T* diagonal, + int* reorderedToNatural, + int* naturalToReordered, + const int startIdx, + int rowsInLevelSet, + T* dInv, + int thrBlockSize) +{ + if (blocksize <= 3) { + int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize( + cuComputeDiluDiagonalSplit, thrBlockSize); + int nThreadBlocks = 
::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); + cuComputeDiluDiagonalSplit<<>>(reorderedLowerMat, + lowerRowIndices, + lowerColIndices, + reorderedUpperMat, + upperRowIndices, + upperColIndices, + diagonal, + reorderedToNatural, + naturalToReordered, + startIdx, + rowsInLevelSet, + dInv); + } else { + OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3"); + } +} + +#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \ + template void computeDiluDiagonal(T*, int*, int*, int*, int*, const int, int, T*, int); \ + template void computeDiluDiagonalSplit( \ + T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, T*, int); \ + template void solveUpperLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int); \ + template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); \ + template void solveUpperLevelSetSplit(T*, int*, int*, int*, int, int, const T*, T*, int); \ + template void solveLowerLevelSetSplit(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); + +INSTANTIATE_KERNEL_WRAPPERS(float, 1); +INSTANTIATE_KERNEL_WRAPPERS(float, 2); +INSTANTIATE_KERNEL_WRAPPERS(float, 3); +INSTANTIATE_KERNEL_WRAPPERS(float, 4); +INSTANTIATE_KERNEL_WRAPPERS(float, 5); +INSTANTIATE_KERNEL_WRAPPERS(float, 6); +INSTANTIATE_KERNEL_WRAPPERS(double, 1); +INSTANTIATE_KERNEL_WRAPPERS(double, 2); +INSTANTIATE_KERNEL_WRAPPERS(double, 3); +INSTANTIATE_KERNEL_WRAPPERS(double, 4); +INSTANTIATE_KERNEL_WRAPPERS(double, 5); +INSTANTIATE_KERNEL_WRAPPERS(double, 6); +} // namespace Opm::cuistl::detail::DILU diff --git a/opm/simulators/linalg/cuistl/detail/preconditionerKernels/DILUKernels.hpp b/opm/simulators/linalg/cuistl/detail/preconditionerKernels/DILUKernels.hpp new file mode 100644 index 000000000..9660028ec --- /dev/null +++ b/opm/simulators/linalg/cuistl/detail/preconditionerKernels/DILUKernels.hpp @@ -0,0 +1,202 @@ +/* + Copyright 2024 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ +#ifndef OPM_DILU_KERNELS_HPP +#define OPM_DILU_KERNELS_HPP + +#include +#include +#include +#include + +namespace Opm::cuistl::detail::DILU +{ + +/** + * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel + * @param reorderedMat pointer to GPU memory containing nonzerovalues of the sparse matrix. 
The matrix reordered such + * that rows in the same level sets are contiguous + * @param rowIndices Pointer to vector on GPU containing row indices compliant wiht bsr format + * @param colIndices Pointer to vector on GPU containing col indices compliant wiht bsr format + * @param indexConversion Integer array containing mapping an index in the reordered matrix to its corresponding index + * in the natural ordered matrix + * @param startIdx Index of the first row of the matrix to be solve + * @param rowsInLevelSet Number of rows in this level set, which number the amount of rows solved in parallel by this + * function + * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner. Must be reordered in the same way as + * reorderedMat + * @param d Stores the defect + * @param [out] v Will store the results of the lower solve + */ +template +void solveLowerLevelSet(T* reorderedMat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* dInv, + const T* d, + T* v, + int threadBlockSize); + +/** + * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel + * @param reorderedUpperMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered + * such that rows in the same level sets are contiguous. Thismatrix is assumed to be strictly lower triangular + * @param rowIndices Pointer to vector on GPU containing row indices compliant wiht bsr format + * @param colIndices Pointer to vector on GPU containing col indices compliant wiht bsr format + * @param indexConversion Integer array containing mapping an index in the reordered matrix to its corresponding index + * in the natural ordered matrix + * @param startIdx Index of the first row of the matrix to be solve + * @param rowsInLevelSet Number of rows in this level set, which number the amount of rows solved in parallel by this + * function + * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner. Must be reordered in the same way as + * reorderedUpperMat + * @param d Stores the defect + * @param [out] v Will store the results of the lower solve + */ +template +void solveLowerLevelSetSplit(T* reorderedUpperMat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* dInv, + const T* d, + T* v, + int threadBlockSize); + +/** + * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel + * @param reorderedMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered such + * that rows in the same level sets are contiguous + * @param rowIndices Pointer to vector on GPU containing row indices compliant wiht bsr format + * @param colIndices Pointer to vector on GPU containing col indices compliant wiht bsr format + * @param indexConversion Integer array containing mapping an index in the reordered matrix to its corresponding index + * in the natural ordered matrix + * @param startIdx Index of the first row of the matrix to be solve + * @param rowsInLevelSet Number of rows in this level set, which number the amount of rows solved in parallel by this + * function + * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner + * @param [out] v Will store the results of the lower solve. 
To begin with it should store the output from the lower + * solve + */ +template +void solveUpperLevelSet(T* reorderedMat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* dInv, + T* v, + int threadBlockSize); + +/** + * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel + * @param reorderedUpperMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered + * such that rows in the same level sets are contiguous. This matrix is assumed to be strictly upper triangular + * @param rowIndices Pointer to vector on GPU containing row indices compliant wiht bsr format + * @param colIndices Pointer to vector on GPU containing col indices compliant wiht bsr format + * @param indexConversion Integer array containing mapping an index in the reordered matrix to its corresponding index + * in the natural ordered matrix + * @param startIdx Index of the first row of the matrix to be solve + * @param rowsInLevelSet Number of rows in this level set, which number the amount of rows solved in parallel by this + * function + * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner + * @param [out] v Will store the results of the lower solve. To begin with it should store the output from the lower + * solve + */ +template +void solveUpperLevelSetSplit(T* reorderedUpperMat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* dInv, + T* v, + int threadBlockSize); + +/** + * @brief Computes the ILU0 of the diagonal elements of the reordered matrix and stores it in a reordered vector + * containing the diagonal blocks + * @param reorderedMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered such + * that rows in the same level sets are contiguous + * @param rowIndices Pointer to vector on GPU containing row indices compliant wiht bsr format + * @param colIndices Pointer to vector on GPU containing col indices compliant wiht bsr format + * @param reorderedToNatural Integer array containing mapping an index in the reordered matrix to its corresponding + * index in the natural ordered matrix + * @param naturalToreordered Integer array containing mapping an index in the reordered matrix to its corresponding + * index in the natural ordered matrix + * @param startIdx Index of the first row of the matrix to be solve + * @param rowsInLevelSet Number of rows in this level set, which number the amount of rows solved in parallel by this + * function + * @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner + */ +template +void computeDiluDiagonal(T* reorderedMat, + int* rowIndices, + int* colIndices, + int* reorderedToNatural, + int* naturalToReordered, + int startIdx, + int rowsInLevelSet, + T* dInv, + int threadBlockSize); +template + +/** + * @brief Computes the ILU0 of the diagonal elements of the split reordered matrix and stores it in a reordered vector + * containing the diagonal blocks + * @param reorderedLowerMat pointer to GPU memory containing nonzerovalues of the strictly lower triangular sparse + * matrix. 
The matrix reordered such that rows in the same level sets are contiguous + * @param lowerRowIndices Pointer to vector on GPU containing row indices of the lower matrix compliant wiht bsr format + * @param lowerColIndices Pointer to vector on GPU containing col indices of the lower matrix compliant wiht bsr format + * @param reorderedUpperMat pointer to GPU memory containing nonzerovalues of the strictly upper triangular sparse + * matrix. The matrix reordered such that rows in the same level sets are contiguous + * @param upperRowIndices Pointer to vector on GPU containing row indices of the upper matrix compliant wiht bsr format + * @param upperColIndices Pointer to vector on GPU containing col indices of the upper matrix compliant wiht bsr format + * @param reorderedToNatural Integer array containing mapping an index in the reordered matrix to its corresponding + * index in the natural ordered matrix + * @param diagonal The diagonal elements of the reordered matrix + * @param naturalToreordered Integer array containing mapping an index in the reordered matrix to its corresponding + * index in the natural ordered matrix + * @param startIdx Index of the first row of the matrix to be solve + * @param rowsInLevelSet Number of rows in this level set, which number the amount of rows solved in parallel by this + * function + * @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner + */ +void computeDiluDiagonalSplit(T* reorderedLowerMat, + int* lowerRowIndices, + int* lowerColIndices, + T* reorderedUpperMat, + int* upperRowIndices, + int* upperColIndices, + T* diagonal, + int* reorderedToNatural, + int* naturalToReordered, + int startIdx, + int rowsInLevelSet, + T* dInv, + int threadBlockSize); + +} // namespace Opm::cuistl::detail::DILU +#endif diff --git a/opm/simulators/linalg/cuistl/detail/preconditionerKernels/ILU0Kernels.cu b/opm/simulators/linalg/cuistl/detail/preconditionerKernels/ILU0Kernels.cu new file mode 100644 index 000000000..ce047b854 --- /dev/null +++ b/opm/simulators/linalg/cuistl/detail/preconditionerKernels/ILU0Kernels.cu @@ -0,0 +1,476 @@ +/* + Copyright 2024 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . 
+*/ +#include +#include +#include +#include +#include +#include + +/* + The LU factorization and apply step is written based on the Dune implementations +*/ + +namespace Opm::cuistl::detail::ILU0 +{ +namespace +{ + template + __global__ void cuLUFactorization(T* srcMatrix, + int* srcRowIndices, + int* srcColumnIndices, + int* naturalToReordered, + int* reorderedToNatual, + size_t rowsInLevelSet, + int startIdx) + { + auto reorderedIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x; + constexpr int scalarsInBlock = blocksize * blocksize; + + if (reorderedIdx < rowsInLevelSet + startIdx) { + int naturalIdx = reorderedToNatual[reorderedIdx]; + // for each element under the diagonal + int endOfRowI = srcRowIndices[reorderedIdx + 1]; + int ij; + for (ij = srcRowIndices[reorderedIdx]; + ij < srcRowIndices[reorderedIdx + 1] && srcColumnIndices[ij] < naturalIdx; + ++ij) { + // consider the block we are looking at to be A_ij + // we want to set B_ij = A_ij * A_jj, where A_jj is a diagonal element on a previous row, meaning it is + // already inverted find the A_jj block + int jj; // index in bcsr of the A_jj element + int j = naturalToReordered[srcColumnIndices[ij]]; + int startOfRowJ = srcRowIndices[j]; + int endOfRowJ = srcRowIndices[j + 1]; + for (int blockInRowJ = startOfRowJ; blockInRowJ < endOfRowJ; ++blockInRowJ) { + if (srcColumnIndices[blockInRowJ] == srcColumnIndices[ij]) { + jj = blockInRowJ; + break; + } + } + + // the DUNE implementation is inplace, and I need to take care to make sure + // this out of place implementation is equivalent + mmOverlap( + &srcMatrix[ij * scalarsInBlock], &srcMatrix[jj * scalarsInBlock], &srcMatrix[ij * scalarsInBlock]); + + // we have now accounted for an element under the diagonal and the diagonal element above it + // now iterate over the blocks that align in the + int jk = jj + 1; + int ik = ij + 1; + // this code is NOT gpu friendly, thread divergence will probably slow this down significantly... + while (ik != endOfRowI && jk != endOfRowJ) { + if (srcColumnIndices[ik] == srcColumnIndices[jk]) { + // A_jk = A_ij * A_jk + T tmp[scalarsInBlock] = {0}; + mmNoOverlap( + &srcMatrix[ij * scalarsInBlock], &srcMatrix[jk * scalarsInBlock], tmp); + matrixSubtraction(&srcMatrix[ik * scalarsInBlock], tmp); + ++ik; + ++jk; + } else { + if (srcColumnIndices[ik] < srcColumnIndices[jk]) { + ++ik; + } else { + ++jk; + } + } + } + } + invBlockInPlace(&srcMatrix[ij * scalarsInBlock]); + } + } + + // An index in the BCSR matrix can be in one of three states, above, on, or above the diagonal + enum class POSITION_TYPE { UNDER_DIAG, ON_DIAG, ABOVE_DIAG }; + // this handles incrementation of an index on a row that is in a lower/upper split datastructure. + // The point is that if we are in the lower or upper part we increment as usual. + // if we reach the diagonal, we update the state, the index is not needed. 
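    // Illustration (made-up indices): for a row whose strictly-lower blocks occupy indices 3 and 4
    // (so lastLowerIdx == 5) and whose strictly-upper blocks start at firstUpperIdx == 7, starting
    // from idx == 3 in state UNDER_DIAG the successive calls give idx == 4 (still UNDER_DIAG), then
    // state ON_DIAG with idx left alone (the diagonal block lives in its own buffer), then idx == 7
    // in state ABOVE_DIAG, then idx == 8, and so on to the end of the upper part of the row.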
+ // when we increment from the diagonal, we set it to be the first block in the upper part of that row + __device__ __forceinline__ void + incrementAcrossSplitStructure(int& idx, POSITION_TYPE& currentState, int lastLowerIdx, int firstUpperIdx) + { + if (currentState == POSITION_TYPE::UNDER_DIAG) { + if (idx != lastLowerIdx - 1) { // we should increment to be on the diagonal + ++idx; + } else { + currentState = POSITION_TYPE::ON_DIAG; + } + } else if (currentState == POSITION_TYPE::ON_DIAG) { + idx = firstUpperIdx; + currentState = POSITION_TYPE::ABOVE_DIAG; + } else { // currentState = POSITION_TYPE::ABOVE_DIAG + ++idx; + } + } + + template + __global__ void cuLUFactorizationSplit(T* reorderedLowerMat, + int* lowerRowIndices, + int* lowerColIndices, + T* reorderedUpperMat, + int* upperRowIndices, + int* upperColIndices, + T* diagonal, + int* reorderedToNatural, + int* naturalToReordered, + const int startIdx, + int rowsInLevelSet) + { + auto reorderedIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x; + constexpr int scalarsInBlock = blocksize * blocksize; + + if (reorderedIdx < rowsInLevelSet + startIdx) { + // if (reorderedIdx < rowsInLevelSet){ + int naturalIdx = reorderedToNatural[reorderedIdx]; + // for each element under the diagonal + int endOfRowILower = lowerRowIndices[reorderedIdx + 1]; + int endOfRowIUpper = upperRowIndices[reorderedIdx + 1]; + int startOfRowILower = lowerRowIndices[reorderedIdx]; + int startOfRowIUpper = upperRowIndices[reorderedIdx]; + for (int ij = startOfRowILower; ij < endOfRowILower; ++ij) { + + // consider the block we are looking at to be A_ij + // we want to set B_ij = A_ij * A_jj, where A_jj is a diagonal element on a previous row, meaning it is + // already inverted find the A_jj block + int j = naturalToReordered[lowerColIndices[ij]]; + int endOfRowJ = upperRowIndices[j + 1]; + int startOfRowJ = upperRowIndices[j]; + + // the DUNE implementation is inplace, and I need to take care to make sure + // this out of place implementation is equivalent + mmOverlap(&reorderedLowerMat[ij * scalarsInBlock], + &diagonal[j * scalarsInBlock], + &reorderedLowerMat[ij * scalarsInBlock]); + + // we have now accounted for an element under the diagonal and the diagonal element above it + // now iterate over the blocks that align in the + int jk = startOfRowJ; + POSITION_TYPE ikState = POSITION_TYPE::UNDER_DIAG; // it is guaranteed that the ij element we consider + // is under the diagonal + int ik = ij; + incrementAcrossSplitStructure(ik, ikState, endOfRowILower, startOfRowIUpper); + // this code is NOT gpu friendly, thread divergence will probably slow this down significantly... 
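                // The loop below performs the standard ILU0 row update: with A_ij already rescaled to
                // A_ij * A_jj^(-1) above, every column k that is present both in row i (lower part, diagonal or
                // upper part, traversed with incrementAcrossSplitStructure) and in the strictly-upper part of
                // row j receives A_ik -= A_ij * A_jk; columns present in only one of the two rows are skipped.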
+ + // checking for ik != endOfRowUpper is not sufficient alone because + // the same block index might be found in the lower part of the matrix + while (!(ik == endOfRowIUpper && ikState == POSITION_TYPE::ABOVE_DIAG) && jk != endOfRowJ) { + + T* ikBlockPtr; + int ikColumn; + if (ikState == POSITION_TYPE::UNDER_DIAG) { + ikBlockPtr = &reorderedLowerMat[ik * scalarsInBlock]; + ikColumn = lowerColIndices[ik]; + } else if (ikState == POSITION_TYPE::ON_DIAG) { + ikBlockPtr = &diagonal[reorderedIdx * scalarsInBlock]; + ikColumn = naturalIdx; + } else { // ikState == POSITION_TYPE::ABOVE_DIAG + ikBlockPtr = &reorderedUpperMat[ik * scalarsInBlock]; + ikColumn = upperColIndices[ik]; + } + + if (ikColumn == upperColIndices[jk]) { + // A_jk = A_ij * A_jk + T tmp[scalarsInBlock] = {0}; + + mmNoOverlap( + &reorderedLowerMat[ij * scalarsInBlock], &reorderedUpperMat[jk * scalarsInBlock], tmp); + matrixSubtraction(ikBlockPtr, tmp); + incrementAcrossSplitStructure(ik, ikState, endOfRowILower, startOfRowIUpper); + ; + ++jk; + } else { + if (ikColumn < upperColIndices[jk]) { + incrementAcrossSplitStructure(ik, ikState, endOfRowILower, startOfRowIUpper); + ; + } else { + ++jk; + } + } + } + } + invBlockInPlace(&diagonal[reorderedIdx * scalarsInBlock]); + } + } + + template + __global__ void cuSolveLowerLevelSet(T* mat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* d, + T* v) + { + auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); + if (reorderedIdx < rowsInLevelSet + startIdx) { + + const size_t nnzIdx = rowIndices[reorderedIdx]; + const int naturalRowIdx = indexConversion[reorderedIdx]; + + T rhs[blocksize]; + for (int i = 0; i < blocksize; i++) { + rhs[i] = d[naturalRowIdx * blocksize + i]; + } + + int block = nnzIdx; + for (; colIndices[block] < naturalRowIdx; ++block) { + const int col = colIndices[block]; + mmv(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); + } + for (int i = 0; i < blocksize; ++i) { + v[naturalRowIdx * blocksize + i] = rhs[i]; + } + } + } + + template + __global__ void cuSolveUpperLevelSet( + T* mat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, T* v) + { + auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); + if (reorderedIdx < rowsInLevelSet + startIdx) { + + const size_t nnzIdxLim = rowIndices[reorderedIdx + 1]; + const int naturalRowIdx = indexConversion[reorderedIdx]; + + T rhs[blocksize]; + for (int i = 0; i < blocksize; i++) { + rhs[i] = v[naturalRowIdx * blocksize + i]; + } + + int block = nnzIdxLim - 1; + for (; colIndices[block] > naturalRowIdx; --block) { + const int col = colIndices[block]; + mmv(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); + } + + mv(&mat[block * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); + } + } + + template + __global__ void cuSolveLowerLevelSetSplit(T* mat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* d, + T* v) + { + auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); + if (reorderedIdx < rowsInLevelSet + startIdx) { + const size_t nnzIdx = rowIndices[reorderedIdx]; + const size_t nnzIdxLim = rowIndices[reorderedIdx + 1]; + const int naturalRowIdx = indexConversion[reorderedIdx]; + + T rhs[blocksize]; + for (int i = 0; i < blocksize; i++) { + rhs[i] = d[naturalRowIdx * blocksize + i]; + } + + for (int block = nnzIdx; block < nnzIdxLim; ++block) { + const int col = 
colIndices[block]; + mmv(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); + } + for (int i = 0; i < blocksize; ++i) { + v[naturalRowIdx * blocksize + i] = rhs[i]; + } + } + } + + template + __global__ void cuSolveUpperLevelSetSplit(T* mat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* dInv, + T* v) + { + auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); + if (reorderedIdx < rowsInLevelSet + startIdx) { + const size_t nnzIdx = rowIndices[reorderedIdx]; + const size_t nnzIdxLim = rowIndices[reorderedIdx + 1]; + const int naturalRowIdx = indexConversion[reorderedIdx]; + + T rhs[blocksize]; + for (int i = 0; i < blocksize; i++) { + rhs[i] = v[naturalRowIdx * blocksize + i]; + } + + for (int block = nnzIdx; block < nnzIdxLim; ++block) { + const int col = colIndices[block]; + mmv(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); + } + + mv(&dInv[reorderedIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); + } + } +} // namespace + +template +void +solveLowerLevelSet(T* reorderedMat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* d, + T* v, + int thrBlockSize) +{ + int threadBlockSize + = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet, thrBlockSize); + int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); + cuSolveLowerLevelSet<<>>( + reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v); +} +// perform the upper solve for all rows in the same level set +template +void +solveUpperLevelSet(T* reorderedMat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + T* v, + int thrBlockSize) +{ + int threadBlockSize + = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet, thrBlockSize); + int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); + cuSolveUpperLevelSet<<>>( + reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, v); +} + +template +void +solveLowerLevelSetSplit(T* reorderedMat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* d, + T* v, + int thrBlockSize) +{ + int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize( + cuSolveLowerLevelSetSplit, thrBlockSize); + int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); + cuSolveLowerLevelSetSplit<<>>( + reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v); +} +// perform the upper solve for all rows in the same level set +template +void +solveUpperLevelSetSplit(T* reorderedMat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* dInv, + T* v, + int thrBlockSize) +{ + int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize( + cuSolveUpperLevelSetSplit, thrBlockSize); + int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); + cuSolveUpperLevelSetSplit<<>>( + reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); +} + +template +void +LUFactorization(T* srcMatrix, + int* srcRowIndices, + int* srcColumnIndices, + int* naturalToReordered, + int* reorderedToNatual, + size_t rowsInLevelSet, + int startIdx, + int 
thrBlockSize) +{ + int threadBlockSize + = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorization, thrBlockSize); + int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); + cuLUFactorization<<>>( + srcMatrix, srcRowIndices, srcColumnIndices, naturalToReordered, reorderedToNatual, rowsInLevelSet, startIdx); +} + +template +void +LUFactorizationSplit(T* reorderedLowerMat, + int* lowerRowIndices, + int* lowerColIndices, + T* reorderedUpperMat, + int* upperRowIndices, + int* upperColIndices, + T* diagonal, + int* reorderedToNatural, + int* naturalToReordered, + const int startIdx, + int rowsInLevelSet, + int thrBlockSize) +{ + int threadBlockSize + = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorizationSplit, thrBlockSize); + int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); + cuLUFactorizationSplit<<>>(reorderedLowerMat, + lowerRowIndices, + lowerColIndices, + reorderedUpperMat, + upperRowIndices, + upperColIndices, + diagonal, + reorderedToNatural, + naturalToReordered, + startIdx, + rowsInLevelSet); +} + +#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \ + template void solveUpperLevelSet(T*, int*, int*, int*, int, int, T*, int); \ + template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int); \ + template void LUFactorization(T*, int*, int*, int*, int*, size_t, int, int); \ + template void LUFactorizationSplit( \ + T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, int); \ + template void solveUpperLevelSetSplit(T*, int*, int*, int*, int, int, const T*, T*, int); \ + template void solveLowerLevelSetSplit(T*, int*, int*, int*, int, int, const T*, T*, int); + +INSTANTIATE_KERNEL_WRAPPERS(float, 1); +INSTANTIATE_KERNEL_WRAPPERS(float, 2); +INSTANTIATE_KERNEL_WRAPPERS(float, 3); +INSTANTIATE_KERNEL_WRAPPERS(float, 4); +INSTANTIATE_KERNEL_WRAPPERS(float, 5); +INSTANTIATE_KERNEL_WRAPPERS(float, 6); +INSTANTIATE_KERNEL_WRAPPERS(double, 1); +INSTANTIATE_KERNEL_WRAPPERS(double, 2); +INSTANTIATE_KERNEL_WRAPPERS(double, 3); +INSTANTIATE_KERNEL_WRAPPERS(double, 4); +INSTANTIATE_KERNEL_WRAPPERS(double, 5); +INSTANTIATE_KERNEL_WRAPPERS(double, 6); +} // namespace Opm::cuistl::detail::ILU0 diff --git a/opm/simulators/linalg/cuistl/detail/preconditionerKernels/ILU0Kernels.hpp b/opm/simulators/linalg/cuistl/detail/preconditionerKernels/ILU0Kernels.hpp new file mode 100644 index 000000000..e35a38208 --- /dev/null +++ b/opm/simulators/linalg/cuistl/detail/preconditionerKernels/ILU0Kernels.hpp @@ -0,0 +1,193 @@ +/* + Copyright 2024 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . 
+*/ +#ifndef OPM_ILU0_KERNELS_HPP +#define OPM_ILU0_KERNELS_HPP +#include +#include +namespace Opm::cuistl::detail::ILU0 +{ +
+/** + * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel + * @param reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is reordered such + * that rows in the same level set are contiguous + * @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format + * @param colIndices Pointer to vector on GPU containing col indices compliant with the BSR format + * @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index + * in the naturally ordered matrix + * @param startIdx Index of the first row of the matrix to be solved + * @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this + * function + * @param [out] v Will store the results of the upper solve + * @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen + */ +template +void solveUpperLevelSet(T* reorderedMat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + T* v, + int threadBlockSize); +
+/** + * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel + * @param reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is reordered such + * that rows in the same level set are contiguous + * @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format + * @param colIndices Pointer to vector on GPU containing col indices compliant with the BSR format + * @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index + * in the naturally ordered matrix + * @param startIdx Index of the first row of the matrix to be solved + * @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this + * function + * @param d Stores the defect + * @param [out] v Will store the results of the lower solve + * @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen + */ +template +void solveLowerLevelSet(T* reorderedMat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* d, + T* v, + int threadBlockSize); +
+/** + * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel + * @param reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is reordered + * such that rows in the same level set are contiguous. This matrix is assumed to be strictly upper triangular + * @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format + * @param colIndices Pointer to vector on GPU containing col indices compliant with the BSR format + * @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index + * in the naturally ordered matrix + * @param startIdx Index of the first row of the matrix to be solved + * @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this + * function + * @param dInv The diagonal blocks of the LU factorization, storing the inverses of the diagonal entries + * @param [out] v Will store the results of the upper solve. On entry it must contain the output of the lower + * solve + * @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen + */ +template +void solveUpperLevelSetSplit(T* reorderedMat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* dInv, + T* v, + int threadBlockSize); +
+/** + * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel + * @param reorderedLowerMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is reordered + * such that rows in the same level set are contiguous. This matrix is assumed to be strictly lower triangular + * @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format + * @param colIndices Pointer to vector on GPU containing col indices compliant with the BSR format + * @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index + * in the naturally ordered matrix + * @param startIdx Index of the first row of the matrix to be solved + * @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this + * function + * @param d Stores the defect + * @param [out] v Will store the results of the lower solve + * @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen + */ +template +void solveLowerLevelSetSplit(T* reorderedLowerMat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* d, + T* v, + int threadBlockSize); +
+/** + * @brief Computes the ILU0 factorization of the input BCSR matrix, which is stored in a reordered way. The diagonal + * elements store the inverse of the diagonal entries + * @param [out] reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is reordered + * such that rows in the same level set are contiguous + * @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format + * @param columnIndices Pointer to vector on GPU containing col indices compliant with the BSR format + * @param naturalToReordered Integer array mapping an index in the naturally ordered matrix to its corresponding + * index in the reordered matrix + * @param reorderedToNatural Integer array mapping an index in the reordered matrix to its corresponding + * index in the naturally ordered matrix + * @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this + * function + * @param startIdx Index of the first row of the matrix to be solved + * @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen + */ +template +void LUFactorization(T* reorderedMat, + int* rowIndices, + int* columnIndices, + int* naturalToReordered, + int* reorderedToNatual, + size_t rowsInLevelSet, + int startIdx, + int threadBlockSize); +
+/** + * @brief Computes the ILU0 factorization in-place of a BCSR matrix stored in a split format (lower, diagonal and upper + * triangular part) + * @param [out] reorderedLowerMat pointer to GPU memory containing the nonzero values of the strictly lower triangular sparse + * matrix. The matrix is reordered such that rows in the same level set are contiguous + * @param lowerRowIndices Pointer to vector on GPU containing row indices of the lower matrix compliant with the BSR format + * @param lowerColIndices Pointer to vector on GPU containing col indices of the lower matrix compliant with the BSR format + * @param [out] reorderedUpperMat pointer to GPU memory containing the nonzero values of the strictly upper triangular sparse + * matrix. The matrix is reordered such that rows in the same level set are contiguous + * @param upperRowIndices Pointer to vector on GPU containing row indices of the upper matrix compliant with the BSR format + * @param upperColIndices Pointer to vector on GPU containing col indices of the upper matrix compliant with the BSR format + * @param [out] diagonal The inverted diagonal blocks of the ILU0 factorization + * @param reorderedToNatural Integer array mapping an index in the reordered matrix to its corresponding + * index in the naturally ordered matrix + * @param naturalToReordered Integer array mapping an index in the naturally ordered matrix to its corresponding + * index in the reordered matrix + * @param startIdx Index of the first row of the matrix to be solved + * @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this + * function + * @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen + */ +template +void LUFactorizationSplit(T* reorderedLowerMat, + int* lowerRowIndices, + int* lowerColIndices, + T* reorderedUpperMat, + int* upperRowIndices, + int* upperColIndices, + T* diagonal, + int* reorderedToNatural, + int* naturalToReordered, + int startIdx, + int rowsInLevelSet, + int threadBlockSize); +
+} // namespace Opm::cuistl::detail::ILU0 +#endif diff --git a/opm/simulators/linalg/cuistl/detail/preconditionerKernels/JacKernels.cu b/opm/simulators/linalg/cuistl/detail/preconditionerKernels/JacKernels.cu new file mode 100644 index 000000000..8c9c4c2ce --- /dev/null +++ b/opm/simulators/linalg/cuistl/detail/preconditionerKernels/JacKernels.cu @@ -0,0 +1,88 @@ +/* + Copyright 2024 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see .
+*/ +#include +#include +#include +#include +#include +#include + +namespace Opm::cuistl::detail::JAC +{ +namespace +{ + template + __global__ void + cuInvertDiagonalAndFlatten(T* matNonZeroValues, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec) + { + const auto row = blockDim.x * blockIdx.x + threadIdx.x; + + if (row < numberOfRows) { + size_t nnzIdx = rowIndices[row]; + size_t nnzIdxLim = rowIndices[row + 1]; + + // scan the row for the diagonal block; checking the bound first keeps the column access in range even + // if the diagonal block is missing from the sparsity pattern + while (nnzIdx < nnzIdxLim && colIndices[nnzIdx] != row) { + ++nnzIdx; + } + + // diagBlock points to the start of where the diagonal block is stored + T* diagBlock = &matNonZeroValues[blocksize * blocksize * nnzIdx]; + // vecBlock points to the start of the block element in the vector where the inverse of the diagonal block + // element should be stored + T* vecBlock = &vec[blocksize * blocksize * row]; + + invBlockOutOfPlace(diagBlock, vecBlock); + } + } +} // namespace + +template +void +invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec) +{ + if (blocksize <= 3) { + int threadBlockSize + = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuInvertDiagonalAndFlatten); + int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize); + cuInvertDiagonalAndFlatten + <<>>(mat, rowIndices, colIndices, numberOfRows, vec); + } else { + OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3"); + } +} + +#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \ + template void invertDiagonalAndFlatten(T*, int*, int*, size_t, T*); + +INSTANTIATE_KERNEL_WRAPPERS(float, 1); +INSTANTIATE_KERNEL_WRAPPERS(float, 2); +INSTANTIATE_KERNEL_WRAPPERS(float, 3); +INSTANTIATE_KERNEL_WRAPPERS(float, 4); +INSTANTIATE_KERNEL_WRAPPERS(float, 5); +INSTANTIATE_KERNEL_WRAPPERS(float, 6); +INSTANTIATE_KERNEL_WRAPPERS(double, 1); +INSTANTIATE_KERNEL_WRAPPERS(double, 2); +INSTANTIATE_KERNEL_WRAPPERS(double, 3); +INSTANTIATE_KERNEL_WRAPPERS(double, 4); +INSTANTIATE_KERNEL_WRAPPERS(double, 5); +INSTANTIATE_KERNEL_WRAPPERS(double, 6); + +} // namespace Opm::cuistl::detail::JAC diff --git a/opm/simulators/linalg/cuistl/detail/preconditionerKernels/JacKernels.hpp b/opm/simulators/linalg/cuistl/detail/preconditionerKernels/JacKernels.hpp new file mode 100644 index 000000000..9168097a1 --- /dev/null +++ b/opm/simulators/linalg/cuistl/detail/preconditionerKernels/JacKernels.hpp @@ -0,0 +1,38 @@ +/* + Copyright 2024 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see .
+*/ +#ifndef OPM_JAC_KERNELS_HPP +#define OPM_JAC_KERNELS_HPP +#include +#include +namespace Opm::cuistl::detail::JAC +{ +
+/** + * @brief Computes the inverse of each diagonal block of the sparse matrix and stores the results, flattened, in vec + * @param mat pointer to GPU memory containing the nonzero values of the sparse matrix + * @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format + * @param colIndices Pointer to vector on GPU containing col indices compliant with the BSR format + * @param numberOfRows Integer describing the number of rows in the matrix + * @param[out] vec Pointer to the vector where the inverted diagonal blocks should be stored + */ +template +void invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec); + +} // namespace Opm::cuistl::detail::JAC +#endif diff --git a/opm/simulators/linalg/cuistl/detail/vector_operations.cu b/opm/simulators/linalg/cuistl/detail/vector_operations.cu index 2ca67f53e..39c17609e 100644 --- a/opm/simulators/linalg/cuistl/detail/vector_operations.cu +++ b/opm/simulators/linalg/cuistl/detail/vector_operations.cu @@ -16,14 +16,15 @@ You should have received a copy of the GNU General Public License along with OPM. If not, see . */ +#include #include -#include +#include #include #include #include -#include +#include +#include #include -#include namespace Opm::cuistl::detail { @@ -83,20 +84,8 @@ namespace } } - constexpr inline size_t getThreads([[maybe_unused]] size_t numberOfElements) - { - return 1024; - } - - inline size_t getBlocks(size_t numberOfElements) - { - const auto threads = getThreads(numberOfElements); - return (numberOfElements + threads - 1) / threads; - } - template - __global__ void - prepareSendBufKernel(const T* a, T* buffer, size_t numberOfElements, const int* indices) + __global__ void prepareSendBufKernel(const T* a, T* buffer, size_t numberOfElements, const int* indices) { const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x; @@ -105,8 +94,7 @@ namespace } } template - __global__ void - syncFromRecvBufKernel(T* a, T* buffer, size_t numberOfElements, const int* indices) + __global__ void syncFromRecvBufKernel(T* a, T* buffer, size_t numberOfElements, const int* indices) { const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x; @@ -116,33 +104,13 @@ namespace } } // namespace -// Kernel here is the function object of the cuda kernel -template -inline int getCudaRecomendedThreadBlockSize(Kernel k) -{ -#if USE_HIP - return 512; -#else - int blockSize; - int tmpGridSize; - std::ignore = cudaOccupancyMaxPotentialBlockSize(&tmpGridSize, &blockSize, k, 0, 0); - return blockSize; -#endif -} - -inline int getNumberOfBlocks(int wantedThreads, int threadBlockSize) -{ - return (wantedThreads + threadBlockSize - 1) / threadBlockSize; -} - template void setVectorValue(T* deviceData, size_t numberOfElements, const T& value) { - int threadBlockSize = getCudaRecomendedThreadBlockSize(setVectorValueKernel); - int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize); - setVectorValueKernel<<>>( - deviceData, numberOfElements, value); + int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(setVectorValueKernel); + int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize); + setVectorValueKernel<<>>(deviceData, numberOfElements, value); } template void setVectorValue(double*, size_t, const double&); @@ -153,10 +121,9 @@ template void setZeroAtIndexSet(T* deviceData,
size_t numberOfElements, const int* indices) { - int threadBlockSize = getCudaRecomendedThreadBlockSize(setZeroAtIndexSetKernel); - int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize); - setZeroAtIndexSetKernel<<>>( - deviceData, numberOfElements, indices); + int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(setZeroAtIndexSetKernel); + int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize); + setZeroAtIndexSetKernel<<>>(deviceData, numberOfElements, indices); } template void setZeroAtIndexSet(double*, size_t, const int*); template void setZeroAtIndexSet(float*, size_t, const int*); @@ -164,12 +131,16 @@ template void setZeroAtIndexSet(int*, size_t, const int*); template T -innerProductAtIndices(cublasHandle_t cublasHandle, const T* deviceA, const T* deviceB, T* buffer, size_t numberOfElements, const int* indices) +innerProductAtIndices(cublasHandle_t cublasHandle, + const T* deviceA, + const T* deviceB, + T* buffer, + size_t numberOfElements, + const int* indices) { - int threadBlockSize = getCudaRecomendedThreadBlockSize(elementWiseMultiplyKernel); - int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize); - elementWiseMultiplyKernel<<>>( - deviceA, deviceB, buffer, numberOfElements, indices); + int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(elementWiseMultiplyKernel); + int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize); + elementWiseMultiplyKernel<<>>(deviceA, deviceB, buffer, numberOfElements, indices); // TODO: [perf] Get rid of the allocation here. CuVector oneVector(numberOfElements); @@ -184,11 +155,12 @@ template float innerProductAtIndices(cublasHandle_t, const float*, const float*, template int innerProductAtIndices(cublasHandle_t, const int*, const int*, int* buffer, size_t, const int*); template -void prepareSendBuf(const T* deviceA, T* buffer, size_t numberOfElements, const int* indices) +void +prepareSendBuf(const T* deviceA, T* buffer, size_t numberOfElements, const int* indices) { - int threadBlockSize = getCudaRecomendedThreadBlockSize(prepareSendBufKernel); - int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize); - prepareSendBufKernel<<>>(deviceA, buffer, numberOfElements, indices); + int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(prepareSendBufKernel); + int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize); + prepareSendBufKernel<<>>(deviceA, buffer, numberOfElements, indices); OPM_CUDA_SAFE_CALL(cudaDeviceSynchronize()); // The buffers are prepared for MPI. Wait for them to finish. } template void prepareSendBuf(const double* deviceA, double* buffer, size_t numberOfElements, const int* indices); @@ -196,12 +168,13 @@ template void prepareSendBuf(const float* deviceA, float* buffer, size_t numberO template void prepareSendBuf(const int* deviceA, int* buffer, size_t numberOfElements, const int* indices); template -void syncFromRecvBuf(T* deviceA, T* buffer, size_t numberOfElements, const int* indices) +void +syncFromRecvBuf(T* deviceA, T* buffer, size_t numberOfElements, const int* indices) { - int threadBlockSize = getCudaRecomendedThreadBlockSize(syncFromRecvBufKernel); - int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize); - syncFromRecvBufKernel<<>>(deviceA, buffer, numberOfElements, indices); - //cudaDeviceSynchronize(); // Not needed, I guess... 
+ int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(syncFromRecvBufKernel); + int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize); + syncFromRecvBufKernel<<>>(deviceA, buffer, numberOfElements, indices); + // cudaDeviceSynchronize(); // Not needed, I guess... } template void syncFromRecvBuf(double* deviceA, double* buffer, size_t numberOfElements, const int* indices); template void syncFromRecvBuf(float* deviceA, float* buffer, size_t numberOfElements, const int* indices); @@ -217,30 +190,24 @@ weightedDiagMV(const T* squareBlockVector, T* dstVec) { switch (blocksize) { - case 1: - { - int threadBlockSize = getCudaRecomendedThreadBlockSize(weightedDiagMV); - int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize); - weightedDiagMV<<>>( - squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec); - } - break; - case 2: - { - int threadBlockSize = getCudaRecomendedThreadBlockSize(weightedDiagMV); - int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize); - weightedDiagMV<<>>( - squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec); - } - break; - case 3: - { - int threadBlockSize = getCudaRecomendedThreadBlockSize(weightedDiagMV); - int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize); - weightedDiagMV<<>>( - squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec); - } - break; + case 1: { + int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(weightedDiagMV); + int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize); + weightedDiagMV + <<>>(squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec); + } break; + case 2: { + int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(weightedDiagMV); + int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize); + weightedDiagMV + <<>>(squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec); + } break; + case 3: { + int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(weightedDiagMV); + int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize); + weightedDiagMV + <<>>(squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec); + } break; default: OPM_THROW(std::invalid_argument, "blockvector Hadamard product not implemented for blocksize>3"); break; diff --git a/tests/cuistl/test_cuSparse_matrix_operations.cpp b/tests/cuistl/test_cuSparse_matrix_operations.cpp index d389a298f..fa2e85413 100644 --- a/tests/cuistl/test_cuSparse_matrix_operations.cpp +++ b/tests/cuistl/test_cuSparse_matrix_operations.cpp @@ -28,6 +28,7 @@ #include #include #include +#include using NumericTypes = boost::mpl::list; @@ -87,7 +88,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith3By3Blocks, T, Numeric Opm::cuistl::CuSparseMatrix m = Opm::cuistl::CuSparseMatrix::fromMatrix(B); Opm::cuistl::CuVector dInvDiag(blocksize * blocksize * N); - Opm::cuistl::detail::invertDiagonalAndFlatten( + Opm::cuistl::detail::JAC::invertDiagonalAndFlatten( m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, dInvDiag.data()); std::vector expectedInvDiag {-1.0 / 4.0, @@ -161,7 +162,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith2By2Blocks, T, Numeric Opm::cuistl::CuSparseMatrix m = Opm::cuistl::CuSparseMatrix::fromMatrix(B); 
Opm::cuistl::CuVector dInvDiag(blocksize * blocksize * N); - Opm::cuistl::detail::invertDiagonalAndFlatten( + Opm::cuistl::detail::JAC::invertDiagonalAndFlatten( m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, dInvDiag.data()); std::vector expectedInvDiag {2.0, -2.0, -1.0 / 2.0, 1.0, -1.0, 0.0, 0.0, -1.0}; @@ -171,4 +172,4 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith2By2Blocks, T, Numeric for (size_t i = 0; i < expectedInvDiag.size(); ++i) { BOOST_CHECK_CLOSE(expectedInvDiag[i], computedInvDiag[i], 1e-7); } -} \ No newline at end of file +}
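Reviewer note: the level-set wrappers declared in ILU0Kernels.hpp are designed to be launched once per level set, with the lower solve sweeping the level sets forward and the upper solve sweeping them backward while applying the stored inverted diagonal blocks. The sketch below illustrates how such a host-side driver could be written against the new API. It is not code from this patch: the function applySplitILU0, the levelSetSizes container, the pointer names, and the default threadBlockSize argument are all hypothetical, and the actual sequencing lives inside OpmCuILU0 (not shown here).

#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/ILU0Kernels.hpp>
#include <vector>

// Hypothetical driver (illustration only). All raw pointers are assumed to refer to
// device memory holding the split, reordered ILU0 factors; levelSetSizes holds the
// number of rows in each level set, in sweep order.
template <class T, int blocksize>
void applySplitILU0(T* lowerMat, int* lowerRowIndices, int* lowerColIndices,
                    T* upperMat, int* upperRowIndices, int* upperColIndices,
                    const T* invDiag, int* reorderedToNatural,
                    const std::vector<int>& levelSetSizes,
                    const T* d, T* v, int threadBlockSize = -1)
{
    namespace ILU0 = Opm::cuistl::detail::ILU0;

    // Forward sweep: lower solve, one kernel launch per level set.
    int startIdx = 0;
    for (const int rows : levelSetSizes) {
        ILU0::solveLowerLevelSetSplit<T, blocksize>(
            lowerMat, lowerRowIndices, lowerColIndices, reorderedToNatural,
            startIdx, rows, d, v, threadBlockSize);
        startIdx += rows;
    }

    // Backward sweep: upper solve, walking the level sets in reverse order and
    // applying the inverted diagonal blocks computed during factorization.
    for (auto it = levelSetSizes.rbegin(); it != levelSetSizes.rend(); ++it) {
        startIdx -= *it;
        ILU0::solveUpperLevelSetSplit<T, blocksize>(
            upperMat, upperRowIndices, upperColIndices, reorderedToNatural,
            startIdx, *it, invDiag, v, threadBlockSize);
    }
}

Passing -1 for threadBlockSize follows the convention documented in the new headers ("leave as -1 if no blocksize is already chosen"), in which case the wrappers defer to getCudaRecomendedThreadBlockSize for the launch configuration.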