Add new MP scheme to GPU ILU and DILU

Tobias Meyer Andersen 2024-10-16 15:37:16 +02:00
parent 119282bd6d
commit d1e5a69476
11 changed files with 340 additions and 145 deletions

View File

@@ -254,6 +254,7 @@ if (HAVE_CUDA)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/DILUKernels.cu)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/ILU0Kernels.cu)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/JacKernels.cu)
+  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/kernel_enums.hpp)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg GpuVector.cpp)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg GpuView.cpp)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/vector_operations.cu)

View File

@@ -357,10 +357,10 @@ struct StandardPreconditioners {
     F::addCreator("GPUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
         const bool split_matrix = prm.get<bool>("split_matrix", true);
         const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
-        const bool store_factorization_as_float = prm.get<bool>("store_factorization_as_float", false);
+        const int mixed_precision_scheme = prm.get<int>("mixed_precision_scheme", 0);
         using field_type = typename V::field_type;
         using GpuDILU = typename gpuistl::GpuDILU<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
-        auto gpuDILU = std::make_shared<GpuDILU>(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float);
+        auto gpuDILU = std::make_shared<GpuDILU>(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme);
         auto adapted = std::make_shared<gpuistl::PreconditionerAdapter<V, V, GpuDILU>>(gpuDILU);
         auto wrapped = std::make_shared<gpuistl::GpuBlockPreconditioner<V, V, Comm>>(adapted, comm);
@@ -370,10 +370,10 @@ struct StandardPreconditioners {
     F::addCreator("OPMGPUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
         const bool split_matrix = prm.get<bool>("split_matrix", true);
         const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
-        const bool store_factorization_as_float = prm.get<bool>("store_factorization_as_float", false);
+        const int mixed_precision_scheme = prm.get<int>("mixed_precision_scheme", 0);
         using field_type = typename V::field_type;
         using OpmGpuILU0 = typename gpuistl::OpmGpuILU0<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
-        auto gpuilu0 = std::make_shared<OpmGpuILU0>(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float);
+        auto gpuilu0 = std::make_shared<OpmGpuILU0>(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme);
         auto adapted = std::make_shared<gpuistl::PreconditionerAdapter<V, V, OpmGpuILU0>>(gpuilu0);
         auto wrapped = std::make_shared<gpuistl::GpuBlockPreconditioner<V, V, Comm>>(adapted, comm);
@@ -650,27 +650,27 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
     F::addCreator("OPMGPUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
         const bool split_matrix = prm.get<bool>("split_matrix", true);
         const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
-        const bool store_factorization_as_float = prm.get<bool>("store_factorization_as_float", false);
+        const int mixed_precision_scheme = prm.get<int>("mixed_precision_scheme", 0);
         using field_type = typename V::field_type;
         using GPUILU0 = typename gpuistl::OpmGpuILU0<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
-        return std::make_shared<gpuistl::PreconditionerAdapter<V, V, GPUILU0>>(std::make_shared<GPUILU0>(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float));
+        return std::make_shared<gpuistl::PreconditionerAdapter<V, V, GPUILU0>>(std::make_shared<GPUILU0>(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme));
     });
     F::addCreator("GPUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
         const bool split_matrix = prm.get<bool>("split_matrix", true);
         const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
-        const bool store_factorization_as_float = prm.get<bool>("store_factorization_as_float", false);
+        const int mixed_precision_scheme = prm.get<int>("mixed_precision_scheme", 0);
         using field_type = typename V::field_type;
         using GPUDILU = typename gpuistl::GpuDILU<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
-        return std::make_shared<gpuistl::PreconditionerAdapter<V, V, GPUDILU>>(std::make_shared<GPUDILU>(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float));
+        return std::make_shared<gpuistl::PreconditionerAdapter<V, V, GPUDILU>>(std::make_shared<GPUDILU>(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme));
     });
     F::addCreator("GPUDILUFloat", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
         const bool split_matrix = prm.get<bool>("split_matrix", true);
         const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
-        const bool store_factorization_as_float = prm.get<bool>("store_factorization_as_float", false);
+        const int mixed_precision_scheme = prm.get<int>("mixed_precision_scheme", 0);
         using block_type = typename V::block_type;
         using VTo = Dune::BlockVector<Dune::FieldVector<float, block_type::dimension>>;
@@ -679,7 +679,7 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
         using Adapter = typename gpuistl::PreconditionerAdapter<VTo, VTo, GpuDILU>;
         using Converter = typename gpuistl::PreconditionerConvertFieldTypeAdapter<Adapter, M, V, V>;
         auto converted = std::make_shared<Converter>(op.getmat());
-        auto adapted = std::make_shared<Adapter>(std::make_shared<GpuDILU>(converted->getConvertedMatrix(), split_matrix, tune_gpu_kernels, store_factorization_as_float));
+        auto adapted = std::make_shared<Adapter>(std::make_shared<GpuDILU>(converted->getConvertedMatrix(), split_matrix, tune_gpu_kernels, mixed_precision_scheme));
         converted->setUnderlyingPreconditioner(adapted);
         return converted;
     });
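For orientation, a minimal construction sketch (not part of this commit): the integer read from the property tree above is forwarded straight to the preconditioner constructors, where 0 keeps the factorization entirely in double, 1 stores diagonal and off-diagonal blocks as float, and 2 is the new scheme with a double diagonal and float off-diagonals. The matrix type and instance below are illustrative assumptions (a 3x3-blocked Dune::BCRSMatrix named A), not taken from the commit.

    #include <dune/common/fmatrix.hh>
    #include <dune/istl/bcrsmatrix.hh>
    #include <opm/simulators/linalg/gpuistl/GpuDILU.hpp>
    #include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
    #include <memory>

    using Block = Dune::FieldMatrix<double, 3, 3>;
    using M = Dune::BCRSMatrix<Block>;              // assumed CPU matrix type
    using V = Opm::gpuistl::GpuVector<double>;
    using GpuDILU = Opm::gpuistl::GpuDILU<M, V, V>;

    std::shared_ptr<GpuDILU> makePrec(const M& A)
    {
        return std::make_shared<GpuDILU>(A,
                                         /*splitMatrix=*/true,
                                         /*tuneKernels=*/true,
                                         /*mixedPrecisionScheme=*/2); // DOUBLE_DIAG_FLOAT_OFFDIAG
    }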

View File

@@ -41,7 +41,7 @@ namespace Opm::gpuistl
 {
 template <class M, class X, class Y, int l>
-GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat)
+GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme)
     : m_cpuMatrix(A)
     , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER))
     , m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets))
@@ -52,8 +52,7 @@ GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, boo
     , m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())
     , m_splitMatrix(splitMatrix)
     , m_tuneThreadBlockSizes(tuneKernels)
-    , m_storeFactorizationAsFloat(storeFactorizationAsFloat)
+    , m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme))
 {
     // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode?
     // Some sanity check
@@ -82,14 +81,16 @@ GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, boo
             m_cpuMatrix, m_reorderedToNatural);
     }
-    if (m_storeFactorizationAsFloat) {
+    if (m_mixedPrecisionScheme != MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG) {
         if (!m_splitMatrix){
             OPM_THROW(std::runtime_error, "Matrix must be split when storing as float.");
         }
         m_gpuMatrixReorderedLowerFloat = std::make_unique<FloatMat>(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_);
         m_gpuMatrixReorderedUpperFloat = std::make_unique<FloatMat>(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_);
-        m_gpuMatrixReorderedDiagFloat = std::make_unique<FloatVec>(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize());
-        m_gpuDInvFloat = std::make_unique<FloatVec>(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize());
+        if (m_mixedPrecisionScheme == MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG) {
+            m_gpuDInvFloat = std::make_unique<FloatVec>(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize());
+        }
     }
     computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
@@ -123,8 +124,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
     for (int level = 0; level < m_levelSets.size(); ++level) {
         const int numOfRowsInLevel = m_levelSets[level].size();
         if (m_splitMatrix) {
-            if (m_storeFactorizationAsFloat) {
-                detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float>(
+            if (m_mixedPrecisionScheme == MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG) {
+                detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float, float>(
                     m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
                     m_gpuMatrixReorderedLowerFloat->getRowIndices().data(),
                     m_gpuMatrixReorderedLowerFloat->getColumnIndices().data(),
@@ -135,8 +136,20 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                    d.data(),
                    v.data(),
                    lowerSolveThreadBlockSize);
+            } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
+                detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float, field_type>(
+                    m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedLowerFloat->getRowIndices().data(),
+                    m_gpuMatrixReorderedLowerFloat->getColumnIndices().data(),
+                    m_gpuReorderToNatural.data(),
+                    levelStartIdx,
+                    numOfRowsInLevel,
+                    m_gpuDInv.data(),
+                    d.data(),
+                    v.data(),
+                    lowerSolveThreadBlockSize);
             } else {
-                detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, field_type>(
+                detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, field_type, field_type>(
                     m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                     m_gpuMatrixReorderedLower->getRowIndices().data(),
                     m_gpuMatrixReorderedLower->getColumnIndices().data(),
@@ -170,7 +183,7 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
         const int numOfRowsInLevel = m_levelSets[level].size();
         levelStartIdx -= numOfRowsInLevel;
         if (m_splitMatrix) {
-            if (m_storeFactorizationAsFloat){
+            if (m_mixedPrecisionScheme == MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG){
                 detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, float>(
                     m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
                     m_gpuMatrixReorderedUpperFloat->getRowIndices().data(),
@@ -181,6 +194,17 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                    m_gpuDInvFloat->data(),
                    v.data(),
                    upperSolveThreadBlockSize);
+            } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG){
+                detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, float>(
+                    m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedUpperFloat->getRowIndices().data(),
+                    m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(),
+                    m_gpuReorderToNatural.data(),
+                    levelStartIdx,
+                    numOfRowsInLevel,
+                    m_gpuDInv.data(),
+                    v.data(),
+                    upperSolveThreadBlockSize);
             } else {
                 detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, field_type>(
                     m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
@@ -271,8 +295,8 @@ GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in
     for (int level = 0; level < m_levelSets.size(); ++level) {
         const int numOfRowsInLevel = m_levelSets[level].size();
         if (m_splitMatrix) {
-            if (m_storeFactorizationAsFloat) {
-                detail::DILU::computeDiluDiagonalSplit<blocksize_, field_type, float, true>(
+            if (m_mixedPrecisionScheme == MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG) {
+                detail::DILU::computeDiluDiagonalSplit<blocksize_, field_type, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>(
                     m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                     m_gpuMatrixReorderedLower->getRowIndices().data(),
                     m_gpuMatrixReorderedLower->getColumnIndices().data(),
@@ -289,9 +313,27 @@ GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in
                    m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
                    m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
                    factorizationBlockSize);
+            } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
+                detail::DILU::computeDiluDiagonalSplit<blocksize_, field_type, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>(
+                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedLower->getRowIndices().data(),
+                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
+                    m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedUpper->getRowIndices().data(),
+                    m_gpuMatrixReorderedUpper->getColumnIndices().data(),
+                    m_gpuMatrixReorderedDiag->data(),
+                    m_gpuReorderToNatural.data(),
+                    m_gpuNaturalToReorder.data(),
+                    levelStartIdx,
+                    numOfRowsInLevel,
+                    m_gpuDInv.data(),
+                    nullptr,
+                    m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
+                    factorizationBlockSize);
             } else {
                 // TODO: should this be field type twice or field type then float in the template?
-                detail::DILU::computeDiluDiagonalSplit<blocksize_, field_type, float, false>(
+                detail::DILU::computeDiluDiagonalSplit<blocksize_, field_type, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>(
                     m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                     m_gpuMatrixReorderedLower->getRowIndices().data(),
                     m_gpuMatrixReorderedLower->getColumnIndices().data(),

View File

@@ -23,6 +23,7 @@
 #include <opm/grid/utility/SparseTable.hpp>
 #include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
 #include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp>
 #include <vector>
@@ -62,7 +63,7 @@ public:
     //! \param A The matrix to operate on.
     //! \param w The relaxation factor.
     //!
-    explicit GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat);
+    explicit GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme);
     //! \brief Prepare the preconditioner.
     //! \note Does nothing at the time being.
@@ -144,8 +145,8 @@ private:
     bool m_splitMatrix;
     //! \brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards
     bool m_tuneThreadBlockSizes;
-    //! \brief Bool storing whether or not we will store the factorization as float. Only used for mixed precision
-    bool m_storeFactorizationAsFloat;
+    //! \brief Enum describing how we should store the factorized matrix
+    const MatrixStorageMPScheme m_mixedPrecisionScheme;
     //! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards
     //! The default value of -1 indicates that we have not calibrated and selected a value yet
     int m_upperSolveThreadBlockSize = -1;

View File

@@ -41,7 +41,7 @@ namespace Opm::gpuistl
 {
 template <class M, class X, class Y, int l>
-OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat)
+OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme)
     : m_cpuMatrix(A)
     , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER))
     , m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets))
@@ -56,8 +56,9 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
     , m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())
     , m_splitMatrix(splitMatrix)
     , m_tuneThreadBlockSizes(tuneKernels)
-    , m_storeFactorizationAsFloat(storeFactorizationAsFloat)
+    , m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme))
 {
     // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode?
     // Some sanity check
     OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
@@ -85,13 +86,16 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
             m_cpuMatrix, m_reorderedToNatural);
     }
-    if (m_storeFactorizationAsFloat){
+    if (m_mixedPrecisionScheme != MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG){
         OPM_ERROR_IF(!m_splitMatrix, "Mixed precision GpuILU0 is currently only supported when using split_matrix=true");
         // initialize mixed precision datastructures
         m_gpuMatrixReorderedLowerFloat = std::make_unique<FloatMat>(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_);
         m_gpuMatrixReorderedUpperFloat = std::make_unique<FloatMat>(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_);
-        m_gpuMatrixReorderedDiagFloat.emplace(GpuVector<float>(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()));
+        // The MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG does not need to allocate this float vector
+        if (m_mixedPrecisionScheme == MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG) {
+            m_gpuMatrixReorderedDiagFloat.emplace(GpuVector<float>(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()));
+        }
     }
     LUFactorizeAndMoveData(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
@@ -128,7 +132,7 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
     for (int level = 0; level < m_levelSets.size(); ++level) {
         const int numOfRowsInLevel = m_levelSets[level].size();
         if (m_splitMatrix) {
-            if (m_storeFactorizationAsFloat){
+            if (m_mixedPrecisionScheme != MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG) {
                 detail::ILU0::solveLowerLevelSetSplit<blocksize_, field_type, float>(
                     m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
                     m_gpuMatrixReorderedLowerFloat->getRowIndices().data(),
@@ -171,8 +175,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
         const int numOfRowsInLevel = m_levelSets[level].size();
         levelStartIdx -= numOfRowsInLevel;
         if (m_splitMatrix) {
-            if (m_storeFactorizationAsFloat) {
-                detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, float>(
+            if (m_mixedPrecisionScheme == MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG) {
+                detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, float, float>(
                     m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
                     m_gpuMatrixReorderedUpperFloat->getRowIndices().data(),
                     m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(),
@@ -183,8 +187,20 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                    v.data(),
                    upperSolveThreadBlockSize);
             }
+            else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
+                detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, float, field_type>(
+                    m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedUpperFloat->getRowIndices().data(),
+                    m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(),
+                    m_gpuReorderToNatural.data(),
+                    levelStartIdx,
+                    numOfRowsInLevel,
+                    m_gpuMatrixReorderedDiag.value().data(),
+                    v.data(),
+                    upperSolveThreadBlockSize);
+            }
             else{
-                detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, field_type>(
+                detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, field_type, field_type>(
                     m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
                     m_gpuMatrixReorderedUpper->getRowIndices().data(),
                     m_gpuMatrixReorderedUpper->getColumnIndices().data(),
@@ -272,8 +288,8 @@ OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
         const int numOfRowsInLevel = m_levelSets[level].size();
         if (m_splitMatrix) {
-            if (m_storeFactorizationAsFloat){
-                detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, true>(
+            if (m_mixedPrecisionScheme == MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG){
+                detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>(
                     m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                     m_gpuMatrixReorderedLower->getRowIndices().data(),
                     m_gpuMatrixReorderedLower->getColumnIndices().data(),
@@ -290,8 +306,26 @@ OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
                    numOfRowsInLevel,
                    factorizationThreadBlockSize);
             }
+            else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG){
+                detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>(
+                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedLower->getRowIndices().data(),
+                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
+                    m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedUpper->getRowIndices().data(),
+                    m_gpuMatrixReorderedUpper->getColumnIndices().data(),
+                    m_gpuMatrixReorderedDiag.value().data(),
+                    m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
+                    nullptr,
+                    m_gpuReorderToNatural.data(),
+                    m_gpuNaturalToReorder.data(),
+                    levelStartIdx,
+                    numOfRowsInLevel,
+                    factorizationThreadBlockSize);
+            }
             else{
-                detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, false>(
+                detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>(
                     m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                     m_gpuMatrixReorderedLower->getRowIndices().data(),
                     m_gpuMatrixReorderedLower->getColumnIndices().data(),

View File

@@ -24,6 +24,7 @@
 #include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
 #include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
 #include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp>
 #include <optional>
 #include <type_traits>
 #include <vector>
@@ -64,7 +65,7 @@ public:
     //! \param A The matrix to operate on.
     //! \param w The relaxation factor.
     //!
-    explicit OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat);
+    explicit OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme);
     //! \brief Prepare the preconditioner.
     //! \note Does nothing at the time being.
@@ -143,9 +144,8 @@ private:
     bool m_splitMatrix;
     //! \brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards
     bool m_tuneThreadBlockSizes;
-    //! \brief Bool storing whether or not we should store the ILU factorization in a float datastructure.
-    //! This uses a mixed precision preconditioner to trade numerical accuracy for memory transfer speed.
-    bool m_storeFactorizationAsFloat;
+    //! \brief Enum storing how we should store the factorized matrix
+    const MatrixStorageMPScheme m_mixedPrecisionScheme;
     //! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards
     //! The default value of -1 indicates that we have not calibrated and selected a value yet
     int m_upperSolveThreadBlockSize = -1;

View File

@@ -0,0 +1,109 @@
+/*
+  Copyright 2024 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM. If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef OPM_GPUISTL_KERNEL_ENUMS_HPP
+#define OPM_GPUISTL_KERNEL_ENUMS_HPP
+
+#include <cuda_runtime.h>
+
+/*
+    This file organizes a growing amount of different mixed precision options for the preconditioners.
+*/
+namespace Opm::gpuistl {
+    // Mixed precision schemes used for storing the matrix in GPU memory
+    enum class MatrixStorageMPScheme {
+        DOUBLE_DIAG_DOUBLE_OFFDIAG = 0, // full precision should be default
+        FLOAT_DIAG_FLOAT_OFFDIAG = 1,
+        DOUBLE_DIAG_FLOAT_OFFDIAG = 2
+    };
+
+    namespace detail {
+        bool isValidMatrixStorageMPScheme(int scheme);
+    }
+
+    inline MatrixStorageMPScheme makeMatrixStorageMPScheme(int scheme) {
+        if (!detail::isValidMatrixStorageMPScheme(scheme)) {
+            OPM_THROW(std::invalid_argument,
+                      fmt::format("Invalid matrix storage mixed precision scheme chosen: {}.\n"
+                                  "Valid Schemes:\n"
+                                  "\t0: DOUBLE_DIAG_DOUBLE_OFFDIAG\n"
+                                  "\t1: FLOAT_DIAG_FLOAT_OFFDIAG\n"
+                                  "\t2: DOUBLE_DIAG_FLOAT_OFFDIAG",
+                                  scheme));
+        }
+        return static_cast<MatrixStorageMPScheme>(scheme);
+    }
+
+    namespace detail {
+
+        __host__ __device__ constexpr bool storeDiagonalAsFloat(MatrixStorageMPScheme scheme) {
+            switch (scheme) {
+            case MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG:
+                return false;
+            case MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG:
+                return true;
+            case MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG:
+                return false;
+            default:
+                return false;
+            }
+        }
+
+        __host__ __device__ constexpr bool storeOffDiagonalAsFloat(MatrixStorageMPScheme scheme) {
+            switch (scheme) {
+            case MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG:
+                return false;
+            case MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG:
+                return true;
+            case MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG:
+                return true;
+            default:
+                return false;
+            }
+        }
+
+        // returns true if we use anything else that the the default double precision for everything
+        __host__ __device__ constexpr bool usingMixedPrecision(MatrixStorageMPScheme scheme) {
+            switch (scheme) {
+            case MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG:
+                return false;
+            case MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG:
+                return true;
+            case MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG:
+                return true;
+            default:
+                return false;
+            }
+        }
+
+        inline bool isValidMatrixStorageMPScheme(int scheme) {
+            switch (static_cast<MatrixStorageMPScheme>(scheme)) {
+            case MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG:
+            case MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG:
+            case MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG:
+                return true;
+            default:
+                return false;
+            }
+        }
+    }
+}
+
+#endif // OPM_GPUISTL_KERNEL_ENUMS_HPP
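A small host-side usage sketch (not part of the commit) of the helpers above, assuming the header is available on the include path together with its CUDA, OPM_THROW and fmt dependencies; it only shows how an integer scheme id from the configuration maps onto the storage decisions:

    #include <opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp>
    #include <iostream>

    int main()
    {
        using namespace Opm::gpuistl;
        // 2 selects the scheme added in this commit: double diagonal, float off-diagonals.
        const auto scheme = makeMatrixStorageMPScheme(2); // throws std::invalid_argument outside 0..2
        std::cout << std::boolalpha
                  << "store diagonal as float:     " << detail::storeDiagonalAsFloat(scheme) << '\n'    // false
                  << "store off-diagonal as float: " << detail::storeOffDiagonalAsFloat(scheme) << '\n' // true
                  << "mixed precision in use:      " << detail::usingMixedPrecision(scheme) << '\n';    // true
    }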

View File

@@ -59,14 +59,14 @@ namespace
         }
     }

-    template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+    template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
     __global__ void cuSolveLowerLevelSetSplit(MatrixScalar* mat,
                                               int* rowIndices,
                                               int* colIndices,
                                               int* indexConversion,
                                               int startIdx,
                                               int rowsInLevelSet,
-                                              const MatrixScalar* dInv,
+                                              const DiagonalScalar* dInv,
                                               const LinearSolverScalar* d,
                                               LinearSolverScalar* v)
     {
@@ -88,7 +88,7 @@ namespace
                 mmvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
             }
-            mvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
+            mvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
         }
     }
@@ -118,14 +118,14 @@ namespace
         }
     }

-    template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+    template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
     __global__ void cuSolveUpperLevelSetSplit(MatrixScalar* mat,
                                               int* rowIndices,
                                               int* colIndices,
                                               int* indexConversion,
                                               int startIdx,
                                               int rowsInLevelSet,
-                                              const MatrixScalar* dInv,
+                                              const DiagonalScalar* dInv,
                                               LinearSolverScalar* v)
     {
         const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
@@ -140,7 +140,7 @@ namespace
                 umvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
             }
-            mmvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
+            mmvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
         }
     }
@@ -213,7 +213,7 @@ namespace
     // TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate results
     // TOOD: The important part is to only cast after that is fully computed
-    template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix>
+    template <int blocksize, class InputScalar, class OutputScalar, MatrixStorageMPScheme mixedPrecisionScheme>
     __global__ void cuComputeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
                                                int* lowerRowIndices,
                                                int* lowerColIndices,
@@ -255,7 +255,7 @@ namespace
                 const int symOppositeBlock = symOppositeIdx;
-                if constexpr (copyResultToOtherMatrix) {
+                if constexpr (detail::storeOffDiagonalAsFloat(mixedPrecisionScheme)) {
                     // TODO: think long and hard about whether this performs only the wanted memory transfers
                     moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedLowerMat[block * blocksize * blocksize], &dstLowerMat[block * blocksize * blocksize]);
                     moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize], &dstUpperMat[symOppositeBlock * blocksize * blocksize]);
@@ -268,14 +268,9 @@ namespace
             }
             invBlockInPlace<InputScalar, blocksize>(dInvTmp);
-            // for (int i = 0; i < blocksize; ++i) {
-            //     for (int j = 0; j < blocksize; ++j) {
-            //         dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j];
-            //     }
-            // }
             moveBlock<blocksize, InputScalar, InputScalar>(dInvTmp, &dInv[reorderedRowIdx * blocksize * blocksize]);
-            if constexpr (copyResultToOtherMatrix) {
+            if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)) {
                 moveBlock<blocksize, InputScalar, OutputScalar>(dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important!
             }
         }
@@ -304,7 +299,7 @@ solveLowerLevelSet(T* reorderedMat,
 }

-template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
 void
 solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
                         int* rowIndices,
@@ -312,15 +307,15 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
                         int* indexConversion,
                         int startIdx,
                         int rowsInLevelSet,
-                        const MatrixScalar* dInv,
+                        const DiagonalScalar* dInv,
                         const LinearSolverScalar* d,
                         LinearSolverScalar* v,
                         int thrBlockSize)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
-        cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>, thrBlockSize);
+        cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar><<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
         reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
 }
 // perform the upper solve for all rows in the same level set
@@ -343,7 +338,7 @@ solveUpperLevelSet(T* reorderedMat,
         reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }

-template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
 void
 solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                         int* rowIndices,
@@ -351,14 +346,14 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                         int* indexConversion,
                         int startIdx,
                         int rowsInLevelSet,
-                        const MatrixScalar* dInv,
+                        const DiagonalScalar* dInv,
                         LinearSolverScalar* v,
                         int thrBlockSize)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
-        cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>, thrBlockSize);
+        cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar><<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
         reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }
@@ -391,7 +386,7 @@ computeDiluDiagonal(T* reorderedMat,
     }
 }

-template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix>
+template <int blocksize, class InputScalar, class OutputScalar, MatrixStorageMPScheme scheme>
 void
 computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
                          int* lowerRowIndices,
                          int* lowerColIndices,
@@ -412,9 +407,9 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
 {
     if (blocksize <= 3) {
         int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
-            cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, copyResultToOtherMatrix>, thrBlockSize);
+            cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, scheme>, thrBlockSize);
         int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-        cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, copyResultToOtherMatrix><<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
+        cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, scheme><<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
                                                                                                                      lowerRowIndices,
                                                                                                                      lowerColIndices,
                                                                                                                      srcReorderedUpperMat,
@@ -437,13 +432,17 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
 // TODO: format
 #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
     template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*, int); \
-    template void computeDiluDiagonalSplit<blocksize, T, double, false>( \
+    template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
-    template void computeDiluDiagonalSplit<blocksize, T, float, false>( \
+    template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
-    template void computeDiluDiagonalSplit<blocksize, T, float, true>( \
+    template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
       const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
-    template void computeDiluDiagonalSplit<blocksize, T, double, true>( \
+    template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
       const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
+    template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
+      const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
+    template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
       const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
     template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
     template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);
@@ -461,29 +460,26 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 4);
 INSTANTIATE_KERNEL_WRAPPERS(double, 5);
 INSTANTIATE_KERNEL_WRAPPERS(double, 6);

-#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar) \
-    template void solveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>( \
-        MatrixScalar*, int*, int*, int*, int, int, const MatrixScalar*, LinearSolverScalar*, int); \
-    template void solveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>( \
-        MatrixScalar*, int*, int*, int*, int, int, const MatrixScalar*, const LinearSolverScalar*, LinearSolverScalar*, int);
+#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar) \
+    template void solveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>( \
+        MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int); \
+    template void solveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>( \
+        MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, const LinearSolverScalar*, LinearSolverScalar*, int);

-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, float);
+// TODO: be smarter about this... Surely this instantiates many more combinations that are actually needed
+#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, float); \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, float); \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, float); \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, double); \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, double); \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, double);
+
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(1);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(2);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(3);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(4);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(5);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(6);

 } // namespace Opm::gpuistl::detail::DILU
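To make the new instantiation macros concrete, a sketch of the preprocessor expansion for one combination and block size; the (double, float, double) scalar triple is the one exercised by the new DOUBLE_DIAG_FLOAT_OFFDIAG scheme (double solver vectors, float off-diagonal blocks, double diagonal):

    // INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, float, double) expands to
    template void solveUpperLevelSetSplit<3, double, float, double>(
        float*, int*, int*, int*, int, int, const double*, double*, int);
    template void solveLowerLevelSetSplit<3, double, float, double>(
        float*, int*, int*, int*, int, int, const double*, const double*, double*, int);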

View File

@@ -22,6 +22,7 @@
 #include <cstddef>
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp>
 #include <vector>

 namespace Opm::gpuistl::detail::DILU
@@ -71,14 +72,14 @@ void solveLowerLevelSet(T* reorderedMat,
  * @param d Stores the defect
  * @param [out] v Will store the results of the lower solve
  */
-template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
 void solveLowerLevelSetSplit(MatrixScalar* reorderedUpperMat,
                              int* rowIndices,
                              int* colIndices,
                              int* indexConversion,
                              int startIdx,
                              int rowsInLevelSet,
-                             const MatrixScalar* dInv,
+                             const DiagonalScalar* dInv,
                              const LinearSolverScalar* d,
                              LinearSolverScalar* v,
                              int threadBlockSize);
@@ -124,14 +125,14 @@ void solveUpperLevelSet(T* reorderedMat,
  * @param [out] v Will store the results of the lower solve. To begin with it should store the output from the lower
  * solve
  */
-template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
 void solveUpperLevelSetSplit(MatrixScalar* reorderedUpperMat,
                              int* rowIndices,
                              int* colIndices,
                              int* indexConversion,
                              int startIdx,
                              int rowsInLevelSet,
-                             const MatrixScalar* dInv,
+                             const DiagonalScalar* dInv,
                              LinearSolverScalar* v,
                              int threadBlockSize);
@@ -183,7 +184,7 @@ void computeDiluDiagonal(T* reorderedMat,
  * function
  * @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner
  */
-template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix>
+template <int blocksize, class InputScalar, class OutputScalar, MatrixStorageMPScheme>
 void computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
                               int* lowerRowIndices,
                               int* lowerColIndices,

View File

@ -120,7 +120,7 @@ namespace
} }
} }
template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix> template <int blocksize, class InputScalar, class OutputScalar, MatrixStorageMPScheme mixedPrecisionScheme>
__global__ void cuLUFactorizationSplit(InputScalar* srcReorderedLowerMat, __global__ void cuLUFactorizationSplit(InputScalar* srcReorderedLowerMat,
int* lowerRowIndices, int* lowerRowIndices,
int* lowerColIndices, int* lowerColIndices,
@ -161,7 +161,7 @@ namespace
mmOverlap<InputScalar, blocksize>(&srcReorderedLowerMat[ij * scalarsInBlock], mmOverlap<InputScalar, blocksize>(&srcReorderedLowerMat[ij * scalarsInBlock],
&srcDiagonal[j * scalarsInBlock], &srcDiagonal[j * scalarsInBlock],
&srcReorderedLowerMat[ij * scalarsInBlock]); &srcReorderedLowerMat[ij * scalarsInBlock]);
if (copyResultToOtherMatrix) { if constexpr (detail::storeOffDiagonalAsFloat(mixedPrecisionScheme)) {
moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedLowerMat[ij * scalarsInBlock], moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedLowerMat[ij * scalarsInBlock],
&dstReorderedLowerMat[ij * scalarsInBlock]); &dstReorderedLowerMat[ij * scalarsInBlock]);
} }
@ -180,22 +180,15 @@ namespace
while (!(ik == endOfRowIUpper && ikState == POSITION_TYPE::ABOVE_DIAG) && jk != endOfRowJ) { while (!(ik == endOfRowIUpper && ikState == POSITION_TYPE::ABOVE_DIAG) && jk != endOfRowJ) {
InputScalar* ikBlockPtr; InputScalar* ikBlockPtr;
OutputScalar* dstIkBlockPtr;
int ikColumn; int ikColumn;
if (ikState == POSITION_TYPE::UNDER_DIAG) { if (ikState == POSITION_TYPE::UNDER_DIAG) {
ikBlockPtr = &srcReorderedLowerMat[ik * scalarsInBlock]; ikBlockPtr = &srcReorderedLowerMat[ik * scalarsInBlock];
if (copyResultToOtherMatrix)
dstIkBlockPtr = &dstReorderedLowerMat[ik * scalarsInBlock];
ikColumn = lowerColIndices[ik]; ikColumn = lowerColIndices[ik];
} else if (ikState == POSITION_TYPE::ON_DIAG) { } else if (ikState == POSITION_TYPE::ON_DIAG) {
ikBlockPtr = &srcDiagonal[reorderedIdx * scalarsInBlock]; ikBlockPtr = &srcDiagonal[reorderedIdx * scalarsInBlock];
if (copyResultToOtherMatrix)
dstIkBlockPtr = &dstDiagonal[reorderedIdx * scalarsInBlock];
ikColumn = naturalIdx; ikColumn = naturalIdx;
} else { // ikState == POSITION_TYPE::ABOVE_DIAG } else { // ikState == POSITION_TYPE::ABOVE_DIAG
ikBlockPtr = &srcReorderedUpperMat[ik * scalarsInBlock]; ikBlockPtr = &srcReorderedUpperMat[ik * scalarsInBlock];
if (copyResultToOtherMatrix)
dstIkBlockPtr = &dstReorderedUpperMat[ik * scalarsInBlock];
ikColumn = upperColIndices[ik]; ikColumn = upperColIndices[ik];
} }
@@ -219,10 +212,15 @@ namespace
} }
} }
invBlockInPlace<InputScalar, blocksize>(&srcDiagonal[reorderedIdx * scalarsInBlock]); invBlockInPlace<InputScalar, blocksize>(&srcDiagonal[reorderedIdx * scalarsInBlock]);
if (copyResultToOtherMatrix) { // as of now, if we are using mixed precision, then we are always storing the off-diagonals as floats,
moveBlock<blocksize, InputScalar, OutputScalar>(&srcDiagonal[reorderedIdx * scalarsInBlock], // and sometimes also the diagonal.
&dstDiagonal[reorderedIdx * scalarsInBlock]); if constexpr (detail::usingMixedPrecision(mixedPrecisionScheme)) {
// if we want to store the entire matrix as a float, then we must also move the diagonal block from double to float
// if not, then we just use the double diagonal that is already stored in srcDiagonal
if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)) {
moveBlock<blocksize, InputScalar, OutputScalar>(&srcDiagonal[reorderedIdx * scalarsInBlock],
&dstDiagonal[reorderedIdx * scalarsInBlock]);
}
// also move all values above the diagonal on this row // also move all values above the diagonal on this row
for (int block = startOfRowIUpper; block < endOfRowIUpper; ++block) { for (int block = startOfRowIUpper; block < endOfRowIUpper; ++block) {
moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedUpperMat[block * scalarsInBlock], moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedUpperMat[block * scalarsInBlock],
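The branches above are driven by three small predicates on the storage scheme, which come from the new kernel_enums.hpp (its contents are not shown in this diff). The sketch below is inferred from how the factorization kernel and the instantiations further down use them; the exact names, values and signatures should be treated as assumptions.

// Sketch of the scheme enum and its predicates, as used above (assumed, not verbatim).
enum class MatrixStorageMPScheme {
    DOUBLE_DIAG_DOUBLE_OFFDIAG = 0, // no mixed precision
    FLOAT_DIAG_FLOAT_OFFDIAG   = 1, // whole factorization stored as float
    DOUBLE_DIAG_FLOAT_OFFDIAG  = 2  // float off-diagonals, double diagonal
};

constexpr bool storeOffDiagonalAsFloat(MatrixStorageMPScheme s)
{
    return s == MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG
        || s == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG;
}

constexpr bool storeDiagonalAsFloat(MatrixStorageMPScheme s)
{
    return s == MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG;
}

constexpr bool usingMixedPrecision(MatrixStorageMPScheme s)
{
    return s != MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG;
}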
@@ -321,14 +319,14 @@ namespace
} }
} }
template <int blocksize, class LinearSolverScalar, class MatrixScalar> template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
__global__ void cuSolveUpperLevelSetSplit(MatrixScalar* mat, __global__ void cuSolveUpperLevelSetSplit(MatrixScalar* mat,
int* rowIndices, int* rowIndices,
int* colIndices, int* colIndices,
int* indexConversion, int* indexConversion,
int startIdx, int startIdx,
int rowsInLevelSet, int rowsInLevelSet,
const MatrixScalar* dInv, const DiagonalScalar* dInv,
LinearSolverScalar* v) LinearSolverScalar* v)
{ {
auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
@@ -348,7 +346,7 @@ namespace
&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); &mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
} }
mvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>( mvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
&dInv[reorderedIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); &dInv[reorderedIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
} }
} }
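The per-row update above ends with a block multiply in which the diagonal block, the off-diagonal blocks and the vectors may all have different scalar types. The helper used for this (mvMixedGeneral) is defined elsewhere in gpuistl; the standalone sketch below only illustrates the pattern the scheme relies on, namely reduced storage precision for the blocks while the accumulation happens in the compute type (the LinearSolverScalar in the call above). Names are illustrative, not from this commit.

// Sketch: multiply a blocksize x blocksize block (possibly stored as float)
// with a vector segment, accumulating in ComputeScalar.
template <int blocksize, class MatrixScalar, class VectorScalar, class ResultScalar, class ComputeScalar>
void mvMixedSketch(const MatrixScalar* block, const VectorScalar* x, ResultScalar* y)
{
    for (int row = 0; row < blocksize; ++row) {
        ComputeScalar acc = 0;
        for (int col = 0; col < blocksize; ++col) {
            acc += static_cast<ComputeScalar>(block[row * blocksize + col])
                 * static_cast<ComputeScalar>(x[col]);
        }
        y[row] = static_cast<ResultScalar>(acc);
    }
}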
@@ -410,7 +408,7 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v); reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
} }
// perform the upper solve for all rows in the same level set // perform the upper solve for all rows in the same level set
template <int blocksize, class LinearSolverScalar, class MatrixScalar> template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
void void
solveUpperLevelSetSplit(MatrixScalar* reorderedMat, solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
int* rowIndices, int* rowIndices,
@@ -418,14 +416,14 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
int* indexConversion, int* indexConversion,
int startIdx, int startIdx,
int rowsInLevelSet, int rowsInLevelSet,
const MatrixScalar* dInv, const DiagonalScalar* dInv,
LinearSolverScalar* v, LinearSolverScalar* v,
int thrBlockSize) int thrBlockSize)
{ {
int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>, thrBlockSize); cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar><<<nThreadBlocks, threadBlockSize>>>( cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
} }
@@ -447,7 +445,7 @@ LUFactorization(T* srcMatrix,
srcMatrix, srcRowIndices, srcColumnIndices, naturalToReordered, reorderedToNatual, rowsInLevelSet, startIdx); srcMatrix, srcRowIndices, srcColumnIndices, naturalToReordered, reorderedToNatual, rowsInLevelSet, startIdx);
} }
template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix> template <int blocksize, class InputScalar, class OutputScalar, MatrixStorageMPScheme mixedPrecisionScheme>
void void
LUFactorizationSplit(InputScalar* srcReorderedLowerMat, LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
int* lowerRowIndices, int* lowerRowIndices,
@@ -466,9 +464,9 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
int thrBlockSize) int thrBlockSize)
{ {
int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar, copyResultToOtherMatrix>, thrBlockSize); cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar, mixedPrecisionScheme>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar, copyResultToOtherMatrix> cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar, mixedPrecisionScheme>
<<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat, <<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
lowerRowIndices, lowerRowIndices,
lowerColIndices, lowerColIndices,
@@ -489,27 +487,29 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, T*, int); \ template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, T*, int); \
template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \ template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
template void LUFactorization<T, blocksize>(T*, int*, int*, int*, int*, size_t, int, int); \ template void LUFactorization<T, blocksize>(T*, int*, int*, int*, int*, size_t, int, int); \
template void LUFactorizationSplit<blocksize, T, float, true>( \ template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \ T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, double, true>( \ template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \ T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, float, false>( \ template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \ T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, double, false>( \ template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);
INSTANTIATE_KERNEL_WRAPPERS(float, 1); #define INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(T) \
INSTANTIATE_KERNEL_WRAPPERS(float, 2); INSTANTIATE_KERNEL_WRAPPERS(T, 1); \
INSTANTIATE_KERNEL_WRAPPERS(float, 3); INSTANTIATE_KERNEL_WRAPPERS(T, 2); \
INSTANTIATE_KERNEL_WRAPPERS(float, 4); INSTANTIATE_KERNEL_WRAPPERS(T, 3); \
INSTANTIATE_KERNEL_WRAPPERS(float, 5); INSTANTIATE_KERNEL_WRAPPERS(T, 4); \
INSTANTIATE_KERNEL_WRAPPERS(float, 6); INSTANTIATE_KERNEL_WRAPPERS(T, 5); \
INSTANTIATE_KERNEL_WRAPPERS(double, 1); INSTANTIATE_KERNEL_WRAPPERS(T, 6);
INSTANTIATE_KERNEL_WRAPPERS(double, 2);
INSTANTIATE_KERNEL_WRAPPERS(double, 3); INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(float)
INSTANTIATE_KERNEL_WRAPPERS(double, 4); INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(double)
INSTANTIATE_KERNEL_WRAPPERS(double, 5);
INSTANTIATE_KERNEL_WRAPPERS(double, 6);
#define INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(blocksize) \ #define INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(blocksize) \
/* double preconditioner */ \ /* double preconditioner */ \
@@ -523,15 +523,25 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 6);
float*, int*, int*, int*, int, int, const float*, float*, int); \ float*, int*, int*, int*, int, int, const float*, float*, int); \
\ \
/* double preconditioner */ \ /* double preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, double, double>( \ template void solveUpperLevelSetSplit<blocksize, double, double, double>( \
double*, int*, int*, int*, int, int, const double*, double*, int); \ double*, int*, int*, int*, int, int, const double*, double*, int); \
/* float matrix, double compute preconditioner */ \ /* float matrix, double compute preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, double, float>( \ template void solveUpperLevelSetSplit<blocksize, double, float, double>( \
float*, int*, int*, int*, int, int, const double*, double*, int); \
/* float preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, float, float, double>( \
float*, int*, int*, int*, int, int, const double*, float*, int); \
/* double preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, double, double, float>( \
double*, int*, int*, int*, int, int, const float*, double*, int); \
/* float matrix, double compute preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, double, float, float>( \
float*, int*, int*, int*, int, int, const float*, double*, int); \ float*, int*, int*, int*, int, int, const float*, double*, int); \
/* float preconditioner */ \ /* float preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, float, float>( \ template void solveUpperLevelSetSplit<blocksize, float, float, float>( \
float*, int*, int*, int*, int, int, const float*, float*, int); float*, int*, int*, int*, int, int, const float*, float*, int);
INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(1); INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(1);
INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(2); INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(2);
INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(3); INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(3);
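Each storage scheme picks one (MatrixScalar, DiagonalScalar) pair from the instantiations above, while the LinearSolverScalar follows the outer solver. A small, hypothetical compile-time mapping of scheme to storage types is sketched below; the names and structure are illustrative only and not part of this commit.

// Hypothetical mapping from storage scheme to the scalar types of the stored factorization.
enum class Scheme { DoubleDiagDoubleOffdiag = 0, FloatDiagFloatOffdiag = 1, DoubleDiagFloatOffdiag = 2 };

// For a double linear solver, the instantiations above correspond to:
//   DoubleDiagDoubleOffdiag : MatrixScalar = double, DiagonalScalar = double
//   FloatDiagFloatOffdiag   : MatrixScalar = float,  DiagonalScalar = float
//   DoubleDiagFloatOffdiag  : MatrixScalar = float,  DiagonalScalar = double
template <Scheme s>
struct StorageTypes { using Matrix = double; using Diagonal = double; };

template <>
struct StorageTypes<Scheme::FloatDiagFloatOffdiag> { using Matrix = float; using Diagonal = float; };

template <>
struct StorageTypes<Scheme::DoubleDiagFloatOffdiag> { using Matrix = float; using Diagonal = double; };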

View File

@@ -20,6 +20,7 @@
#define OPM_ILU0_KERNELS_HPP #define OPM_ILU0_KERNELS_HPP
#include <cstddef> #include <cstddef>
#include <vector> #include <vector>
#include <opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp>
namespace Opm::gpuistl::detail::ILU0 namespace Opm::gpuistl::detail::ILU0
{ {
@@ -89,14 +90,14 @@ void solveLowerLevelSet(T* reorderedMat,
* solve * solve
* @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen * @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
*/ */
template <int blocksize, class LinearSolverScalar, class MatrixScalar> template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
void solveUpperLevelSetSplit(MatrixScalar* reorderedMat, void solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
int* rowIndices, int* rowIndices,
int* colIndices, int* colIndices,
int* indexConversion, int* indexConversion,
int startIdx, int startIdx,
int rowsInLevelSet, int rowsInLevelSet,
const MatrixScalar* dInv, const DiagonalScalar* dInv,
LinearSolverScalar* v, LinearSolverScalar* v,
int threadBlockSize); int threadBlockSize);
@@ -176,7 +177,7 @@ void LUFactorization(T* reorderedMat,
* function * function
* @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen * @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
*/ */
template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix> template <int blocksize, class InputScalar, class OutputScalar, MatrixStorageMPScheme mixedPrecisionScheme>
void LUFactorizationSplit(InputScalar* srcReorderedLowerMat, void LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
int* lowerRowIndices, int* lowerRowIndices,
int* lowerColIndices, int* lowerColIndices,