Mirror of https://github.com/OPM/opm-simulators.git
Implement mixed precision GPU ILU0

commit 55c20dbddd
parent 9c74c4d638
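Summary of the change (derived from the hunks below): this commit adds an optional mixed precision mode to the GPU ILU0 preconditioner (OpmGpuILU0). When the new store_factorization_as_float option is enabled, the reordered lower and upper factors and the inverted diagonal are additionally stored as float, and the split triangular-solve kernels read the float factors while the right-hand side and solution vectors stay in the solver's scalar type. The factorization itself is still computed in the matrix scalar type and only demoted afterwards; per the member comment added below, the intent is to trade numerical accuracy for memory transfer speed. As the assert in the constructor shows, the float path is only supported together with split_matrix.

A minimal usage sketch, not part of the commit (typedefs and include paths are assumptions; only the four-argument constructor signature comes from this diff):

    // Sketch: build the mixed precision GPU ILU0 directly. The factory creators
    // in the first hunks read the same three options from the property tree.
    #include <memory>
    // assumed include paths:
    #include <dune/common/fmatrix.hh>
    #include <dune/istl/bcrsmatrix.hh>
    #include <opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp>
    #include <opm/simulators/linalg/gpuistl/GpuVector.hpp>

    using Mat = Dune::BCRSMatrix<Dune::FieldMatrix<double, 3, 3>>;
    using Vec = Opm::gpuistl::GpuVector<double>;

    std::shared_ptr<Opm::gpuistl::OpmGpuILU0<Mat, Vec, Vec>>
    makeMixedPrecisionIlu0(const Mat& A)
    {
        // storeFactorizationAsFloat requires splitMatrix == true
        return std::make_shared<Opm::gpuistl::OpmGpuILU0<Mat, Vec, Vec>>(
            A, /*splitMatrix=*/true, /*tuneKernels=*/true, /*storeFactorizationAsFloat=*/true);
    }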
@@ -361,9 +361,10 @@ struct StandardPreconditioners {
     F::addCreator("OPMGPUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
         const bool split_matrix = prm.get<bool>("split_matrix", true);
         const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
+        const bool store_factorization_as_float = prm.get<bool>("store_factorization_as_float", false);
         using field_type = typename V::field_type;
         using OpmGpuILU0 = typename gpuistl::OpmGpuILU0<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
-        auto gpuilu0 = std::make_shared<OpmGpuILU0>(op.getmat(), split_matrix, tune_gpu_kernels);
+        auto gpuilu0 = std::make_shared<OpmGpuILU0>(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float);

         auto adapted = std::make_shared<gpuistl::PreconditionerAdapter<V, V, OpmGpuILU0>>(gpuilu0);
         auto wrapped = std::make_shared<gpuistl::GpuBlockPreconditioner<V, V, Comm>>(adapted, comm);
@@ -619,10 +620,12 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
     F::addCreator("OPMGPUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
         const bool split_matrix = prm.get<bool>("split_matrix", true);
         const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
+        const bool store_factorization_as_float = prm.get<bool>("store_factorization_as_float", false);
+
         using field_type = typename V::field_type;
         using GPUILU0 = typename gpuistl::OpmGpuILU0<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;

-        return std::make_shared<gpuistl::PreconditionerAdapter<V, V, GPUILU0>>(std::make_shared<GPUILU0>(op.getmat(), split_matrix, tune_gpu_kernels));
+        return std::make_shared<gpuistl::PreconditionerAdapter<V, V, GPUILU0>>(std::make_shared<GPUILU0>(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float));
     });

     F::addCreator("GPUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
@@ -636,6 +639,7 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
     F::addCreator("GPUDILUFloat", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
         const bool split_matrix = prm.get<bool>("split_matrix", true);
         const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
+
         using block_type = typename V::block_type;
         using VTo = Dune::BlockVector<Dune::FieldVector<float, block_type::dimension>>;
         using matrix_type_to = typename Dune::BCRSMatrix<Dune::FieldMatrix<float, block_type::dimension, block_type::dimension>>;
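The OPMGPUILU0 creators above read the new option from the same preconditioner property tree as split_matrix and tune_gpu_kernels, with false as the default. As an illustrative sketch only (the surrounding JSON layout is an assumption; only the key names and defaults come from this diff), a linear solver configuration selecting the mixed precision variant might look like:

    {
        "preconditioner": {
            "type": "OPMGPUILU0",
            "split_matrix": true,
            "tune_gpu_kernels": true,
            "store_factorization_as_float": true
        }
    }

Since the constructor asserts split_matrix when the float storage is requested, the two options have to be enabled together.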
@@ -81,6 +81,21 @@ GpuSparseMatrix<T>::GpuSparseMatrix(const T* nonZeroElements,
     }
 }

+template <class T>
+GpuSparseMatrix<T>::GpuSparseMatrix(const GpuVector<int> rowIndices,
+                                    const GpuVector<int> columnIndices,
+                                    size_t blockSize)
+    : m_nonZeroElements(columnIndices.dim() * blockSize * blockSize)
+    , m_columnIndices(columnIndices)
+    , m_rowIndices(rowIndices)
+    , m_numberOfNonzeroBlocks(detail::to_int(columnIndices.dim()))
+    , m_numberOfRows(detail::to_int(rowIndices.dim()-1))
+    , m_blockSize(detail::to_int(blockSize))
+    , m_matrixDescription(detail::createMatrixDescription())
+    , m_cusparseHandle(detail::CuSparseHandle::getInstance())
+{
+}
+
 template <class T>
 GpuSparseMatrix<T>::~GpuSparseMatrix()
 {
@@ -67,6 +67,20 @@ public:
                     size_t blockSize,
                     size_t numberOfRows);

+    //! Create a sparse matrix by copying the sparsity structure of another matrix, not filling in the values
+    //!
+    //! \note Prefer to use the constructor taking a const reference to a matrix instead.
+    //!
+    //! \param[in] rowIndices the row indices of the non-zero elements
+    //! \param[in] columnIndices the column indices of the non-zero elements
+    //! \param[in] blockSize size of each block matrix (typically 3)
+    //!
+    //! \note We assume numberOfNonzeroBlocks, blockSize and numberOfRows all are representable as int due to
+    //!       restrictions in the current version of cusparse. This might change in future versions.
+    GpuSparseMatrix(const GpuVector<int> rowIndices,
+                    const GpuVector<int> columnIndices,
+                    size_t blockSize);
+
     /**
      * We don't want to be able to copy this for now (too much hassle in copying the cusparse resources)
      */
@@ -41,7 +41,7 @@ namespace Opm::gpuistl
 {

 template <class M, class X, class Y, int l>
-OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels)
+OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat)
     : m_cpuMatrix(A)
     , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER))
     , m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets))
@@ -49,11 +49,14 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
     , m_gpuMatrix(GpuSparseMatrix<field_type>::fromMatrix(m_cpuMatrix, true))
     , m_gpuMatrixReorderedLower(nullptr)
     , m_gpuMatrixReorderedUpper(nullptr)
+    , m_gpuMatrixReorderedLowerFloat(nullptr)
+    , m_gpuMatrixReorderedUpperFloat(nullptr)
     , m_gpuNaturalToReorder(m_naturalToReordered)
     , m_gpuReorderToNatural(m_reorderedToNatural)
     , m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())
     , m_splitMatrix(splitMatrix)
     , m_tuneThreadBlockSizes(tuneKernels)
+    , m_storeFactorizationAsFloat(storeFactorizationAsFloat)
 {
     // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode?
     // Some sanity check
@@ -71,6 +74,7 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
                 fmt::format("CuSparse matrix not same number of non zeroes as DUNE matrix. {} vs {}. ",
                             m_gpuMatrix.nonzeroes(),
                             A.nonzeroes()));
+
     if (m_splitMatrix) {
         m_gpuMatrixReorderedDiag.emplace(GpuVector<field_type>(blocksize_ * blocksize_ * m_cpuMatrix.N()));
         std::tie(m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper)
@@ -80,6 +84,15 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
         m_gpuReorderedLU = detail::createReorderedMatrix<M, field_type, GpuSparseMatrix<field_type>>(
             m_cpuMatrix, m_reorderedToNatural);
     }
+
+    if (m_storeFactorizationAsFloat){
+        assert(m_splitMatrix && "Mixed precision GpuILU0 is currently only supported when using split_matrix=true");
+        // initialize mixed precision datastructures
+        m_gpuMatrixReorderedLowerFloat = std::unique_ptr<FloatMat>(new auto(FloatMat(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_)));
+        m_gpuMatrixReorderedUpperFloat = std::unique_ptr<FloatMat>(new auto(FloatMat(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_)));
+        m_gpuMatrixReorderedDiagFloat.emplace(GpuVector<float>(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()));
+    }
+
     LUFactorizeAndMoveData(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);

     if (m_tuneThreadBlockSizes) {
@@ -107,20 +120,37 @@ template <class M, class X, class Y, int l>
 void
 OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int upperSolveThreadBlockSize)
 {
+    // perform a lower solve and then an upper solve to apply the approximate inverse using ILU factorization
+    // for the lower and upper solve we have some if's that determine which underlying implementation to use
+
     int levelStartIdx = 0;
     for (int level = 0; level < m_levelSets.size(); ++level) {
         const int numOfRowsInLevel = m_levelSets[level].size();
         if (m_splitMatrix) {
-            detail::ILU0::solveLowerLevelSetSplit<field_type, blocksize_>(
-                m_gpuMatrixReorderedLower->getNonZeroValues().data(),
-                m_gpuMatrixReorderedLower->getRowIndices().data(),
-                m_gpuMatrixReorderedLower->getColumnIndices().data(),
-                m_gpuReorderToNatural.data(),
-                levelStartIdx,
-                numOfRowsInLevel,
-                d.data(),
-                v.data(),
-                lowerSolveThreadBlockSize);
+            if (m_storeFactorizationAsFloat){
+                detail::ILU0::solveLowerLevelSetSplit<blocksize_, field_type, float>(
+                    m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedLowerFloat->getRowIndices().data(),
+                    m_gpuMatrixReorderedLowerFloat->getColumnIndices().data(),
+                    m_gpuReorderToNatural.data(),
+                    levelStartIdx,
+                    numOfRowsInLevel,
+                    d.data(),
+                    v.data(),
+                    lowerSolveThreadBlockSize);
+            }
+            else{
+                detail::ILU0::solveLowerLevelSetSplit<blocksize_, field_type, field_type>(
+                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedLower->getRowIndices().data(),
+                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
+                    m_gpuReorderToNatural.data(),
+                    levelStartIdx,
+                    numOfRowsInLevel,
+                    d.data(),
+                    v.data(),
+                    lowerSolveThreadBlockSize);
+            }
         } else {
             detail::ILU0::solveLowerLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
                                                                      m_gpuReorderedLU->getRowIndices().data(),
@@ -140,16 +170,30 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
         const int numOfRowsInLevel = m_levelSets[level].size();
         levelStartIdx -= numOfRowsInLevel;
         if (m_splitMatrix) {
-            detail::ILU0::solveUpperLevelSetSplit<field_type, blocksize_>(
-                m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
-                m_gpuMatrixReorderedUpper->getRowIndices().data(),
-                m_gpuMatrixReorderedUpper->getColumnIndices().data(),
-                m_gpuReorderToNatural.data(),
-                levelStartIdx,
-                numOfRowsInLevel,
-                m_gpuMatrixReorderedDiag.value().data(),
-                v.data(),
-                upperSolveThreadBlockSize);
+            if (m_storeFactorizationAsFloat) {
+                detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, float>(
+                    m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedUpperFloat->getRowIndices().data(),
+                    m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(),
+                    m_gpuReorderToNatural.data(),
+                    levelStartIdx,
+                    numOfRowsInLevel,
+                    m_gpuMatrixReorderedDiagFloat.value().data(),
+                    v.data(),
+                    upperSolveThreadBlockSize);
+            }
+            else{
+                detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, field_type>(
+                    m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedUpper->getRowIndices().data(),
+                    m_gpuMatrixReorderedUpper->getColumnIndices().data(),
+                    m_gpuReorderToNatural.data(),
+                    levelStartIdx,
+                    numOfRowsInLevel,
+                    m_gpuMatrixReorderedDiag.value().data(),
+                    v.data(),
+                    upperSolveThreadBlockSize);
+            }
         } else {
             detail::ILU0::solveUpperLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
                                                                      m_gpuReorderedLU->getRowIndices().data(),
@@ -227,7 +271,7 @@ OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
         const int numOfRowsInLevel = m_levelSets[level].size();

         if (m_splitMatrix) {
-            detail::ILU0::LUFactorizationSplit<field_type, blocksize_>(
+            detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float>(
                 m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                 m_gpuMatrixReorderedLower->getRowIndices().data(),
                 m_gpuMatrixReorderedLower->getColumnIndices().data(),
@@ -235,10 +279,14 @@ OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
                 m_gpuMatrixReorderedUpper->getRowIndices().data(),
                 m_gpuMatrixReorderedUpper->getColumnIndices().data(),
                 m_gpuMatrixReorderedDiag.value().data(),
+                m_storeFactorizationAsFloat ? m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data() : nullptr,
+                m_storeFactorizationAsFloat ? m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data() : nullptr,
+                m_storeFactorizationAsFloat ? m_gpuMatrixReorderedDiagFloat.value().data() : nullptr,
                 m_gpuReorderToNatural.data(),
                 m_gpuNaturalToReorder.data(),
                 levelStartIdx,
                 numOfRowsInLevel,
+                m_storeFactorizationAsFloat,
                 factorizationThreadBlockSize);

         } else {
@@ -54,7 +54,9 @@ public:
     //! \brief The field type of the preconditioner.
     using field_type = typename X::field_type;
     //! \brief The GPU matrix type
-    using CuMat = GpuSparseMatrix<field_type>;
+    using GpuMat = GpuSparseMatrix<field_type>;
+    //! \brief The Float matrix type for mixed precision
+    using FloatMat = GpuSparseMatrix<float>;

     //! \brief Constructor.
     //!
@@ -62,7 +64,7 @@ public:
     //! \param A The matrix to operate on.
     //! \param w The relaxation factor.
     //!
-    explicit OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels);
+    explicit OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat);

     //! \brief Prepare the preconditioner.
     //! \note Does nothing at the time being.
@@ -120,11 +122,15 @@ private:
     //! \brief converts from index in natural ordered structure to index reordered strucutre
     std::vector<int> m_naturalToReordered;
     //! \brief The A matrix stored on the gpu, and its reordred version
-    CuMat m_gpuMatrix;
-    std::unique_ptr<CuMat> m_gpuReorderedLU;
+    GpuMat m_gpuMatrix;
+    std::unique_ptr<GpuMat> m_gpuReorderedLU;
     //! \brief If matrix splitting is enabled, then we store the lower and upper part separately
-    std::unique_ptr<CuMat> m_gpuMatrixReorderedLower;
-    std::unique_ptr<CuMat> m_gpuMatrixReorderedUpper;
+    std::unique_ptr<GpuMat> m_gpuMatrixReorderedLower;
+    std::unique_ptr<GpuMat> m_gpuMatrixReorderedUpper;
+    //! \brief If mixed precision is enabled, store a float matrix
+    std::unique_ptr<FloatMat> m_gpuMatrixReorderedLowerFloat;
+    std::unique_ptr<FloatMat> m_gpuMatrixReorderedUpperFloat;
+    std::optional<GpuVector<float>> m_gpuMatrixReorderedDiagFloat;
     //! \brief If matrix splitting is enabled, we also store the diagonal separately
     std::optional<GpuVector<field_type>> m_gpuMatrixReorderedDiag;
     //! row conversion from natural to reordered matrix indices stored on the GPU
@@ -137,6 +143,9 @@ private:
     bool m_splitMatrix;
     //! \brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards
     bool m_tuneThreadBlockSizes;
+    //! \brief Bool storing whether or not we should store the ILU factorization in a float datastructure.
+    //! This uses a mixed precision preconditioner to trade numerical accuracy for memory transfer speed.
+    bool m_storeFactorizationAsFloat;
     //! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards
     //! The default value of -1 indicates that we have not calibrated and selected a value yet
     int m_upperSolveThreadBlockSize = -1;
@@ -37,9 +37,9 @@ __device__ __forceinline__ void
 invBlockOutOfPlace(const T* __restrict__ srcBlock, T* __restrict__ dstBlock)
 {
     if (blocksize == 1) {
-        dstBlock[0] = 1.0 / (srcBlock[0]);
+        dstBlock[0] = T(1.0) / (srcBlock[0]);
     } else if (blocksize == 2) {
-        T detInv = 1.0 / (srcBlock[0] * srcBlock[3] - srcBlock[1] * srcBlock[2]);
+        T detInv = T(1.0) / (srcBlock[0] * srcBlock[3] - srcBlock[1] * srcBlock[2]);
         dstBlock[0] = srcBlock[3] * detInv;
         dstBlock[1] = -srcBlock[1] * detInv;
         dstBlock[2] = -srcBlock[2] * detInv;
@@ -53,7 +53,7 @@ invBlockOutOfPlace(const T* __restrict__ srcBlock, T* __restrict__ dstBlock)
         T t12 = srcBlock[1] * srcBlock[6];
         T t14 = srcBlock[2] * srcBlock[6];

-        T t17 = 1.0
+        T t17 = T(1.0)
             / (t4 * srcBlock[8] - t6 * srcBlock[7] - t8 * srcBlock[8] + t10 * srcBlock[7] + t12 * srcBlock[5]
                - t14 * srcBlock[4]); // t17 is 1/determinant

@@ -75,9 +75,9 @@ __device__ __forceinline__ void
 invBlockInPlace(T* __restrict__ block)
 {
     if (blocksize == 1) {
-        block[0] = 1.0 / (block[0]);
+        block[0] = T(1.0) / (block[0]);
     } else if (blocksize == 2) {
-        T detInv = 1.0 / (block[0] * block[3] - block[1] * block[2]);
+        T detInv = T(1.0) / (block[0] * block[3] - block[1] * block[2]);

         T temp = block[0];
         block[0] = block[3] * detInv;
@@ -93,7 +93,7 @@ invBlockInPlace(T* __restrict__ block)
         T t14 = block[2] * block[6];

         T det = (t4 * block[8] - t6 * block[7] - t8 * block[8] + t10 * block[7] + t12 * block[5] - t14 * block[4]);
-        T t17 = T(1.0) / det;
+        T t17 = T(T(1.0)) / det;

         T matrix01 = block[1];
         T matrix00 = block[0];
@@ -229,6 +229,46 @@ matrixSubtraction(T* A, T* B)
         }
     }
 }
+
+// TODO: possibly place somewhere else in this file
+// TODO: consider merging with existing block operations
+template<int blocksize, class ScalarInputType, class ScalarOutputType>
+__device__ __forceinline__ void
+moveBlock(const ScalarInputType* __restrict__ A, ScalarOutputType* __restrict__ B){
+    for (int i = 0; i < blocksize; ++i){
+        for (int j = 0; j < blocksize; ++j){
+            B[i * blocksize + j] = ScalarOutputType(A[i * blocksize + j]);
+        }
+    }
+}
+
+// TODO: possibly place somewhere else in this file
+// TODO: consider merging with existing block operations
+template <int blocksize, class MatrixScalar, class VectorScalar, class ResultScalar, class ComputeScalar>
+__device__ __forceinline__ void
+mvMixedGeneral(const MatrixScalar* A, const VectorScalar* b, ResultScalar* c)
+{
+    for (int i = 0; i < blocksize; ++i) {
+        c[i] = 0;
+
+        for (int j = 0; j < blocksize; ++j) {
+            c[i] += ResultScalar(ComputeScalar(A[i * blocksize + j]) * ComputeScalar(b[j]));
+        }
+    }
+}
+
+// TODO: possibly place somewhere else in this file
+// TODO: consider merging with existing block operations
+template <int blocksize, class MatrixScalar, class VectorScalar, class ResultScalar, class ComputeScalar>
+__device__ __forceinline__ void
+mmvMixedGeneral(const MatrixScalar* A, const VectorScalar* b, ResultScalar* c)
+{
+    for (int i = 0; i < blocksize; ++i) {
+        for (int j = 0; j < blocksize; ++j) {
+            c[i] -= ResultScalar(ComputeScalar(A[i * blocksize + j]) * ComputeScalar(b[j]));
+        }
+    }
+}
 } // namespace

 #endif
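The helpers added above are the device-side building blocks of the mixed precision path: moveBlock converts one block between scalar types, while mvMixedGeneral and mmvMixedGeneral compute c = A*b and c -= A*b with separate matrix, vector, result and compute scalar types. A small device-side sketch, assuming it lives in the same translation unit as these helpers (it is not part of the commit):

    // Hypothetical kernel: demote a 3x3 double block to float storage, then apply
    // it to a double vector with double accumulation, mirroring what the
    // factorization and split solve kernels below do with the dst* arrays.
    __global__ void mixedPrecisionBlockDemo(const double* srcBlock,
                                            float* dstBlock,
                                            const double* x,
                                            double* y)
    {
        moveBlock<3, double, float>(srcBlock, dstBlock);
        mvMixedGeneral<3, float, double, double, double>(dstBlock, x, y);
    }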
@@ -120,17 +120,21 @@ namespace
        }
    }

-    template <class T, int blocksize>
-    __global__ void cuLUFactorizationSplit(T* reorderedLowerMat,
+    template <int blocksize, class InputScalar, class OutputScalar>
+    __global__ void cuLUFactorizationSplit(InputScalar* srcReorderedLowerMat,
                                            int* lowerRowIndices,
                                            int* lowerColIndices,
-                                           T* reorderedUpperMat,
+                                           InputScalar* srcReorderedUpperMat,
                                            int* upperRowIndices,
                                            int* upperColIndices,
-                                           T* diagonal,
+                                           InputScalar* srcDiagonal,
+                                           OutputScalar* dstReorderedLowerMat,
+                                           OutputScalar* dstReorderedUpperMat,
+                                           OutputScalar* dstDiagonal,
                                            int* reorderedToNatural,
                                            int* naturalToReordered,
                                            const int startIdx,
+                                           const bool copyResultToOtherMatrix,
                                            int rowsInLevelSet)
    {
        auto reorderedIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
@@ -155,9 +159,12 @@ namespace

                // the DUNE implementation is inplace, and I need to take care to make sure
                // this out of place implementation is equivalent
-                mmOverlap<T, blocksize>(&reorderedLowerMat[ij * scalarsInBlock],
-                                        &diagonal[j * scalarsInBlock],
-                                        &reorderedLowerMat[ij * scalarsInBlock]);
+                mmOverlap<InputScalar, blocksize>(&srcReorderedLowerMat[ij * scalarsInBlock],
+                                                  &srcDiagonal[j * scalarsInBlock],
+                                                  &srcReorderedLowerMat[ij * scalarsInBlock]);
+                if (copyResultToOtherMatrix){
+                    moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedLowerMat[ij * scalarsInBlock], &dstReorderedLowerMat[ij * scalarsInBlock]);
+                }

                // we have now accounted for an element under the diagonal and the diagonal element above it
                // now iterate over the blocks that align in the
@@ -172,40 +179,53 @@ namespace
                // the same block index might be found in the lower part of the matrix
                while (!(ik == endOfRowIUpper && ikState == POSITION_TYPE::ABOVE_DIAG) && jk != endOfRowJ) {

-                    T* ikBlockPtr;
+                    InputScalar* ikBlockPtr;
+                    OutputScalar* dstIkBlockPtr;
                    int ikColumn;
                    if (ikState == POSITION_TYPE::UNDER_DIAG) {
-                        ikBlockPtr = &reorderedLowerMat[ik * scalarsInBlock];
+                        ikBlockPtr = &srcReorderedLowerMat[ik * scalarsInBlock];
+                        if (copyResultToOtherMatrix)
+                            dstIkBlockPtr = &dstReorderedLowerMat[ik * scalarsInBlock];
                        ikColumn = lowerColIndices[ik];
                    } else if (ikState == POSITION_TYPE::ON_DIAG) {
-                        ikBlockPtr = &diagonal[reorderedIdx * scalarsInBlock];
+                        ikBlockPtr = &srcDiagonal[reorderedIdx * scalarsInBlock];
+                        if (copyResultToOtherMatrix)
+                            dstIkBlockPtr = &dstDiagonal[reorderedIdx * scalarsInBlock];
                        ikColumn = naturalIdx;
                    } else { // ikState == POSITION_TYPE::ABOVE_DIAG
-                        ikBlockPtr = &reorderedUpperMat[ik * scalarsInBlock];
+                        ikBlockPtr = &srcReorderedUpperMat[ik * scalarsInBlock];
+                        if (copyResultToOtherMatrix)
+                            dstIkBlockPtr = &dstReorderedUpperMat[ik * scalarsInBlock];
                        ikColumn = upperColIndices[ik];
                    }

                    if (ikColumn == upperColIndices[jk]) {
                        // A_jk = A_ij * A_jk
-                        T tmp[scalarsInBlock] = {0};
+                        InputScalar tmp[scalarsInBlock] = {0};

-                        mmNoOverlap<T, blocksize>(
-                            &reorderedLowerMat[ij * scalarsInBlock], &reorderedUpperMat[jk * scalarsInBlock], tmp);
-                        matrixSubtraction<T, blocksize>(ikBlockPtr, tmp);
+                        mmNoOverlap<InputScalar, blocksize>(
+                            &srcReorderedLowerMat[ij * scalarsInBlock], &srcReorderedUpperMat[jk * scalarsInBlock], tmp);
+                        matrixSubtraction<InputScalar, blocksize>(ikBlockPtr, tmp);
                        incrementAcrossSplitStructure(ik, ikState, endOfRowILower, startOfRowIUpper);
-                        ;
                        ++jk;
                    } else {
                        if (ikColumn < upperColIndices[jk]) {
                            incrementAcrossSplitStructure(ik, ikState, endOfRowILower, startOfRowIUpper);
-                            ;
                        } else {
                            ++jk;
                        }
                    }
                }
            }
-            invBlockInPlace<T, blocksize>(&diagonal[reorderedIdx * scalarsInBlock]);
+            invBlockInPlace<InputScalar, blocksize>(&srcDiagonal[reorderedIdx * scalarsInBlock]);
+            if (copyResultToOtherMatrix){
+                moveBlock<blocksize, InputScalar, OutputScalar>(&srcDiagonal[reorderedIdx * scalarsInBlock], &dstDiagonal[reorderedIdx * scalarsInBlock]);
+
+                // also move all values above the diagonal on this row
+                for (int block = startOfRowIUpper; block < endOfRowIUpper; ++block) {
+                    moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedUpperMat[block * scalarsInBlock], &dstReorderedUpperMat[block * scalarsInBlock]);
+                }
+            }
        }
    }

@@ -266,15 +286,15 @@ namespace
        }
    }

-    template <class T, int blocksize>
-    __global__ void cuSolveLowerLevelSetSplit(T* mat,
+    template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+    __global__ void cuSolveLowerLevelSetSplit(MatrixScalar* mat,
                                              int* rowIndices,
                                              int* colIndices,
                                              int* indexConversion,
                                              int startIdx,
                                              int rowsInLevelSet,
-                                              const T* d,
-                                              T* v)
+                                              const LinearSolverScalar* d,
+                                              LinearSolverScalar* v)
    {
        auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
        if (reorderedIdx < rowsInLevelSet + startIdx) {
@@ -282,30 +302,31 @@ namespace
            const size_t nnzIdxLim = rowIndices[reorderedIdx + 1];
            const int naturalRowIdx = indexConversion[reorderedIdx];

-            T rhs[blocksize];
+            LinearSolverScalar rhs[blocksize];
            for (int i = 0; i < blocksize; i++) {
                rhs[i] = d[naturalRowIdx * blocksize + i];
            }

            for (int block = nnzIdx; block < nnzIdxLim; ++block) {
                const int col = colIndices[block];
-                mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
+                mmvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
+                    &mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
            }
            for (int i = 0; i < blocksize; ++i) {
-                v[naturalRowIdx * blocksize + i] = rhs[i];
+                v[naturalRowIdx * blocksize + i] = LinearSolverScalar(rhs[i]);
            }
        }
    }

-    template <class T, int blocksize>
-    __global__ void cuSolveUpperLevelSetSplit(T* mat,
+    template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+    __global__ void cuSolveUpperLevelSetSplit(MatrixScalar* mat,
                                              int* rowIndices,
                                              int* colIndices,
                                              int* indexConversion,
                                              int startIdx,
                                              int rowsInLevelSet,
-                                              const T* dInv,
-                                              T* v)
+                                              const MatrixScalar* dInv,
+                                              LinearSolverScalar* v)
    {
        auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
        if (reorderedIdx < rowsInLevelSet + startIdx) {
@@ -313,17 +334,17 @@ namespace
            const size_t nnzIdxLim = rowIndices[reorderedIdx + 1];
            const int naturalRowIdx = indexConversion[reorderedIdx];

-            T rhs[blocksize];
+            LinearSolverScalar rhs[blocksize];
            for (int i = 0; i < blocksize; i++) {
                rhs[i] = v[naturalRowIdx * blocksize + i];
            }

            for (int block = nnzIdx; block < nnzIdxLim; ++block) {
                const int col = colIndices[block];
-                mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
+                mmvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
            }

-            mv<T, blocksize>(&dInv[reorderedIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
+            mvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
        }
    }
 } // namespace
@@ -365,41 +386,41 @@ solveUpperLevelSet(T* reorderedMat,
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, v);
 }

-template <class T, int blocksize>
+template <int blocksize, class LinearSolverScalar, class MatrixScalar>
 void
-solveLowerLevelSetSplit(T* reorderedMat,
+solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
                        int* rowIndices,
                        int* colIndices,
                        int* indexConversion,
                        int startIdx,
                        int rowsInLevelSet,
-                        const T* d,
-                        T* v,
+                        const LinearSolverScalar* d,
+                        LinearSolverScalar* v,
                        int thrBlockSize)
 {
    int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
-        cuSolveLowerLevelSetSplit<T, blocksize>, thrBlockSize);
+        cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>, thrBlockSize);
    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveLowerLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
 }
 // perform the upper solve for all rows in the same level set
-template <class T, int blocksize>
+template <int blocksize, class LinearSolverScalar, class MatrixScalar>
 void
-solveUpperLevelSetSplit(T* reorderedMat,
+solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                        int* rowIndices,
                        int* colIndices,
                        int* indexConversion,
                        int startIdx,
                        int rowsInLevelSet,
-                        const T* dInv,
-                        T* v,
+                        const MatrixScalar* dInv,
+                        LinearSolverScalar* v,
                        int thrBlockSize)
 {
    int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
-        cuSolveUpperLevelSetSplit<T, blocksize>, thrBlockSize);
+        cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>, thrBlockSize);
    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }

@@ -421,34 +442,42 @@ LUFactorization(T* srcMatrix,
        srcMatrix, srcRowIndices, srcColumnIndices, naturalToReordered, reorderedToNatual, rowsInLevelSet, startIdx);
 }

-template <class T, int blocksize>
+template <int blocksize, class InputScalar, class OutputScalar>
 void
-LUFactorizationSplit(T* reorderedLowerMat,
+LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
                     int* lowerRowIndices,
                     int* lowerColIndices,
-                     T* reorderedUpperMat,
+                     InputScalar* reorderedUpperMat,
                     int* upperRowIndices,
                     int* upperColIndices,
-                     T* diagonal,
+                     InputScalar* srcDiagonal,
+                     OutputScalar* dstReorderedLowerMat,
+                     OutputScalar* dstReorderedUpperMat,
+                     OutputScalar* dstDiagonal,
                     int* reorderedToNatural,
                     int* naturalToReordered,
                     const int startIdx,
                     int rowsInLevelSet,
+                     bool copyResultToOtherMatrix,
                     int thrBlockSize)
 {
    int threadBlockSize
-        = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorizationSplit<T, blocksize>, thrBlockSize);
+        = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar>, thrBlockSize);
    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuLUFactorizationSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(reorderedLowerMat,
+    cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar><<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
                                                                             lowerRowIndices,
                                                                             lowerColIndices,
                                                                             reorderedUpperMat,
                                                                             upperRowIndices,
                                                                             upperColIndices,
-                                                                             diagonal,
+                                                                             srcDiagonal,
+                                                                             dstReorderedLowerMat,
+                                                                             dstReorderedUpperMat,
+                                                                             dstDiagonal,
                                                                             reorderedToNatural,
                                                                             naturalToReordered,
                                                                             startIdx,
+                                                                             copyResultToOtherMatrix,
                                                                             rowsInLevelSet);
 }

@@ -456,10 +485,12 @@ LUFactorizationSplit(T* reorderedLowerMat,
    template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, T*, int); \
    template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
    template void LUFactorization<T, blocksize>(T*, int*, int*, int*, int*, size_t, int, int); \
-    template void LUFactorizationSplit<T, blocksize>( \
-        T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, int); \
-    template void solveUpperLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
-    template void solveLowerLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int);
+    template void LUFactorizationSplit<blocksize, T, float>( \
+        T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, bool, int); \
+    template void LUFactorizationSplit<blocksize, T, double>( \
+        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, bool, int);
+    // template void solveUpperLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
+    // template void solveLowerLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int);

 INSTANTIATE_KERNEL_WRAPPERS(float, 1);
 INSTANTIATE_KERNEL_WRAPPERS(float, 2);
@@ -473,4 +504,26 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 3);
 INSTANTIATE_KERNEL_WRAPPERS(double, 4);
 INSTANTIATE_KERNEL_WRAPPERS(double, 5);
 INSTANTIATE_KERNEL_WRAPPERS(double, 6);
+
+#define INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(blocksize) \
+    /* double preconditioner */ \
+    template void solveLowerLevelSetSplit<blocksize, double, double>(double*, int*, int*, int*, int, int, const double*, double*, int); \
+    /* float matrix, double compute preconditioner */ \
+    template void solveLowerLevelSetSplit<blocksize, double, float>(float*, int*, int*, int*, int, int, const double*, double*, int); \
+    /* float preconditioner */ \
+    template void solveLowerLevelSetSplit<blocksize, float, float>(float*, int*, int*, int*, int, int, const float*, float*, int); \
+    \
+    /* double preconditioner */ \
+    template void solveUpperLevelSetSplit<blocksize, double, double>(double*, int*, int*, int*, int, int, const double*, double*, int); \
+    /* float matrix, double compute preconditioner */ \
+    template void solveUpperLevelSetSplit<blocksize, double, float>(float*, int*, int*, int*, int, int, const float*, double*, int); \
+    /* float preconditioner */ \
+    template void solveUpperLevelSetSplit<blocksize, float, float>(float*, int*, int*, int*, int, int, const float*, float*, int); \
+
+INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(1);
+INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(2);
+INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(3);
+INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(4);
+INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(5);
+INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(6);
 } // namespace Opm::gpuistl::detail::ILU0
@@ -89,15 +89,15 @@ void solveLowerLevelSet(T* reorderedMat,
 * solve
 * @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
 */
-template <class T, int blocksize>
-void solveUpperLevelSetSplit(T* reorderedMat,
+template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+void solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                             int* rowIndices,
                             int* colIndices,
                             int* indexConversion,
                             int startIdx,
                             int rowsInLevelSet,
-                             const T* dInv,
-                             T* v,
+                             const MatrixScalar* dInv,
+                             LinearSolverScalar* v,
                             int threadBlockSize);

 /**
@@ -117,15 +117,15 @@ void solveUpperLevelSetSplit(T* reorderedMat,
 * solve
 * @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
 */
-template <class T, int blocksize>
-void solveLowerLevelSetSplit(T* reorderedLowerMat,
+template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+void solveLowerLevelSetSplit(MatrixScalar* reorderedLowerMat,
                             int* rowIndices,
                             int* colIndices,
                             int* indexConversion,
                             int startIdx,
                             int rowsInLevelSet,
-                             const T* d,
-                             T* v,
+                             const LinearSolverScalar* d,
+                             LinearSolverScalar* v,
                             int threadBlockSize);

 /**
@@ -155,6 +155,7 @@ void LUFactorization(T* reorderedMat,
                     int threadBlockSize);

 /**
+ * TODO: update this doucmentation
 * @brief Computes the ILU0 factorization in-place of a bcsr matrix stored in a split format (lower, diagonal and upper
 * triangular part)
 * @param [out] reorderedLowerMat pointer to GPU memory containing nonzerovalues of the strictly lower triangular sparse
@@ -175,18 +176,22 @@ void LUFactorization(T* reorderedMat,
 * function
 * @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
 */
-template <class T, int blocksize>
-void LUFactorizationSplit(T* reorderedLowerMat,
+template <int blocksize, class InputScalar, class OutputScalar>
+void LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
                          int* lowerRowIndices,
                          int* lowerColIndices,
-                          T* reorderedUpperMat,
+                          InputScalar* srcReorderedUpperMat,
                          int* upperRowIndices,
                          int* upperColIndices,
-                          T* diagonal,
+                          InputScalar* srcDiagonal,
+                          OutputScalar* dstReorderedLowerMat,
+                          OutputScalar* dstReorderedUpperMat,
+                          OutputScalar* dstDiagonal,
                          int* reorderedToNatural,
                          int* naturalToReordered,
                          int startIdx,
                          int rowsInLevelSet,
+                          bool copyResultToOtherMatrix,
                          int threadBlockSize);

 } // namespace Opm::gpuistl::detail::ILU0