mirror of https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
make autotuner use lambda that only depends on blocksize
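The idea, in a minimal self-contained sketch (all names here are illustrative stand-ins, not the simulator's API): before this change the tuner took a zero-argument callable built with std::bind plus a reference to a block-size member that it mutated between timed runs; after it, the tuner times a callable whose only parameter is the candidate block size and returns the fastest one. The sketch times on the host with std::chrono rather than CUDA events so it stays runnable anywhere.

```cpp
#include <chrono>
#include <cstdio>
#include <functional>
#include <limits>

// Illustrative stand-in for the tuner: times f(blockSize) for each candidate
// size and returns the fastest, mirroring the lambda-based interface this
// commit introduces. Names and the 32..1024 sweep are assumptions.
int pickBestBlockSize(const std::function<void(int)>& f)
{
    int bestSize = 32;
    double bestTime = std::numeric_limits<double>::max();
    for (int size = 32; size <= 1024; size += 32) {
        const auto start = std::chrono::steady_clock::now();
        f(size); // the candidate block size is the callable's only dependency
        const std::chrono::duration<double> elapsed
            = std::chrono::steady_clock::now() - start;
        if (elapsed.count() < bestTime) {
            bestTime = elapsed.count();
            bestSize = size;
        }
    }
    return bestSize;
}

int main()
{
    // A kernel launch would go inside the lambda; printf keeps this runnable.
    const int tuned = pickBestBlockSize([](int blockSize) {
        std::printf("timing candidate block size %d\n", blockSize);
    });
    std::printf("tuned block size: %d\n", tuned);
}
```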
@@ -74,10 +74,12 @@ CuDILU<M, X, Y, l>::CuDILU(const M& A, bool splitMatrix, bool tuneKernels)
         std::tie(m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper)
             = detail::extractLowerAndUpperMatrices<M, field_type, CuSparseMatrix<field_type>>(m_cpuMatrix,
                                                                                               m_reorderedToNatural);
     }
     else {
         m_gpuMatrixReordered = detail::createReorderedMatrix<M, field_type, CuSparseMatrix<field_type>>(
             m_cpuMatrix, m_reorderedToNatural);
     }
-    computeDiagAndMoveReorderedData();
+    computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
 
     if (m_tuneThreadBlockSizes) {
         tuneThreadBlockSizes();
@@ -96,69 +98,77 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
 {
     OPM_TIMEBLOCK(prec_apply);
     {
-        int levelStartIdx = 0;
-        for (int level = 0; level < m_levelSets.size(); ++level) {
-            const int numOfRowsInLevel = m_levelSets[level].size();
-            if (m_splitMatrix) {
-                detail::DILU::solveLowerLevelSetSplit<field_type, blocksize_>(
-                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
-                    m_gpuMatrixReorderedLower->getRowIndices().data(),
-                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
-                    m_gpuReorderToNatural.data(),
-                    levelStartIdx,
-                    numOfRowsInLevel,
-                    m_gpuDInv.data(),
-                    d.data(),
-                    v.data(),
-                    m_lowerSolveThreadBlockSize);
-            } else {
-                detail::DILU::solveLowerLevelSet<field_type, blocksize_>(
-                    m_gpuMatrixReordered->getNonZeroValues().data(),
-                    m_gpuMatrixReordered->getRowIndices().data(),
-                    m_gpuMatrixReordered->getColumnIndices().data(),
-                    m_gpuReorderToNatural.data(),
-                    levelStartIdx,
-                    numOfRowsInLevel,
-                    m_gpuDInv.data(),
-                    d.data(),
-                    v.data(),
-                    m_lowerSolveThreadBlockSize);
-            }
-            levelStartIdx += numOfRowsInLevel;
-        }
-
-        levelStartIdx = m_cpuMatrix.N();
-        // upper triangular solve: (D + U_A) v = Dy
-        for (int level = m_levelSets.size() - 1; level >= 0; --level) {
-            const int numOfRowsInLevel = m_levelSets[level].size();
-            levelStartIdx -= numOfRowsInLevel;
-            if (m_splitMatrix) {
-                detail::DILU::solveUpperLevelSetSplit<field_type, blocksize_>(
-                    m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
-                    m_gpuMatrixReorderedUpper->getRowIndices().data(),
-                    m_gpuMatrixReorderedUpper->getColumnIndices().data(),
-                    m_gpuReorderToNatural.data(),
-                    levelStartIdx,
-                    numOfRowsInLevel,
-                    m_gpuDInv.data(),
-                    v.data(),
-                    m_upperSolveThreadBlockSize);
-            } else {
-                detail::DILU::solveUpperLevelSet<field_type, blocksize_>(
-                    m_gpuMatrixReordered->getNonZeroValues().data(),
-                    m_gpuMatrixReordered->getRowIndices().data(),
-                    m_gpuMatrixReordered->getColumnIndices().data(),
-                    m_gpuReorderToNatural.data(),
-                    levelStartIdx,
-                    numOfRowsInLevel,
-                    m_gpuDInv.data(),
-                    v.data(),
-                    m_upperSolveThreadBlockSize);
-            }
-        }
+        apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
     }
 }
 
+template <class M, class X, class Y, int l>
+void
+CuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int upperSolveThreadBlockSize)
+{
+    int levelStartIdx = 0;
+    for (int level = 0; level < m_levelSets.size(); ++level) {
+        const int numOfRowsInLevel = m_levelSets[level].size();
+        if (m_splitMatrix) {
+            detail::DILU::solveLowerLevelSetSplit<field_type, blocksize_>(
+                m_gpuMatrixReorderedLower->getNonZeroValues().data(),
+                m_gpuMatrixReorderedLower->getRowIndices().data(),
+                m_gpuMatrixReorderedLower->getColumnIndices().data(),
+                m_gpuReorderToNatural.data(),
+                levelStartIdx,
+                numOfRowsInLevel,
+                m_gpuDInv.data(),
+                d.data(),
+                v.data(),
+                lowerSolveThreadBlockSize);
+        } else {
+            detail::DILU::solveLowerLevelSet<field_type, blocksize_>(
+                m_gpuMatrixReordered->getNonZeroValues().data(),
+                m_gpuMatrixReordered->getRowIndices().data(),
+                m_gpuMatrixReordered->getColumnIndices().data(),
+                m_gpuReorderToNatural.data(),
+                levelStartIdx,
+                numOfRowsInLevel,
+                m_gpuDInv.data(),
+                d.data(),
+                v.data(),
+                lowerSolveThreadBlockSize);
+        }
+        levelStartIdx += numOfRowsInLevel;
+    }
+
+    levelStartIdx = m_cpuMatrix.N();
+    // upper triangular solve: (D + U_A) v = Dy
+    for (int level = m_levelSets.size() - 1; level >= 0; --level) {
+        const int numOfRowsInLevel = m_levelSets[level].size();
+        levelStartIdx -= numOfRowsInLevel;
+        if (m_splitMatrix) {
+            detail::DILU::solveUpperLevelSetSplit<field_type, blocksize_>(
+                m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
+                m_gpuMatrixReorderedUpper->getRowIndices().data(),
+                m_gpuMatrixReorderedUpper->getColumnIndices().data(),
+                m_gpuReorderToNatural.data(),
+                levelStartIdx,
+                numOfRowsInLevel,
+                m_gpuDInv.data(),
+                v.data(),
+                upperSolveThreadBlockSize);
+        } else {
+            detail::DILU::solveUpperLevelSet<field_type, blocksize_>(
+                m_gpuMatrixReordered->getNonZeroValues().data(),
+                m_gpuMatrixReordered->getRowIndices().data(),
+                m_gpuMatrixReordered->getColumnIndices().data(),
+                m_gpuReorderToNatural.data(),
+                levelStartIdx,
+                numOfRowsInLevel,
+                m_gpuDInv.data(),
+                v.data(),
+                upperSolveThreadBlockSize);
+        }
+    }
+}
 
 
 template <class M, class X, class Y, int l>
 void
 CuDILU<M, X, Y, l>::post([[maybe_unused]] X& x)
@@ -178,72 +188,76 @@ CuDILU<M, X, Y, l>::update()
 {
     OPM_TIMEBLOCK(prec_update);
     {
-        m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
-        computeDiagAndMoveReorderedData();
+        update(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
     }
 }
 
 template <class M, class X, class Y, int l>
 void
-CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData()
+CuDILU<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationBlockSize)
+{
+    OPM_TIMEBLOCK(prec_update);
+    {
+        m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
+        computeDiagAndMoveReorderedData(moveThreadBlockSize, factorizationBlockSize);
+    }
+}
+
+template <class M, class X, class Y, int l>
+void
+CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationBlockSize)
 {
     if (m_splitMatrix) {
         detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
             m_gpuMatrix.getNonZeroValues().data(),
             m_gpuMatrix.getRowIndices().data(),
             m_gpuMatrix.getColumnIndices().data(),
             m_gpuMatrixReorderedLower->getNonZeroValues().data(),
             m_gpuMatrixReorderedLower->getRowIndices().data(),
             m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
             m_gpuMatrixReorderedUpper->getRowIndices().data(),
             m_gpuMatrixReorderedDiag->data(),
             m_gpuNaturalToReorder.data(),
             m_gpuMatrixReorderedLower->N(),
-            m_moveThreadBlockSize);
+            moveThreadBlockSize);
     } else {
         detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(),
                                                                m_gpuMatrix.getRowIndices().data(),
                                                                m_gpuMatrixReordered->getNonZeroValues().data(),
                                                                m_gpuMatrixReordered->getRowIndices().data(),
                                                                m_gpuNaturalToReorder.data(),
                                                                m_gpuMatrixReordered->N(),
-                                                               m_moveThreadBlockSize);
+                                                               moveThreadBlockSize);
     }
 
     int levelStartIdx = 0;
     for (int level = 0; level < m_levelSets.size(); ++level) {
         const int numOfRowsInLevel = m_levelSets[level].size();
         if (m_splitMatrix) {
             detail::DILU::computeDiluDiagonalSplit<field_type, blocksize_>(
                 m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                 m_gpuMatrixReorderedLower->getRowIndices().data(),
                 m_gpuMatrixReorderedLower->getColumnIndices().data(),
                 m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
                 m_gpuMatrixReorderedUpper->getRowIndices().data(),
                 m_gpuMatrixReorderedUpper->getColumnIndices().data(),
                 m_gpuMatrixReorderedDiag->data(),
                 m_gpuReorderToNatural.data(),
                 m_gpuNaturalToReorder.data(),
                 levelStartIdx,
                 numOfRowsInLevel,
                 m_gpuDInv.data(),
-                m_DILUFactorizationThreadBlockSize);
+                factorizationBlockSize);
         } else {
             detail::DILU::computeDiluDiagonal<field_type, blocksize_>(
                 m_gpuMatrixReordered->getNonZeroValues().data(),
                 m_gpuMatrixReordered->getRowIndices().data(),
                 m_gpuMatrixReordered->getColumnIndices().data(),
                 m_gpuReorderToNatural.data(),
                 m_gpuNaturalToReorder.data(),
                 levelStartIdx,
                 numOfRowsInLevel,
                 m_gpuDInv.data(),
-                m_DILUFactorizationThreadBlockSize);
+                factorizationBlockSize);
         }
         levelStartIdx += numOfRowsInLevel;
     }
 }
@@ -251,21 +265,31 @@ template <class M, class X, class Y, int l>
 void
 CuDILU<M, X, Y, l>::tuneThreadBlockSizes()
 {
-    using CuDILUType = std::remove_reference_t<decltype(*this)>;
-
     // tune the thread-block size of the update function
-    auto updateFunc = std::bind(&CuDILUType::update, this);
-    detail::tuneThreadBlockSize(updateFunc, m_moveThreadBlockSize);
-    detail::tuneThreadBlockSize(updateFunc, m_DILUFactorizationThreadBlockSize);
+    auto tuneMoveThreadBlockSizeInUpdate = [this](int moveThreadBlockSize){
+        this->update(moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
+    };
+    m_moveThreadBlockSize = detail::tuneThreadBlockSize(tuneMoveThreadBlockSizeInUpdate);
+
+    auto tuneFactorizationThreadBlockSizeInUpdate = [this](int factorizationThreadBlockSize){
+        this->update(m_moveThreadBlockSize, factorizationThreadBlockSize);
+    };
+    m_DILUFactorizationThreadBlockSize = detail::tuneThreadBlockSize(tuneFactorizationThreadBlockSizeInUpdate);
 
     // tune the thread-block size of the apply
-    auto applyFunc = std::bind(&CuDILUType::apply, this, std::placeholders::_1, std::placeholders::_1);
     CuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
     CuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
     tmpD = 1;
 
-    detail::tuneThreadBlockSize(applyFunc, m_lowerSolveThreadBlockSize, tmpV, tmpD);
-    detail::tuneThreadBlockSize(applyFunc, m_upperSolveThreadBlockSize, tmpV, tmpD);
+    auto tuneLowerSolveThreadBlockSizeInApply = [this, &tmpV, &tmpD](int lowerSolveThreadBlockSize){
+        this->apply(tmpV, tmpD, lowerSolveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
+    };
+    m_lowerSolveThreadBlockSize = detail::tuneThreadBlockSize(tuneLowerSolveThreadBlockSizeInApply);
+
+    auto tuneUpperSolveThreadBlockSizeInApply = [this, &tmpV, &tmpD](int upperSolveThreadBlockSize){
+        this->apply(tmpV, tmpD, m_moveThreadBlockSize, upperSolveThreadBlockSize);
+    };
+    m_upperSolveThreadBlockSize = detail::tuneThreadBlockSize(tuneUpperSolveThreadBlockSizeInApply);
 }
 
 } // namespace Opm::cuistl
@@ -80,7 +80,7 @@ public:
     void update() final;
 
     //! \brief Compute the diagonal of the DILU, and update the data of the reordered matrix
-    void computeDiagAndMoveReorderedData();
+    void computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationThreadBlockSize);
 
     //! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
     void tuneThreadBlockSizes();
@@ -100,6 +100,10 @@ public:
 
 
 private:
+    //! \brief Apply the preconditioner.
+    void apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int upperSolveThreadBlockSize);
+    //! \brief Updates the matrix data.
+    void update(int moveThreadBlockSize, int factorizationThreadBlockSize);
     //! \brief Reference to the underlying matrix
     const M& m_cpuMatrix;
     //! \brief size_t describing the dimensions of the square block elements
@@ -79,7 +79,7 @@ OpmCuILU0<M, X, Y, l>::OpmCuILU0(const M& A, bool splitMatrix, bool tuneKernels)
         m_gpuReorderedLU = detail::createReorderedMatrix<M, field_type, CuSparseMatrix<field_type>>(
             m_cpuMatrix, m_reorderedToNatural);
     }
-    LUFactorizeAndMoveData();
+    LUFactorizeAndMoveData(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
 
     if (m_tuneThreadBlockSizes) {
         tuneThreadBlockSizes();
@@ -98,59 +98,66 @@ OpmCuILU0<M, X, Y, l>::apply(X& v, const Y& d)
 {
     OPM_TIMEBLOCK(prec_apply);
     {
-        int levelStartIdx = 0;
-        for (int level = 0; level < m_levelSets.size(); ++level) {
-            const int numOfRowsInLevel = m_levelSets[level].size();
-            if (m_splitMatrix) {
-                detail::ILU0::solveLowerLevelSetSplit<field_type, blocksize_>(
-                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
-                    m_gpuMatrixReorderedLower->getRowIndices().data(),
-                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
-                    m_gpuReorderToNatural.data(),
-                    levelStartIdx,
-                    numOfRowsInLevel,
-                    d.data(),
-                    v.data(),
-                    m_lowerSolveThreadBlockSize);
-            } else {
-                detail::ILU0::solveLowerLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
-                                                                         m_gpuReorderedLU->getRowIndices().data(),
-                                                                         m_gpuReorderedLU->getColumnIndices().data(),
-                                                                         m_gpuReorderToNatural.data(),
-                                                                         levelStartIdx,
-                                                                         numOfRowsInLevel,
-                                                                         d.data(),
-                                                                         v.data(),
-                                                                         m_lowerSolveThreadBlockSize);
-            }
-            levelStartIdx += numOfRowsInLevel;
-        }
-
-        levelStartIdx = m_cpuMatrix.N();
-        for (int level = m_levelSets.size() - 1; level >= 0; --level) {
-            const int numOfRowsInLevel = m_levelSets[level].size();
-            levelStartIdx -= numOfRowsInLevel;
-            if (m_splitMatrix) {
-                detail::ILU0::solveUpperLevelSetSplit<field_type, blocksize_>(
-                    m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
-                    m_gpuMatrixReorderedUpper->getRowIndices().data(),
-                    m_gpuMatrixReorderedUpper->getColumnIndices().data(),
-                    m_gpuReorderToNatural.data(),
-                    levelStartIdx,
-                    numOfRowsInLevel,
-                    m_gpuMatrixReorderedDiag.value().data(),
-                    v.data(),
-                    m_upperSolveThreadBlockSize);
-            } else {
-                detail::ILU0::solveUpperLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
-                                                                         m_gpuReorderedLU->getRowIndices().data(),
-                                                                         m_gpuReorderedLU->getColumnIndices().data(),
-                                                                         m_gpuReorderToNatural.data(),
-                                                                         levelStartIdx,
-                                                                         numOfRowsInLevel,
-                                                                         v.data(),
-                                                                         m_upperSolveThreadBlockSize);
-            }
-        }
+        apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
     }
 }
 
+template <class M, class X, class Y, int l>
+void
+OpmCuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int upperSolveThreadBlockSize)
+{
+    int levelStartIdx = 0;
+    for (int level = 0; level < m_levelSets.size(); ++level) {
+        const int numOfRowsInLevel = m_levelSets[level].size();
+        if (m_splitMatrix) {
+            detail::ILU0::solveLowerLevelSetSplit<field_type, blocksize_>(
+                m_gpuMatrixReorderedLower->getNonZeroValues().data(),
+                m_gpuMatrixReorderedLower->getRowIndices().data(),
+                m_gpuMatrixReorderedLower->getColumnIndices().data(),
+                m_gpuReorderToNatural.data(),
+                levelStartIdx,
+                numOfRowsInLevel,
+                d.data(),
+                v.data(),
+                lowerSolveThreadBlockSize);
+        } else {
+            detail::ILU0::solveLowerLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
+                                                                     m_gpuReorderedLU->getRowIndices().data(),
+                                                                     m_gpuReorderedLU->getColumnIndices().data(),
+                                                                     m_gpuReorderToNatural.data(),
+                                                                     levelStartIdx,
+                                                                     numOfRowsInLevel,
+                                                                     d.data(),
+                                                                     v.data(),
+                                                                     lowerSolveThreadBlockSize);
+        }
+        levelStartIdx += numOfRowsInLevel;
+    }
+
+    levelStartIdx = m_cpuMatrix.N();
+    for (int level = m_levelSets.size() - 1; level >= 0; --level) {
+        const int numOfRowsInLevel = m_levelSets[level].size();
+        levelStartIdx -= numOfRowsInLevel;
+        if (m_splitMatrix) {
+            detail::ILU0::solveUpperLevelSetSplit<field_type, blocksize_>(
+                m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
+                m_gpuMatrixReorderedUpper->getRowIndices().data(),
+                m_gpuMatrixReorderedUpper->getColumnIndices().data(),
+                m_gpuReorderToNatural.data(),
+                levelStartIdx,
+                numOfRowsInLevel,
+                m_gpuMatrixReorderedDiag.value().data(),
+                v.data(),
+                upperSolveThreadBlockSize);
+        } else {
+            detail::ILU0::solveUpperLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
+                                                                     m_gpuReorderedLU->getRowIndices().data(),
+                                                                     m_gpuReorderedLU->getColumnIndices().data(),
+                                                                     m_gpuReorderToNatural.data(),
+                                                                     levelStartIdx,
+                                                                     numOfRowsInLevel,
+                                                                     v.data(),
+                                                                     upperSolveThreadBlockSize);
+        }
+    }
+}
@@ -174,70 +181,76 @@ OpmCuILU0<M, X, Y, l>::update()
 {
     OPM_TIMEBLOCK(prec_update);
     {
-        m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
-        LUFactorizeAndMoveData();
+        update(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
     }
 }
 
 template <class M, class X, class Y, int l>
 void
-OpmCuILU0<M, X, Y, l>::LUFactorizeAndMoveData()
+OpmCuILU0<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationThreadBlockSize)
+{
+    OPM_TIMEBLOCK(prec_update);
+    {
+        m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
+        LUFactorizeAndMoveData(moveThreadBlockSize, factorizationThreadBlockSize);
+    }
+}
+
+template <class M, class X, class Y, int l>
+void
+OpmCuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize)
 {
     if (m_splitMatrix) {
         detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
             m_gpuMatrix.getNonZeroValues().data(),
             m_gpuMatrix.getRowIndices().data(),
             m_gpuMatrix.getColumnIndices().data(),
             m_gpuMatrixReorderedLower->getNonZeroValues().data(),
             m_gpuMatrixReorderedLower->getRowIndices().data(),
             m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
             m_gpuMatrixReorderedUpper->getRowIndices().data(),
             m_gpuMatrixReorderedDiag.value().data(),
             m_gpuNaturalToReorder.data(),
             m_gpuMatrixReorderedLower->N(),
-            m_moveThreadBlockSize);
+            moveThreadBlockSize);
     } else {
         detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(),
                                                                m_gpuMatrix.getRowIndices().data(),
                                                                m_gpuReorderedLU->getNonZeroValues().data(),
                                                                m_gpuReorderedLU->getRowIndices().data(),
                                                                m_gpuNaturalToReorder.data(),
                                                                m_gpuReorderedLU->N(),
-                                                               m_moveThreadBlockSize);
+                                                               moveThreadBlockSize);
     }
     int levelStartIdx = 0;
     for (int level = 0; level < m_levelSets.size(); ++level) {
         const int numOfRowsInLevel = m_levelSets[level].size();
 
         if (m_splitMatrix) {
             detail::ILU0::LUFactorizationSplit<field_type, blocksize_>(
                 m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                 m_gpuMatrixReorderedLower->getRowIndices().data(),
                 m_gpuMatrixReorderedLower->getColumnIndices().data(),
                 m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
                 m_gpuMatrixReorderedUpper->getRowIndices().data(),
                 m_gpuMatrixReorderedUpper->getColumnIndices().data(),
                 m_gpuMatrixReorderedDiag.value().data(),
                 m_gpuReorderToNatural.data(),
                 m_gpuNaturalToReorder.data(),
                 levelStartIdx,
                 numOfRowsInLevel,
-                m_ILU0FactorizationThreadBlockSize);
+                factorizationThreadBlockSize);
 
         } else {
             detail::ILU0::LUFactorization<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
                                                                   m_gpuReorderedLU->getRowIndices().data(),
                                                                   m_gpuReorderedLU->getColumnIndices().data(),
                                                                   m_gpuNaturalToReorder.data(),
                                                                   m_gpuReorderToNatural.data(),
                                                                   numOfRowsInLevel,
                                                                   levelStartIdx,
-                                                                  m_ILU0FactorizationThreadBlockSize);
+                                                                  factorizationThreadBlockSize);
         }
         levelStartIdx += numOfRowsInLevel;
     }
 }
@@ -245,21 +258,31 @@ template <class M, class X, class Y, int l>
 void
 OpmCuILU0<M, X, Y, l>::tuneThreadBlockSizes()
 {
-    using CuDILUType = std::remove_reference_t<decltype(*this)>;
-
     // tune the thread-block size of the update function
-    auto updateFunc = std::bind(&CuDILUType::update, this);
-    detail::tuneThreadBlockSize(updateFunc, m_moveThreadBlockSize);
-    detail::tuneThreadBlockSize(updateFunc, m_ILU0FactorizationThreadBlockSize);
+    auto tuneMoveThreadBlockSizeInUpdate = [this](int moveThreadBlockSize){
+        this->update(moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
+    };
+    m_moveThreadBlockSize = detail::tuneThreadBlockSize(tuneMoveThreadBlockSizeInUpdate);
+
+    auto tuneFactorizationThreadBlockSizeInUpdate = [this](int factorizationThreadBlockSize){
+        this->update(m_moveThreadBlockSize, factorizationThreadBlockSize);
+    };
+    m_ILU0FactorizationThreadBlockSize = detail::tuneThreadBlockSize(tuneFactorizationThreadBlockSizeInUpdate);
 
     // tune the thread-block size of the apply
-    auto applyFunc = std::bind(&CuDILUType::apply, this, std::placeholders::_1, std::placeholders::_1);
     CuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
     CuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
     tmpD = 1;
 
-    detail::tuneThreadBlockSize(applyFunc, m_lowerSolveThreadBlockSize, tmpV, tmpD);
-    detail::tuneThreadBlockSize(applyFunc, m_upperSolveThreadBlockSize, tmpV, tmpD);
+    auto tuneLowerSolveThreadBlockSizeInApply = [this, &tmpV, &tmpD](int lowerSolveThreadBlockSize){
+        this->apply(tmpV, tmpD, lowerSolveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
+    };
+    m_lowerSolveThreadBlockSize = detail::tuneThreadBlockSize(tuneLowerSolveThreadBlockSizeInApply);
+
+    auto tuneUpperSolveThreadBlockSizeInApply = [this, &tmpV, &tmpD](int upperSolveThreadBlockSize){
+        this->apply(tmpV, tmpD, m_moveThreadBlockSize, upperSolveThreadBlockSize);
+    };
+    m_upperSolveThreadBlockSize = detail::tuneThreadBlockSize(tuneUpperSolveThreadBlockSizeInApply);
 }
 
 } // namespace Opm::cuistl
@@ -82,7 +82,7 @@ public:
     void update() final;
 
     //! \brief Compute LU factorization, and update the data of the reordered matrix
-    void LUFactorizeAndMoveData();
+    void LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize);
 
     //! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
     void tuneThreadBlockSizes();
@@ -101,6 +101,10 @@ public:
 
 
 private:
+    //! \brief Apply the preconditioner.
+    void apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int upperSolveThreadBlockSize);
+    //! \brief Updates the matrix data.
+    void update(int moveThreadBlockSize, int factorizationThreadBlockSize);
     //! \brief Reference to the underlying matrix
     const M& m_cpuMatrix;
     //! \brief size_t describing the dimensions of the square block elements
@@ -29,11 +29,9 @@ namespace Opm::cuistl::detail
 /// @brief Function that tests the best thread block size, assumes updating the reference will affect runtimes
 /// @tparam func function to tune
-/// @tparam ...Args types of the arguments needed to call the function
-/// @param f the function to tune
-/// @param threadBlockSize reference to the thread block size that will affect kernel executions
-/// @param ...args arguments needed by the function
+/// @param f the function to tune, which takes the thread block size as the input
 template <typename func, typename... Args>
-void tuneThreadBlockSize(func f, int& threadBlockSize, Args&&... args) {
+int tuneThreadBlockSize(func& f) {
 
     // TODO: figure out a more rigorous way of deciding how many runs will suffice?
     constexpr const int runs = 2;
@@ -51,13 +49,11 @@ namespace Opm::cuistl::detail
 
     // try each possible blocksize
     for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval){
-        // update the blocksize
-        threadBlockSize = thrBlockSize;
-
         // record a first event, and then an event after each kernel
        OPM_CUDA_SAFE_CALL(cudaEventRecord(events[0]));
         for (int i = 0; i < runs; ++i){
-            f(std::forward<Args>(args)...); // runs an arbitrary function with the provided arguments
+            f(thrBlockSize); // runs an arbitrary function with the provided arguments
             OPM_CUDA_SAFE_CALL(cudaEventRecord(events[i+1]));
         }
 
@@ -80,7 +76,7 @@ namespace Opm::cuistl::detail
     // TODO: have this only be printed if linear solver verbosity >0?
     printf("best size: %d, best time %f\n", bestBlockSize, bestTime);
 
-    threadBlockSize = bestBlockSize;
+    return bestBlockSize;
 }
 
 } // end namespace Opm::cuistl::detail
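To make the new contract concrete, a hedged usage sketch of the reworked tuner: the caller wraps a kernel launch in a lambda whose single parameter is the candidate block size, then stores the returned winner. `launchSomeKernel` is a hypothetical placeholder, not part of this diff; only the `tuneThreadBlockSize(f)` shape comes from the change above, and the sketch assumes the autotuner header and a CUDA-capable build.

```cpp
// Usage sketch under stated assumptions: requires the autotuner header from
// this diff; launchSomeKernel is a made-up stand-in for a tuned kernel wrapper.
void launchSomeKernel(int threadBlockSize); // hypothetical kernel launcher

void tuneSomeKernel()
{
    auto candidate = [](int threadBlockSize) {
        launchSomeKernel(threadBlockSize); // depends only on the block size
    };
    const int best = Opm::cuistl::detail::tuneThreadBlockSize(candidate);
    launchSomeKernel(best); // later launches reuse the tuned size
}
```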