Mirror of https://github.com/OPM/opm-simulators.git, synced 2025-02-25 18:55:30 -06:00
Merge pull request #5451 from multitalentloes/generalize_thread_block_tuner
Generalize thread block tuner
This commit is contained in: commit c4f686227b
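Summary of the diff below: the hand-rolled block-size search duplicated in CuDILU::tuneThreadBlockSizes() and OpmCuILU0::tuneThreadBlockSizes() is replaced by a single generic helper, detail::tuneThreadBlockSize(), added in opm/simulators/linalg/cuistl/detail/autotuner.hpp. apply() and update() gain private overloads that take explicit thread block sizes, and each preconditioner now tunes four sizes independently (lower solve, upper solve, data move, factorization) instead of a single apply size and a single update size. Timing moves from std::chrono around cudaDeviceSynchronize() to CUDA events, and the #ifdef USE_HIP guards around the tuning calls are dropped, so tuning runs on both CUDA and HIP builds whenever kernel tuning is requested. The recurring pattern, quoted from the diff below:

    auto tuneMoveThreadBlockSizeInUpdate = [this](int moveThreadBlockSize) {
        this->update(moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
    };
    m_moveThreadBlockSize = detail::tuneThreadBlockSize(tuneMoveThreadBlockSizeInUpdate,
                                                        "Kernel moving data to reordered matrix");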
@@ -224,6 +224,7 @@ if (HAVE_CUDA)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg set_device.cpp)

   # HEADERS
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/autotuner.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/coloringAndReorderingUtils.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cuda_safe_call.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cusparse_matrix_operations.hpp)
@@ -25,6 +25,7 @@
 #include <opm/common/ErrorMacros.hpp>
 #include <opm/common/TimingMacros.hpp>
 #include <opm/simulators/linalg/GraphColoring.hpp>
+#include <opm/simulators/linalg/cuistl/detail/autotuner.hpp>
 #include <opm/simulators/linalg/cuistl/CuDILU.hpp>
 #include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
 #include <opm/simulators/linalg/cuistl/CuVector.hpp>
@@ -33,6 +34,9 @@
 #include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/DILUKernels.hpp>
 #include <opm/simulators/linalg/matrixblock.hh>
 #include <tuple>
+#include <functional>
+#include <utility>
+#include <string>
 namespace Opm::cuistl
 {

@@ -71,18 +75,16 @@ CuDILU<M, X, Y, l>::CuDILU(const M& A, bool splitMatrix, bool tuneKernels)
         std::tie(m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper)
             = detail::extractLowerAndUpperMatrices<M, field_type, CuSparseMatrix<field_type>>(m_cpuMatrix,
                                                                                               m_reorderedToNatural);
+    }
+    else {
         m_gpuMatrixReordered = detail::createReorderedMatrix<M, field_type, CuSparseMatrix<field_type>>(
             m_cpuMatrix, m_reorderedToNatural);
     }
-    computeDiagAndMoveReorderedData();
+    computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);

-    // HIP does currently not support automtically picking thread block sizes as well as CUDA
-    // So only when tuning and using hip should we do our own manual tuning
-#ifdef USE_HIP
     if (m_tuneThreadBlockSizes) {
         tuneThreadBlockSizes();
     }
-#endif
 }

 template <class M, class X, class Y, int l>
@@ -97,69 +99,77 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
 {
     OPM_TIMEBLOCK(prec_apply);
     {
-        int levelStartIdx = 0;
-        for (int level = 0; level < m_levelSets.size(); ++level) {
-            const int numOfRowsInLevel = m_levelSets[level].size();
-            if (m_splitMatrix) {
-                detail::DILU::solveLowerLevelSetSplit<field_type, blocksize_>(
-                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
-                    m_gpuMatrixReorderedLower->getRowIndices().data(),
-                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
-                    m_gpuReorderToNatural.data(),
-                    levelStartIdx,
-                    numOfRowsInLevel,
-                    m_gpuDInv.data(),
-                    d.data(),
-                    v.data(),
-                    m_applyThreadBlockSize);
-            } else {
-                detail::DILU::solveLowerLevelSet<field_type, blocksize_>(
-                    m_gpuMatrixReordered->getNonZeroValues().data(),
-                    m_gpuMatrixReordered->getRowIndices().data(),
-                    m_gpuMatrixReordered->getColumnIndices().data(),
-                    m_gpuReorderToNatural.data(),
-                    levelStartIdx,
-                    numOfRowsInLevel,
-                    m_gpuDInv.data(),
-                    d.data(),
-                    v.data(),
-                    m_applyThreadBlockSize);
-            }
-            levelStartIdx += numOfRowsInLevel;
-        }
-
-        levelStartIdx = m_cpuMatrix.N();
-        // upper triangular solve: (D + U_A) v = Dy
-        for (int level = m_levelSets.size() - 1; level >= 0; --level) {
-            const int numOfRowsInLevel = m_levelSets[level].size();
-            levelStartIdx -= numOfRowsInLevel;
-            if (m_splitMatrix) {
-                detail::DILU::solveUpperLevelSetSplit<field_type, blocksize_>(
-                    m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
-                    m_gpuMatrixReorderedUpper->getRowIndices().data(),
-                    m_gpuMatrixReorderedUpper->getColumnIndices().data(),
-                    m_gpuReorderToNatural.data(),
-                    levelStartIdx,
-                    numOfRowsInLevel,
-                    m_gpuDInv.data(),
-                    v.data(),
-                    m_applyThreadBlockSize);
-            } else {
-                detail::DILU::solveUpperLevelSet<field_type, blocksize_>(
-                    m_gpuMatrixReordered->getNonZeroValues().data(),
-                    m_gpuMatrixReordered->getRowIndices().data(),
-                    m_gpuMatrixReordered->getColumnIndices().data(),
-                    m_gpuReorderToNatural.data(),
-                    levelStartIdx,
-                    numOfRowsInLevel,
-                    m_gpuDInv.data(),
-                    v.data(),
-                    m_applyThreadBlockSize);
-            }
+        apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+    }
+}
+
+template <class M, class X, class Y, int l>
+void
+CuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int upperSolveThreadBlockSize)
+{
+    int levelStartIdx = 0;
+    for (int level = 0; level < m_levelSets.size(); ++level) {
+        const int numOfRowsInLevel = m_levelSets[level].size();
+        if (m_splitMatrix) {
+            detail::DILU::solveLowerLevelSetSplit<field_type, blocksize_>(
+                m_gpuMatrixReorderedLower->getNonZeroValues().data(),
+                m_gpuMatrixReorderedLower->getRowIndices().data(),
+                m_gpuMatrixReorderedLower->getColumnIndices().data(),
+                m_gpuReorderToNatural.data(),
+                levelStartIdx,
+                numOfRowsInLevel,
+                m_gpuDInv.data(),
+                d.data(),
+                v.data(),
+                lowerSolveThreadBlockSize);
+        } else {
+            detail::DILU::solveLowerLevelSet<field_type, blocksize_>(
+                m_gpuMatrixReordered->getNonZeroValues().data(),
+                m_gpuMatrixReordered->getRowIndices().data(),
+                m_gpuMatrixReordered->getColumnIndices().data(),
+                m_gpuReorderToNatural.data(),
+                levelStartIdx,
+                numOfRowsInLevel,
+                m_gpuDInv.data(),
+                d.data(),
+                v.data(),
+                lowerSolveThreadBlockSize);
+        }
+        levelStartIdx += numOfRowsInLevel;
+    }
+
+    levelStartIdx = m_cpuMatrix.N();
+    // upper triangular solve: (D + U_A) v = Dy
+    for (int level = m_levelSets.size() - 1; level >= 0; --level) {
+        const int numOfRowsInLevel = m_levelSets[level].size();
+        levelStartIdx -= numOfRowsInLevel;
+        if (m_splitMatrix) {
+            detail::DILU::solveUpperLevelSetSplit<field_type, blocksize_>(
+                m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
+                m_gpuMatrixReorderedUpper->getRowIndices().data(),
+                m_gpuMatrixReorderedUpper->getColumnIndices().data(),
+                m_gpuReorderToNatural.data(),
+                levelStartIdx,
+                numOfRowsInLevel,
+                m_gpuDInv.data(),
+                v.data(),
+                upperSolveThreadBlockSize);
+        } else {
+            detail::DILU::solveUpperLevelSet<field_type, blocksize_>(
+                m_gpuMatrixReordered->getNonZeroValues().data(),
+                m_gpuMatrixReordered->getRowIndices().data(),
+                m_gpuMatrixReordered->getColumnIndices().data(),
+                m_gpuReorderToNatural.data(),
+                levelStartIdx,
+                numOfRowsInLevel,
+                m_gpuDInv.data(),
+                v.data(),
+                upperSolveThreadBlockSize);
+        }
         }
     }
 }


 template <class M, class X, class Y, int l>
 void
 CuDILU<M, X, Y, l>::post([[maybe_unused]] X& x)
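The hunk above splits the public apply(v, d) into a thin wrapper plus a private overload that takes the two solve block sizes explicitly, so the autotuner can probe candidate sizes without mutating the stored members. As the code comment notes, the two level-set sweeps apply the DILU preconditioner M = (D + L_A) D^{-1} (D + U_A): a lower triangular solve (D + L_A) y = d followed by the upper triangular solve (D + U_A) v = D y. A standalone sketch of the wrapper/overload pattern, with hypothetical names (not OPM code):

    #include <vector>

    // The public entry point forwards the stored (tuned) sizes, while the
    // private overload lets a tuner pass arbitrary candidate sizes.
    class Smoother
    {
    public:
        void apply(std::vector<double>& v, const std::vector<double>& d)
        {
            apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
        }

    private:
        void apply(std::vector<double>& v, const std::vector<double>& d,
                   int lowerSolveThreadBlockSize, int upperSolveThreadBlockSize)
        {
            // launch the lower/upper triangular-solve kernels with the given block sizes here
        }

        int m_lowerSolveThreadBlockSize = -1; // -1 means "not tuned yet"
        int m_upperSolveThreadBlockSize = -1;
    };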
@@ -179,72 +189,76 @@ CuDILU<M, X, Y, l>::update()
 {
     OPM_TIMEBLOCK(prec_update);
     {
-        m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
-        computeDiagAndMoveReorderedData();
+        update(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
     }
 }

 template <class M, class X, class Y, int l>
 void
-CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData()
+CuDILU<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationBlockSize)
 {
-    OPM_TIMEBLOCK(prec_update);
-    {
+    m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
+    computeDiagAndMoveReorderedData(moveThreadBlockSize, factorizationBlockSize);
+}
+
+template <class M, class X, class Y, int l>
+void
+CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationBlockSize)
+{
+    if (m_splitMatrix) {
+        detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
+            m_gpuMatrix.getNonZeroValues().data(),
+            m_gpuMatrix.getRowIndices().data(),
+            m_gpuMatrix.getColumnIndices().data(),
+            m_gpuMatrixReorderedLower->getNonZeroValues().data(),
+            m_gpuMatrixReorderedLower->getRowIndices().data(),
+            m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
+            m_gpuMatrixReorderedUpper->getRowIndices().data(),
+            m_gpuMatrixReorderedDiag->data(),
+            m_gpuNaturalToReorder.data(),
+            m_gpuMatrixReorderedLower->N(),
+            moveThreadBlockSize);
+    } else {
+        detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(),
+                                                               m_gpuMatrix.getRowIndices().data(),
+                                                               m_gpuMatrixReordered->getNonZeroValues().data(),
+                                                               m_gpuMatrixReordered->getRowIndices().data(),
+                                                               m_gpuNaturalToReorder.data(),
+                                                               m_gpuMatrixReordered->N(),
+                                                               moveThreadBlockSize);
+    }
+
+    int levelStartIdx = 0;
+    for (int level = 0; level < m_levelSets.size(); ++level) {
+        const int numOfRowsInLevel = m_levelSets[level].size();
         if (m_splitMatrix) {
-            detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
-                m_gpuMatrix.getNonZeroValues().data(),
-                m_gpuMatrix.getRowIndices().data(),
-                m_gpuMatrix.getColumnIndices().data(),
+            detail::DILU::computeDiluDiagonalSplit<field_type, blocksize_>(
                 m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                 m_gpuMatrixReorderedLower->getRowIndices().data(),
+                m_gpuMatrixReorderedLower->getColumnIndices().data(),
                 m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
                 m_gpuMatrixReorderedUpper->getRowIndices().data(),
+                m_gpuMatrixReorderedUpper->getColumnIndices().data(),
                 m_gpuMatrixReorderedDiag->data(),
+                m_gpuReorderToNatural.data(),
                 m_gpuNaturalToReorder.data(),
-                m_gpuMatrixReorderedLower->N(),
-                m_updateThreadBlockSize);
+                levelStartIdx,
+                numOfRowsInLevel,
+                m_gpuDInv.data(),
+                factorizationBlockSize);
         } else {
-            detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(),
-                                                                   m_gpuMatrix.getRowIndices().data(),
-                                                                   m_gpuMatrixReordered->getNonZeroValues().data(),
-                                                                   m_gpuMatrixReordered->getRowIndices().data(),
-                                                                   m_gpuNaturalToReorder.data(),
-                                                                   m_gpuMatrixReordered->N(),
-                                                                   m_updateThreadBlockSize);
-        }
-
-        int levelStartIdx = 0;
-        for (int level = 0; level < m_levelSets.size(); ++level) {
-            const int numOfRowsInLevel = m_levelSets[level].size();
-            if (m_splitMatrix) {
-                detail::DILU::computeDiluDiagonalSplit<field_type, blocksize_>(
-                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
-                    m_gpuMatrixReorderedLower->getRowIndices().data(),
-                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
-                    m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
-                    m_gpuMatrixReorderedUpper->getRowIndices().data(),
-                    m_gpuMatrixReorderedUpper->getColumnIndices().data(),
-                    m_gpuMatrixReorderedDiag->data(),
-                    m_gpuReorderToNatural.data(),
-                    m_gpuNaturalToReorder.data(),
-                    levelStartIdx,
-                    numOfRowsInLevel,
-                    m_gpuDInv.data(),
-                    m_updateThreadBlockSize);
-            } else {
-                detail::DILU::computeDiluDiagonal<field_type, blocksize_>(
-                    m_gpuMatrixReordered->getNonZeroValues().data(),
-                    m_gpuMatrixReordered->getRowIndices().data(),
-                    m_gpuMatrixReordered->getColumnIndices().data(),
-                    m_gpuReorderToNatural.data(),
-                    m_gpuNaturalToReorder.data(),
-                    levelStartIdx,
-                    numOfRowsInLevel,
-                    m_gpuDInv.data(),
-                    m_updateThreadBlockSize);
-            }
-            levelStartIdx += numOfRowsInLevel;
+            detail::DILU::computeDiluDiagonal<field_type, blocksize_>(
+                m_gpuMatrixReordered->getNonZeroValues().data(),
+                m_gpuMatrixReordered->getRowIndices().data(),
+                m_gpuMatrixReordered->getColumnIndices().data(),
+                m_gpuReorderToNatural.data(),
+                m_gpuNaturalToReorder.data(),
+                levelStartIdx,
+                numOfRowsInLevel,
+                m_gpuDInv.data(),
+                factorizationBlockSize);
         }
+        levelStartIdx += numOfRowsInLevel;
     }
 }

@@ -252,54 +266,31 @@ template <class M, class X, class Y, int l>
 void
 CuDILU<M, X, Y, l>::tuneThreadBlockSizes()
 {
-    // TODO: generalize this code and put it somewhere outside of this class
-    long long bestApplyTime = std::numeric_limits<long long>::max();
-    long long bestUpdateTime = std::numeric_limits<long long>::max();
-    int bestApplyBlockSize = -1;
-    int bestUpdateBlockSize = -1;
-    int interval = 64;
+    // tune the thread-block size of the update function
+    auto tuneMoveThreadBlockSizeInUpdate = [this](int moveThreadBlockSize) {
+        this->update(moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
+    };
+    m_moveThreadBlockSize = detail::tuneThreadBlockSize(tuneMoveThreadBlockSizeInUpdate, "Kernel moving data to reordered matrix");

-    // temporary buffers for the apply
+    auto tuneFactorizationThreadBlockSizeInUpdate = [this](int factorizationThreadBlockSize) {
+        this->update(m_moveThreadBlockSize, factorizationThreadBlockSize);
+    };
+    m_DILUFactorizationThreadBlockSize = detail::tuneThreadBlockSize(tuneFactorizationThreadBlockSizeInUpdate, "Kernel computing DILU factorization");
+
+    // tune the thread-block size of the apply
     CuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
     CuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
     tmpD = 1;

-    for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval) {
-        // sometimes the first kernel launch kan be slower, so take the time twice
-        for (int i = 0; i < 2; ++i) {
-
-            auto beforeUpdate = std::chrono::high_resolution_clock::now();
-            m_updateThreadBlockSize = thrBlockSize;
-            update();
-            std::ignore = cudaDeviceSynchronize();
-            auto afterUpdate = std::chrono::high_resolution_clock::now();
-            if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid
-                long long durationInMicroSec
-                    = std::chrono::duration_cast<std::chrono::microseconds>(afterUpdate - beforeUpdate).count();
-                if (durationInMicroSec < bestUpdateTime) {
-                    bestUpdateTime = durationInMicroSec;
-                    bestUpdateBlockSize = thrBlockSize;
-                }
-            }
-
-            auto beforeApply = std::chrono::high_resolution_clock::now();
-            m_applyThreadBlockSize = thrBlockSize;
-            apply(tmpV, tmpD);
-            std::ignore = cudaDeviceSynchronize();
-            auto afterApply = std::chrono::high_resolution_clock::now();
-            if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid
-                long long durationInMicroSec
-                    = std::chrono::duration_cast<std::chrono::microseconds>(afterApply - beforeApply).count();
-                if (durationInMicroSec < bestApplyTime) {
-                    bestApplyTime = durationInMicroSec;
-                    bestApplyBlockSize = thrBlockSize;
-                }
-            }
-        }
-    }
-
-    m_applyThreadBlockSize = bestApplyBlockSize;
-    m_updateThreadBlockSize = bestUpdateBlockSize;
+    auto tuneLowerSolveThreadBlockSizeInApply = [this, &tmpV, &tmpD](int lowerSolveThreadBlockSize) {
+        this->apply(tmpV, tmpD, lowerSolveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
+    };
+    m_lowerSolveThreadBlockSize = detail::tuneThreadBlockSize(tuneLowerSolveThreadBlockSizeInApply, "Kernel computing a lower triangular solve for a level set");
+
+    auto tuneUpperSolveThreadBlockSizeInApply = [this, &tmpV, &tmpD](int upperSolveThreadBlockSize) {
+        this->apply(tmpV, tmpD, m_moveThreadBlockSize, upperSolveThreadBlockSize);
+    };
+    m_upperSolveThreadBlockSize = detail::tuneThreadBlockSize(tuneUpperSolveThreadBlockSizeInApply, "Kernel computing an upper triangular solve for a level set");
 }

 } // namespace Opm::cuistl
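The rewrite above tunes each block size in turn while the others keep their current values, i.e. one pass of coordinate descent over the four parameters, and uses throwaway CuVector buffers (with tmpD set to 1) so apply() can be timed without a real right-hand side. A toy, CPU-only sketch of that scheme (wall-clock timing instead of the CUDA events used by detail::tuneThreadBlockSize; all names hypothetical):

    #include <chrono>
    #include <cmath>
    #include <cstdio>

    // Time run(candidate) for each candidate size and return the fastest one.
    template <typename Func>
    int tuneParameter(Func& run)
    {
        using clock = std::chrono::steady_clock;
        long long bestNanos = -1;
        int best = -1;
        for (int candidate = 64; candidate <= 1024; candidate += 64) {
            const auto before = clock::now();
            run(candidate);
            const auto nanos
                = std::chrono::duration_cast<std::chrono::nanoseconds>(clock::now() - before).count();
            if (bestNanos < 0 || nanos < bestNanos) {
                bestNanos = nanos;
                best = candidate;
            }
        }
        return best;
    }

    int main()
    {
        int lowerSize = 64;
        int upperSize = 64;
        // Stand-in for apply(v, d, lower, upper); here it just burns CPU time.
        auto work = [](int lower, int upper) {
            volatile double s = 0;
            for (int i = 0; i < lower * upper; ++i) {
                s += std::sqrt(double(i));
            }
        };
        // Tune the first parameter while the second stays fixed, then swap.
        auto tuneLower = [&](int candidate) { work(candidate, upperSize); };
        lowerSize = tuneParameter(tuneLower);
        auto tuneUpper = [&](int candidate) { work(lowerSize, candidate); };
        upperSize = tuneParameter(tuneUpper);
        std::printf("tuned sizes: lower=%d upper=%d\n", lowerSize, upperSize);
    }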
@@ -80,7 +80,7 @@ public:
     void update() final;

     //! \brief Compute the diagonal of the DILU, and update the data of the reordered matrix
-    void computeDiagAndMoveReorderedData();
+    void computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationThreadBlockSize);

     //! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
     void tuneThreadBlockSizes();
@@ -104,6 +104,10 @@ public:


 private:
+    //! \brief Apply the preconditioner.
+    void apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int upperSolveThreadBlockSize);
+    //! \brief Updates the matrix data.
+    void update(int moveThreadBlockSize, int factorizationThreadBlockSize);
     //! \brief Reference to the underlying matrix
     const M& m_cpuMatrix;
     //! \brief size_t describing the dimensions of the square block elements
@@ -135,8 +139,10 @@ private:
     bool m_tuneThreadBlockSizes;
     //! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards
     //! The default value of -1 indicates that we have not calibrated and selected a value yet
-    int m_applyThreadBlockSize = -1;
-    int m_updateThreadBlockSize = -1;
+    int m_upperSolveThreadBlockSize = -1;
+    int m_lowerSolveThreadBlockSize = -1;
+    int m_moveThreadBlockSize = -1;
+    int m_DILUFactorizationThreadBlockSize = -1;
 };
 } // end namespace Opm::cuistl

@@ -21,6 +21,7 @@
 #include <dune/common/fmatrix.hh>
 #include <dune/istl/bcrsmatrix.hh>
 #include <fmt/core.h>
+#include <functional>
 #include <limits>
 #include <opm/common/ErrorMacros.hpp>
 #include <opm/common/TimingMacros.hpp>
@@ -28,11 +29,14 @@
 #include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
 #include <opm/simulators/linalg/cuistl/CuVector.hpp>
 #include <opm/simulators/linalg/cuistl/OpmCuILU0.hpp>
+#include <opm/simulators/linalg/cuistl/detail/autotuner.hpp>
 #include <opm/simulators/linalg/cuistl/detail/coloringAndReorderingUtils.hpp>
 #include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
 #include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/ILU0Kernels.hpp>
 #include <opm/simulators/linalg/matrixblock.hh>
+#include <string>
 #include <tuple>
+#include <utility>
 namespace Opm::cuistl
 {

@@ -76,13 +80,11 @@ OpmCuILU0<M, X, Y, l>::OpmCuILU0(const M& A, bool splitMatrix, bool tuneKernels)
         m_gpuReorderedLU = detail::createReorderedMatrix<M, field_type, CuSparseMatrix<field_type>>(
             m_cpuMatrix, m_reorderedToNatural);
     }
-    LUFactorizeAndMoveData();
+    LUFactorizeAndMoveData(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);

-#ifdef USE_HIP
     if (m_tuneThreadBlockSizes) {
         tuneThreadBlockSizes();
     }
-#endif
 }

 template <class M, class X, class Y, int l>
@@ -97,59 +99,66 @@ OpmCuILU0<M, X, Y, l>::apply(X& v, const Y& d)
 {
     OPM_TIMEBLOCK(prec_apply);
     {
-        int levelStartIdx = 0;
-        for (int level = 0; level < m_levelSets.size(); ++level) {
-            const int numOfRowsInLevel = m_levelSets[level].size();
-            if (m_splitMatrix) {
-                detail::ILU0::solveLowerLevelSetSplit<field_type, blocksize_>(
-                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
-                    m_gpuMatrixReorderedLower->getRowIndices().data(),
-                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
-                    m_gpuReorderToNatural.data(),
-                    levelStartIdx,
-                    numOfRowsInLevel,
-                    d.data(),
-                    v.data(),
-                    m_applyThreadBlockSize);
-            } else {
-                detail::ILU0::solveLowerLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
-                                                                         m_gpuReorderedLU->getRowIndices().data(),
-                                                                         m_gpuReorderedLU->getColumnIndices().data(),
-                                                                         m_gpuReorderToNatural.data(),
-                                                                         levelStartIdx,
-                                                                         numOfRowsInLevel,
-                                                                         d.data(),
-                                                                         v.data(),
-                                                                         m_applyThreadBlockSize);
-            }
-            levelStartIdx += numOfRowsInLevel;
-        }
-
-        levelStartIdx = m_cpuMatrix.N();
-        for (int level = m_levelSets.size() - 1; level >= 0; --level) {
-            const int numOfRowsInLevel = m_levelSets[level].size();
-            levelStartIdx -= numOfRowsInLevel;
-            if (m_splitMatrix) {
-                detail::ILU0::solveUpperLevelSetSplit<field_type, blocksize_>(
-                    m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
-                    m_gpuMatrixReorderedUpper->getRowIndices().data(),
-                    m_gpuMatrixReorderedUpper->getColumnIndices().data(),
-                    m_gpuReorderToNatural.data(),
-                    levelStartIdx,
-                    numOfRowsInLevel,
-                    m_gpuMatrixReorderedDiag.value().data(),
-                    v.data(),
-                    m_applyThreadBlockSize);
-            } else {
-                detail::ILU0::solveUpperLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
-                                                                         m_gpuReorderedLU->getRowIndices().data(),
-                                                                         m_gpuReorderedLU->getColumnIndices().data(),
-                                                                         m_gpuReorderToNatural.data(),
-                                                                         levelStartIdx,
-                                                                         numOfRowsInLevel,
-                                                                         v.data(),
-                                                                         m_applyThreadBlockSize);
-            }
+        apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+    }
+}
+
+template <class M, class X, class Y, int l>
+void
+OpmCuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int upperSolveThreadBlockSize)
+{
+    int levelStartIdx = 0;
+    for (int level = 0; level < m_levelSets.size(); ++level) {
+        const int numOfRowsInLevel = m_levelSets[level].size();
+        if (m_splitMatrix) {
+            detail::ILU0::solveLowerLevelSetSplit<field_type, blocksize_>(
+                m_gpuMatrixReorderedLower->getNonZeroValues().data(),
+                m_gpuMatrixReorderedLower->getRowIndices().data(),
+                m_gpuMatrixReorderedLower->getColumnIndices().data(),
+                m_gpuReorderToNatural.data(),
+                levelStartIdx,
+                numOfRowsInLevel,
+                d.data(),
+                v.data(),
+                lowerSolveThreadBlockSize);
+        } else {
+            detail::ILU0::solveLowerLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
+                                                                     m_gpuReorderedLU->getRowIndices().data(),
+                                                                     m_gpuReorderedLU->getColumnIndices().data(),
+                                                                     m_gpuReorderToNatural.data(),
+                                                                     levelStartIdx,
+                                                                     numOfRowsInLevel,
+                                                                     d.data(),
+                                                                     v.data(),
+                                                                     lowerSolveThreadBlockSize);
+        }
+        levelStartIdx += numOfRowsInLevel;
+    }
+
+    levelStartIdx = m_cpuMatrix.N();
+    for (int level = m_levelSets.size() - 1; level >= 0; --level) {
+        const int numOfRowsInLevel = m_levelSets[level].size();
+        levelStartIdx -= numOfRowsInLevel;
+        if (m_splitMatrix) {
+            detail::ILU0::solveUpperLevelSetSplit<field_type, blocksize_>(
+                m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
+                m_gpuMatrixReorderedUpper->getRowIndices().data(),
+                m_gpuMatrixReorderedUpper->getColumnIndices().data(),
+                m_gpuReorderToNatural.data(),
+                levelStartIdx,
+                numOfRowsInLevel,
+                m_gpuMatrixReorderedDiag.value().data(),
+                v.data(),
+                upperSolveThreadBlockSize);
+        } else {
+            detail::ILU0::solveUpperLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
+                                                                     m_gpuReorderedLU->getRowIndices().data(),
+                                                                     m_gpuReorderedLU->getColumnIndices().data(),
+                                                                     m_gpuReorderToNatural.data(),
+                                                                     levelStartIdx,
+                                                                     numOfRowsInLevel,
+                                                                     v.data(),
+                                                                     upperSolveThreadBlockSize);
+        }
         }
     }
 }
@@ -173,70 +182,76 @@ OpmCuILU0<M, X, Y, l>::update()
 {
     OPM_TIMEBLOCK(prec_update);
     {
-        m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
-        LUFactorizeAndMoveData();
+        update(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
     }
 }

 template <class M, class X, class Y, int l>
 void
-OpmCuILU0<M, X, Y, l>::LUFactorizeAndMoveData()
+OpmCuILU0<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationThreadBlockSize)
 {
     OPM_TIMEBLOCK(prec_update);
     {
+        m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
+        LUFactorizeAndMoveData(moveThreadBlockSize, factorizationThreadBlockSize);
+    }
+}
+
+template <class M, class X, class Y, int l>
+void
+OpmCuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize)
+{
+    if (m_splitMatrix) {
+        detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
+            m_gpuMatrix.getNonZeroValues().data(),
+            m_gpuMatrix.getRowIndices().data(),
+            m_gpuMatrix.getColumnIndices().data(),
+            m_gpuMatrixReorderedLower->getNonZeroValues().data(),
+            m_gpuMatrixReorderedLower->getRowIndices().data(),
+            m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
+            m_gpuMatrixReorderedUpper->getRowIndices().data(),
+            m_gpuMatrixReorderedDiag.value().data(),
+            m_gpuNaturalToReorder.data(),
+            m_gpuMatrixReorderedLower->N(),
+            moveThreadBlockSize);
+    } else {
+        detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(),
+                                                               m_gpuMatrix.getRowIndices().data(),
+                                                               m_gpuReorderedLU->getNonZeroValues().data(),
+                                                               m_gpuReorderedLU->getRowIndices().data(),
+                                                               m_gpuNaturalToReorder.data(),
+                                                               m_gpuReorderedLU->N(),
+                                                               moveThreadBlockSize);
+    }
+    int levelStartIdx = 0;
+    for (int level = 0; level < m_levelSets.size(); ++level) {
+        const int numOfRowsInLevel = m_levelSets[level].size();
+
         if (m_splitMatrix) {
-            detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
-                m_gpuMatrix.getNonZeroValues().data(),
-                m_gpuMatrix.getRowIndices().data(),
-                m_gpuMatrix.getColumnIndices().data(),
+            detail::ILU0::LUFactorizationSplit<field_type, blocksize_>(
                 m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                 m_gpuMatrixReorderedLower->getRowIndices().data(),
+                m_gpuMatrixReorderedLower->getColumnIndices().data(),
                 m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
                 m_gpuMatrixReorderedUpper->getRowIndices().data(),
+                m_gpuMatrixReorderedUpper->getColumnIndices().data(),
                 m_gpuMatrixReorderedDiag.value().data(),
+                m_gpuReorderToNatural.data(),
                 m_gpuNaturalToReorder.data(),
-                m_gpuMatrixReorderedLower->N(),
-                m_updateThreadBlockSize);
+                levelStartIdx,
+                numOfRowsInLevel,
+                factorizationThreadBlockSize);
+
         } else {
-            detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(),
-                                                                   m_gpuMatrix.getRowIndices().data(),
-                                                                   m_gpuReorderedLU->getNonZeroValues().data(),
-                                                                   m_gpuReorderedLU->getRowIndices().data(),
-                                                                   m_gpuNaturalToReorder.data(),
-                                                                   m_gpuReorderedLU->N(),
-                                                                   m_updateThreadBlockSize);
-        }
-        int levelStartIdx = 0;
-        for (int level = 0; level < m_levelSets.size(); ++level) {
-            const int numOfRowsInLevel = m_levelSets[level].size();
-
-            if (m_splitMatrix) {
-                detail::ILU0::LUFactorizationSplit<field_type, blocksize_>(
-                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
-                    m_gpuMatrixReorderedLower->getRowIndices().data(),
-                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
-                    m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
-                    m_gpuMatrixReorderedUpper->getRowIndices().data(),
-                    m_gpuMatrixReorderedUpper->getColumnIndices().data(),
-                    m_gpuMatrixReorderedDiag.value().data(),
-                    m_gpuReorderToNatural.data(),
-                    m_gpuNaturalToReorder.data(),
-                    levelStartIdx,
-                    numOfRowsInLevel,
-                    m_updateThreadBlockSize);
-
-            } else {
-                detail::ILU0::LUFactorization<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
-                                                                      m_gpuReorderedLU->getRowIndices().data(),
-                                                                      m_gpuReorderedLU->getColumnIndices().data(),
-                                                                      m_gpuNaturalToReorder.data(),
-                                                                      m_gpuReorderToNatural.data(),
-                                                                      numOfRowsInLevel,
-                                                                      levelStartIdx,
-                                                                      m_updateThreadBlockSize);
-            }
-            levelStartIdx += numOfRowsInLevel;
+            detail::ILU0::LUFactorization<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
+                                                                  m_gpuReorderedLU->getRowIndices().data(),
+                                                                  m_gpuReorderedLU->getColumnIndices().data(),
+                                                                  m_gpuNaturalToReorder.data(),
+                                                                  m_gpuReorderToNatural.data(),
+                                                                  numOfRowsInLevel,
+                                                                  levelStartIdx,
+                                                                  factorizationThreadBlockSize);
         }
+        levelStartIdx += numOfRowsInLevel;
     }
 }

@@ -244,54 +259,34 @@ template <class M, class X, class Y, int l>
 void
 OpmCuILU0<M, X, Y, l>::tuneThreadBlockSizes()
 {
-    // TODO generalize this tuning process in a function separate of the class
-    long long bestApplyTime = std::numeric_limits<long long>::max();
-    long long bestUpdateTime = std::numeric_limits<long long>::max();
-    int bestApplyBlockSize = -1;
-    int bestUpdateBlockSize = -1;
-    int interval = 64;
+    // tune the thread-block size of the update function
+    auto tuneMoveThreadBlockSizeInUpdate
+        = [this](int moveThreadBlockSize) { this->update(moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize); };
+    m_moveThreadBlockSize
+        = detail::tuneThreadBlockSize(tuneMoveThreadBlockSizeInUpdate, "Kernel moving data to reordered matrix");

-    // temporary buffers for the apply
+    auto tuneFactorizationThreadBlockSizeInUpdate = [this](int factorizationThreadBlockSize) {
+        this->update(m_moveThreadBlockSize, factorizationThreadBlockSize);
+    };
+    m_ILU0FactorizationThreadBlockSize
+        = detail::tuneThreadBlockSize(tuneFactorizationThreadBlockSizeInUpdate, "Kernel computing ILU0 factorization");
+
+    // tune the thread-block size of the apply
     CuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
     CuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
     tmpD = 1;

-    for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval) {
-        // sometimes the first kernel launch kan be slower, so take the time twice
-        for (int i = 0; i < 2; ++i) {
-
-            auto beforeUpdate = std::chrono::high_resolution_clock::now();
-            m_updateThreadBlockSize = thrBlockSize;
-            update();
-            std::ignore = cudaDeviceSynchronize();
-            auto afterUpdate = std::chrono::high_resolution_clock::now();
-            if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid
-                long long durationInMicroSec
-                    = std::chrono::duration_cast<std::chrono::microseconds>(afterUpdate - beforeUpdate).count();
-                if (durationInMicroSec < bestUpdateTime) {
-                    bestUpdateTime = durationInMicroSec;
-                    bestUpdateBlockSize = thrBlockSize;
-                }
-            }
-
-            auto beforeApply = std::chrono::high_resolution_clock::now();
-            m_applyThreadBlockSize = thrBlockSize;
-            apply(tmpV, tmpD);
-            std::ignore = cudaDeviceSynchronize();
-            auto afterApply = std::chrono::high_resolution_clock::now();
-            if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid
-                long long durationInMicroSec
-                    = std::chrono::duration_cast<std::chrono::microseconds>(afterApply - beforeApply).count();
-                if (durationInMicroSec < bestApplyTime) {
-                    bestApplyTime = durationInMicroSec;
-                    bestApplyBlockSize = thrBlockSize;
-                }
-            }
-        }
-    }
-
-    m_applyThreadBlockSize = bestApplyBlockSize;
-    m_updateThreadBlockSize = bestUpdateBlockSize;
+    auto tuneLowerSolveThreadBlockSizeInApply = [this, &tmpV, &tmpD](int lowerSolveThreadBlockSize) {
+        this->apply(tmpV, tmpD, lowerSolveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
+    };
+    m_lowerSolveThreadBlockSize = detail::tuneThreadBlockSize(
+        tuneLowerSolveThreadBlockSizeInApply, "Kernel computing a lower triangular solve for a level set");
+
+    auto tuneUpperSolveThreadBlockSizeInApply = [this, &tmpV, &tmpD](int upperSolveThreadBlockSize) {
+        this->apply(tmpV, tmpD, m_moveThreadBlockSize, upperSolveThreadBlockSize);
+    };
+    m_upperSolveThreadBlockSize = detail::tuneThreadBlockSize(
+        tuneUpperSolveThreadBlockSizeInApply, "Kernel computing an upper triangular solve for a level set");
 }

 } // namespace Opm::cuistl
@@ -82,7 +82,7 @@ public:
     void update() final;

     //! \brief Compute LU factorization, and update the data of the reordered matrix
-    void LUFactorizeAndMoveData();
+    void LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize);

     //! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
     void tuneThreadBlockSizes();
@@ -105,6 +105,10 @@ public:


 private:
+    //! \brief Apply the preconditioner.
+    void apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int upperSolveThreadBlockSize);
+    //! \brief Updates the matrix data.
+    void update(int moveThreadBlockSize, int factorizationThreadBlockSize);
     //! \brief Reference to the underlying matrix
     const M& m_cpuMatrix;
     //! \brief size_t describing the dimensions of the square block elements
@@ -135,8 +139,10 @@ private:
     bool m_tuneThreadBlockSizes;
     //! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards
     //! The default value of -1 indicates that we have not calibrated and selected a value yet
-    int m_applyThreadBlockSize = -1;
-    int m_updateThreadBlockSize = -1;
+    int m_upperSolveThreadBlockSize = -1;
+    int m_lowerSolveThreadBlockSize = -1;
+    int m_moveThreadBlockSize = -1;
+    int m_ILU0FactorizationThreadBlockSize = -1;
 };
 } // end namespace Opm::cuistl

opm/simulators/linalg/cuistl/detail/autotuner.hpp (new file, 93 lines)
@@ -0,0 +1,93 @@
+/*
+  Copyright 2024 SINTEF AS
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM. If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef OPM_AUTOTUNER_HPP
+#define OPM_AUTOTUNER_HPP
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <functional>
+#include <limits>
+#include <opm/common/ErrorMacros.hpp>
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
+#include <string>
+#include <utility>
+
+namespace Opm::cuistl::detail
+{
+
+/// @brief Function that tests the best thread block size, assumes the provided function depends on threadblock-size
+/// @tparam The type of the function to tune
+/// @param f the function to tune, which takes the thread block size as the input
+template <typename func>
+int
+tuneThreadBlockSize(func& f, std::string descriptionOfFunction)
+{
+    // This threadblock-tuner is very simple, it tests all valid block sizes divisible by 64
+    // 64 is chosen so it is a multiple of the AMD wavefront size.
+    // The maximum size of a threadblock is 1024, so an exhaustive search here will not be expensive
+    // We time the kernel with each possible threadblock-size, and return the one
+    // that gave the fastest individual run.
+
+    // TODO: figure out a more rigorous way of deciding how many runs will suffice?
+    constexpr const int runs = 2;
+    cudaEvent_t events[runs + 1];
+
+    // create the events
+    for (int i = 0; i < runs + 1; ++i) {
+        OPM_CUDA_SAFE_CALL(cudaEventCreate(&events[i]));
+    }
+
+    // Initialize helper variables
+    float bestTime = std::numeric_limits<float>::max();
+    int bestBlockSize = -1;
+    int interval = 64;
+
+    // try each possible blocksize
+    for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval) {
+
+        // record a first event, and then an event after each kernel
+        OPM_CUDA_SAFE_CALL(cudaEventRecord(events[0]));
+        for (int i = 0; i < runs; ++i) {
+            f(thrBlockSize); // runs an arbitrary function with the provided arguments
+            OPM_CUDA_SAFE_CALL(cudaEventRecord(events[i + 1]));
+        }
+
+        // make sure the runs are over
+        OPM_CUDA_SAFE_CALL(cudaEventSynchronize(events[runs]));
+
+        // kernel launch was valid
+        if (cudaSuccess == cudaGetLastError()) {
+            // check if we beat the record for the fastest kernel
+            for (int i = 0; i < runs; ++i) {
+                float candidateBlockSizeTime;
+                OPM_CUDA_SAFE_CALL(cudaEventElapsedTime(&candidateBlockSizeTime, events[i], events[i + 1]));
+                if (candidateBlockSizeTime < bestTime) { // checks if this configuration beat the current best
+                    bestTime = candidateBlockSizeTime;
+                    bestBlockSize = thrBlockSize;
+                }
+            }
+        }
+    }
+
+    OpmLog::info(
+        fmt::format("{}: Tuned Blocksize: {} (fastest runtime: {}).", descriptionOfFunction, bestBlockSize, bestTime));
+
+    return bestBlockSize;
+}
+
+} // end namespace Opm::cuistl::detail
+
+#endif
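A minimal usage sketch of the new helper (hypothetical kernel and function names, not part of this PR): wrap the kernel launch in a named lambda that takes the block size as its argument, since tuneThreadBlockSize takes the callable by lvalue reference, then reuse the returned size for real launches. The tuner calls the lambda repeatedly with candidate block sizes and times each invocation with CUDA events.

    #include <opm/simulators/linalg/cuistl/detail/autotuner.hpp>

    // Hypothetical kernel used only for illustration.
    __global__ void scaleKernel(double* x, double a, int n)
    {
        const int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < n) {
            x[i] *= a;
        }
    }

    void tuneAndRunScale(double* deviceX, int n)
    {
        auto launchScale = [&](int blockSize) {
            const int blocks = (n + blockSize - 1) / blockSize;
            scaleKernel<<<blocks, blockSize>>>(deviceX, 2.0, n);
        };
        const int best = Opm::cuistl::detail::tuneThreadBlockSize(launchScale, "scaleKernel");
        launchScale(best); // subsequent launches use the tuned size
    }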