mirror of
https://github.com/OPM/opm-simulators.git
synced 2024-07-07 04:53:03 -05:00
Generalize block size tuner
This commit is contained in:
parent
a41dfc5a6e
commit
11cd20beac
|
@ -217,6 +217,7 @@ if (HAVE_CUDA)
|
|||
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg set_device.cpp)
|
||||
|
||||
# HEADERS
|
||||
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/autotuner.hpp)
|
||||
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/coloringAndReorderingUtils.hpp)
|
||||
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cuda_safe_call.hpp)
|
||||
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cusparse_matrix_operations.hpp)
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include <dune/istl/bcrsmatrix.hh>
|
||||
#include <fmt/core.h>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
#include <opm/simulators/linalg/cuistl/detail/autotuner.hpp>
|
||||
#include <opm/simulators/linalg/cuistl/detail/coloringAndReorderingUtils.hpp>
|
||||
#include <opm/simulators/linalg/cuistl/CuDILU.hpp>
|
||||
#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
|
||||
|
@ -35,6 +36,8 @@
|
|||
#include <config.h>
|
||||
#include <chrono>
|
||||
#include <tuple>
|
||||
#include <functional>
|
||||
#include <utility>
|
||||
namespace Opm::cuistl
|
||||
{
|
||||
|
||||
|
@ -80,6 +83,9 @@ CuDILU<M, X, Y, l>::CuDILU(const M& A, bool splitMatrix, bool tuneKernels)
|
|||
|
||||
// HIP does currently not support automtically picking thread block sizes as well as CUDA
|
||||
// So only when tuning and using hip should we do our own manual tuning
|
||||
if (m_tuneThreadBlockSizes){
|
||||
tuneThreadBlockSizes();
|
||||
}
|
||||
#ifdef USE_HIP
|
||||
if (m_tuneThreadBlockSizes){
|
||||
tuneThreadBlockSizes();
|
||||
|
@ -99,6 +105,9 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
|
|||
{
|
||||
OPM_TIMEBLOCK(prec_apply);
|
||||
{
|
||||
// cudaDeviceSynchronize();
|
||||
// auto start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
int levelStartIdx = 0;
|
||||
for (int level = 0; level < m_levelSets.size(); ++level) {
|
||||
const int numOfRowsInLevel = m_levelSets[level].size();
|
||||
|
@ -113,7 +122,7 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
|
|||
m_gpuDInv.data(),
|
||||
d.data(),
|
||||
v.data(),
|
||||
m_applyThreadBlockSize);
|
||||
m_lowerSolveThreadBlockSize);
|
||||
} else {
|
||||
detail::DILU::solveLowerLevelSet<field_type, blocksize_>(
|
||||
m_gpuMatrixReordered->getNonZeroValues().data(),
|
||||
|
@ -125,7 +134,7 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
|
|||
m_gpuDInv.data(),
|
||||
d.data(),
|
||||
v.data(),
|
||||
m_applyThreadBlockSize);
|
||||
m_lowerSolveThreadBlockSize);
|
||||
}
|
||||
levelStartIdx += numOfRowsInLevel;
|
||||
}
|
||||
|
@ -145,7 +154,7 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
|
|||
numOfRowsInLevel,
|
||||
m_gpuDInv.data(),
|
||||
v.data(),
|
||||
m_applyThreadBlockSize);
|
||||
m_upperSolveThreadBlockSize);
|
||||
} else {
|
||||
detail::DILU::solveUpperLevelSet<field_type, blocksize_>(
|
||||
m_gpuMatrixReordered->getNonZeroValues().data(),
|
||||
|
@ -156,9 +165,13 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
|
|||
numOfRowsInLevel,
|
||||
m_gpuDInv.data(),
|
||||
v.data(),
|
||||
m_applyThreadBlockSize);
|
||||
m_upperSolveThreadBlockSize);
|
||||
}
|
||||
}
|
||||
// cudaDeviceSynchronize();
|
||||
// auto end = std::chrono::high_resolution_clock::now();
|
||||
// auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
|
||||
// printf("Apply duration %ldus\n", duration);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -182,8 +195,15 @@ CuDILU<M, X, Y, l>::update()
|
|||
{
|
||||
OPM_TIMEBLOCK(prec_update);
|
||||
{
|
||||
// cudaDeviceSynchronize();
|
||||
// auto start = std::chrono::high_resolution_clock::now();
|
||||
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
|
||||
computeDiagAndMoveReorderedData();
|
||||
|
||||
// cudaDeviceSynchronize();
|
||||
// auto end = std::chrono::high_resolution_clock::now();
|
||||
// auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
|
||||
// printf("Update duration %ldus\n", duration);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -205,7 +225,7 @@ CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData()
|
|||
m_gpuMatrixReorderedDiag->data(),
|
||||
m_gpuNaturalToReorder.data(),
|
||||
m_gpuMatrixReorderedLower->N(),
|
||||
m_updateThreadBlockSize);
|
||||
m_moveThreadBlockSize);
|
||||
} else {
|
||||
detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(),
|
||||
m_gpuMatrix.getRowIndices().data(),
|
||||
|
@ -213,7 +233,7 @@ CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData()
|
|||
m_gpuMatrixReordered->getRowIndices().data(),
|
||||
m_gpuNaturalToReorder.data(),
|
||||
m_gpuMatrixReordered->N(),
|
||||
m_updateThreadBlockSize);
|
||||
m_moveThreadBlockSize);
|
||||
}
|
||||
|
||||
int levelStartIdx = 0;
|
||||
|
@ -233,7 +253,7 @@ CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData()
|
|||
levelStartIdx,
|
||||
numOfRowsInLevel,
|
||||
m_gpuDInv.data(),
|
||||
m_updateThreadBlockSize);
|
||||
m_DILUFactorizationThreadBlockSize);
|
||||
} else {
|
||||
detail::DILU::computeDiluDiagonal<field_type, blocksize_>(m_gpuMatrixReordered->getNonZeroValues().data(),
|
||||
m_gpuMatrixReordered->getRowIndices().data(),
|
||||
|
@ -243,7 +263,7 @@ CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData()
|
|||
levelStartIdx,
|
||||
numOfRowsInLevel,
|
||||
m_gpuDInv.data(),
|
||||
m_updateThreadBlockSize);
|
||||
m_DILUFactorizationThreadBlockSize);
|
||||
}
|
||||
levelStartIdx += numOfRowsInLevel;
|
||||
}
|
||||
|
@ -254,52 +274,19 @@ template <class M, class X, class Y, int l>
|
|||
void
|
||||
CuDILU<M, X, Y, l>::tuneThreadBlockSizes()
|
||||
{
|
||||
// TODO: generalize this code and put it somewhere outside of this class
|
||||
long long bestApplyTime = __LONG_LONG_MAX__;
|
||||
long long bestUpdateTime = __LONG_LONG_MAX__;
|
||||
int bestApplyBlockSize = -1;
|
||||
int bestUpdateBlockSize = -1;
|
||||
int interval = 64;
|
||||
|
||||
//temporary buffers for the apply
|
||||
using CuDILUType = std::remove_reference_t<decltype(*this)>;
|
||||
auto updateFunc = std::bind(&CuDILUType::update, this);
|
||||
auto applyFunc = std::bind(&CuDILUType::apply, this, std::placeholders::_1, std::placeholders::_1);
|
||||
|
||||
detail::tuneThreadBlockSize(updateFunc, m_moveThreadBlockSize);
|
||||
detail::tuneThreadBlockSize(updateFunc, m_DILUFactorizationThreadBlockSize);
|
||||
|
||||
CuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
|
||||
CuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
|
||||
tmpD = 1;
|
||||
|
||||
for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval){
|
||||
// sometimes the first kernel launch kan be slower, so take the time twice
|
||||
for (int i = 0; i < 2; ++i){
|
||||
|
||||
auto beforeUpdate = std::chrono::high_resolution_clock::now();
|
||||
m_updateThreadBlockSize = thrBlockSize;
|
||||
update();
|
||||
std::ignore = cudaDeviceSynchronize();
|
||||
auto afterUpdate = std::chrono::high_resolution_clock::now();
|
||||
if (cudaSuccess == cudaGetLastError()){ // kernel launch was valid
|
||||
long long durationInMicroSec = std::chrono::duration_cast<std::chrono::microseconds>(afterUpdate - beforeUpdate).count();
|
||||
if (durationInMicroSec < bestUpdateTime){
|
||||
bestUpdateTime = durationInMicroSec;
|
||||
bestUpdateBlockSize = thrBlockSize;
|
||||
}
|
||||
}
|
||||
|
||||
auto beforeApply = std::chrono::high_resolution_clock::now();
|
||||
m_applyThreadBlockSize = thrBlockSize;
|
||||
apply(tmpV, tmpD);
|
||||
std::ignore = cudaDeviceSynchronize();
|
||||
auto afterApply = std::chrono::high_resolution_clock::now();
|
||||
if (cudaSuccess == cudaGetLastError()){ // kernel launch was valid
|
||||
long long durationInMicroSec = std::chrono::duration_cast<std::chrono::microseconds>(afterApply - beforeApply).count();
|
||||
if (durationInMicroSec < bestApplyTime){
|
||||
bestApplyTime = durationInMicroSec;
|
||||
bestApplyBlockSize = thrBlockSize;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
m_applyThreadBlockSize = bestApplyBlockSize;
|
||||
m_updateThreadBlockSize = bestUpdateBlockSize;
|
||||
detail::tuneThreadBlockSize(applyFunc, m_lowerSolveThreadBlockSize, tmpV, tmpD);
|
||||
detail::tuneThreadBlockSize(applyFunc, m_upperSolveThreadBlockSize, tmpV, tmpD);
|
||||
}
|
||||
|
||||
} // namespace Opm::cuistl
|
||||
|
|
|
@ -137,8 +137,10 @@ private:
|
|||
bool m_tuneThreadBlockSizes;
|
||||
//! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards
|
||||
//! The default value of -1 indicates that we have not calibrated and selected a value yet
|
||||
int m_applyThreadBlockSize = -1;
|
||||
int m_updateThreadBlockSize = -1;
|
||||
int m_upperSolveThreadBlockSize = -1;
|
||||
int m_lowerSolveThreadBlockSize = -1;
|
||||
int m_moveThreadBlockSize = -1;
|
||||
int m_DILUFactorizationThreadBlockSize = -1;
|
||||
};
|
||||
} // end namespace Opm::cuistl
|
||||
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include <dune/istl/bcrsmatrix.hh>
|
||||
#include <fmt/core.h>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
#include <opm/simulators/linalg/cuistl/detail/autotuner.hpp>
|
||||
#include <opm/simulators/linalg/cuistl/detail/coloringAndReorderingUtils.hpp>
|
||||
#include <opm/simulators/linalg/cuistl/CuILU0_OPM_Impl.hpp>
|
||||
#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
|
||||
|
@ -81,6 +82,9 @@ CuILU0_OPM_Impl<M, X, Y, l>::CuILU0_OPM_Impl(const M& A, bool splitMatrix, bool
|
|||
}
|
||||
computeDiagAndMoveReorderedData();
|
||||
|
||||
if (m_tuneThreadBlockSizes){
|
||||
tuneThreadBlockSizes();
|
||||
}
|
||||
#ifdef USE_HIP
|
||||
if (m_tuneThreadBlockSizes){
|
||||
tuneThreadBlockSizes();
|
||||
|
@ -98,8 +102,12 @@ template <class M, class X, class Y, int l>
|
|||
void
|
||||
CuILU0_OPM_Impl<M, X, Y, l>::apply(X& v, const Y& d)
|
||||
{
|
||||
// ScopeTimer timer("Apply");
|
||||
OPM_TIMEBLOCK(prec_apply);
|
||||
{
|
||||
// cudaDeviceSynchronize();
|
||||
// auto start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
int levelStartIdx = 0;
|
||||
for (int level = 0; level < m_levelSets.size(); ++level) {
|
||||
const int numOfRowsInLevel = m_levelSets[level].size();
|
||||
|
@ -114,7 +122,7 @@ CuILU0_OPM_Impl<M, X, Y, l>::apply(X& v, const Y& d)
|
|||
m_gpuMatrixReorderedDiag.value().data(),
|
||||
d.data(),
|
||||
v.data(),
|
||||
m_applyThreadBlockSize);
|
||||
m_lowerSolveThreadBlockSize);
|
||||
} else {
|
||||
detail::ILU0::solveLowerLevelSet<field_type, blocksize_>(
|
||||
m_gpuReorderedLU->getNonZeroValues().data(),
|
||||
|
@ -125,7 +133,7 @@ CuILU0_OPM_Impl<M, X, Y, l>::apply(X& v, const Y& d)
|
|||
numOfRowsInLevel,
|
||||
d.data(),
|
||||
v.data(),
|
||||
m_applyThreadBlockSize);
|
||||
m_lowerSolveThreadBlockSize);
|
||||
}
|
||||
levelStartIdx += numOfRowsInLevel;
|
||||
}
|
||||
|
@ -145,7 +153,7 @@ CuILU0_OPM_Impl<M, X, Y, l>::apply(X& v, const Y& d)
|
|||
numOfRowsInLevel,
|
||||
m_gpuMatrixReorderedDiag.value().data(),
|
||||
v.data(),
|
||||
m_applyThreadBlockSize);
|
||||
m_upperSolveThreadBlockSize);
|
||||
} else {
|
||||
detail::ILU0::solveUpperLevelSet<field_type, blocksize_>(
|
||||
m_gpuReorderedLU->getNonZeroValues().data(),
|
||||
|
@ -155,9 +163,13 @@ CuILU0_OPM_Impl<M, X, Y, l>::apply(X& v, const Y& d)
|
|||
levelStartIdx,
|
||||
numOfRowsInLevel,
|
||||
v.data(),
|
||||
m_applyThreadBlockSize);
|
||||
m_upperSolveThreadBlockSize);
|
||||
}
|
||||
}
|
||||
// cudaDeviceSynchronize();
|
||||
// auto end = std::chrono::high_resolution_clock::now();
|
||||
// auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
|
||||
// printf("Apply duration %ldus\n", duration);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -180,8 +192,16 @@ CuILU0_OPM_Impl<M, X, Y, l>::update()
|
|||
{
|
||||
OPM_TIMEBLOCK(prec_update);
|
||||
{
|
||||
// cudaDeviceSynchronize();
|
||||
// auto start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
|
||||
computeDiagAndMoveReorderedData();
|
||||
|
||||
// cudaDeviceSynchronize();
|
||||
// auto end = std::chrono::high_resolution_clock::now();
|
||||
// auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
|
||||
// printf("Update duration %ldus\n", duration);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -203,7 +223,7 @@ CuILU0_OPM_Impl<M, X, Y, l>::computeDiagAndMoveReorderedData()
|
|||
m_gpuMatrixReorderedDiag.value().data(),
|
||||
m_gpuNaturalToReorder.data(),
|
||||
m_gpuMatrixReorderedLower->N(),
|
||||
m_updateThreadBlockSize);
|
||||
m_moveThreadBlockSize);
|
||||
} else {
|
||||
detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(),
|
||||
m_gpuMatrix.getRowIndices().data(),
|
||||
|
@ -211,7 +231,7 @@ CuILU0_OPM_Impl<M, X, Y, l>::computeDiagAndMoveReorderedData()
|
|||
m_gpuReorderedLU->getRowIndices().data(),
|
||||
m_gpuNaturalToReorder.data(),
|
||||
m_gpuReorderedLU->N(),
|
||||
m_updateThreadBlockSize);
|
||||
m_moveThreadBlockSize);
|
||||
}
|
||||
int levelStartIdx = 0;
|
||||
for (int level = 0; level < m_levelSets.size(); ++level) {
|
||||
|
@ -230,7 +250,7 @@ CuILU0_OPM_Impl<M, X, Y, l>::computeDiagAndMoveReorderedData()
|
|||
m_gpuNaturalToReorder.data(),
|
||||
levelStartIdx,
|
||||
numOfRowsInLevel,
|
||||
m_updateThreadBlockSize);
|
||||
m_LUThreadBlockSize);
|
||||
|
||||
} else {
|
||||
detail::ILU0::LUFactorization<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
|
||||
|
@ -240,7 +260,7 @@ CuILU0_OPM_Impl<M, X, Y, l>::computeDiagAndMoveReorderedData()
|
|||
m_gpuReorderToNatural.data(),
|
||||
numOfRowsInLevel,
|
||||
levelStartIdx,
|
||||
m_updateThreadBlockSize);
|
||||
m_LUThreadBlockSize);
|
||||
}
|
||||
levelStartIdx += numOfRowsInLevel;
|
||||
}
|
||||
|
@ -251,52 +271,20 @@ template <class M, class X, class Y, int l>
|
|||
void
|
||||
CuILU0_OPM_Impl<M, X, Y, l>::tuneThreadBlockSizes()
|
||||
{
|
||||
//TODO generalize this tuning process in a function separate of the class
|
||||
long long bestApplyTime = __LONG_LONG_MAX__;
|
||||
long long bestUpdateTime = __LONG_LONG_MAX__;
|
||||
int bestApplyBlockSize = -1;
|
||||
int bestUpdateBlockSize = -1;
|
||||
int interval = 64;
|
||||
|
||||
//temporary buffers for the apply
|
||||
using CuDILUType = std::remove_reference_t<decltype(*this)>;
|
||||
auto updateFunc = std::bind(&CuDILUType::update, this);
|
||||
auto applyFunc = std::bind(&CuDILUType::apply, this, std::placeholders::_1, std::placeholders::_1);
|
||||
|
||||
detail::tuneThreadBlockSize(updateFunc, m_moveThreadBlockSize);
|
||||
detail::tuneThreadBlockSize(updateFunc, m_LUThreadBlockSize);
|
||||
|
||||
CuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
|
||||
CuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
|
||||
tmpD = 1;
|
||||
|
||||
for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval){
|
||||
// sometimes the first kernel launch kan be slower, so take the time twice
|
||||
for (int i = 0; i < 2; ++i){
|
||||
|
||||
auto beforeUpdate = std::chrono::high_resolution_clock::now();
|
||||
m_updateThreadBlockSize = thrBlockSize;
|
||||
update();
|
||||
std::ignore = cudaDeviceSynchronize();
|
||||
auto afterUpdate = std::chrono::high_resolution_clock::now();
|
||||
if (cudaSuccess == cudaGetLastError()){ // kernel launch was valid
|
||||
long long durationInMicroSec = std::chrono::duration_cast<std::chrono::microseconds>(afterUpdate - beforeUpdate).count();
|
||||
if (durationInMicroSec < bestUpdateTime){
|
||||
bestUpdateTime = durationInMicroSec;
|
||||
bestUpdateBlockSize = thrBlockSize;
|
||||
}
|
||||
}
|
||||
|
||||
auto beforeApply = std::chrono::high_resolution_clock::now();
|
||||
m_applyThreadBlockSize = thrBlockSize;
|
||||
apply(tmpV, tmpD);
|
||||
std::ignore = cudaDeviceSynchronize();
|
||||
auto afterApply = std::chrono::high_resolution_clock::now();
|
||||
if (cudaSuccess == cudaGetLastError()){ // kernel launch was valid
|
||||
long long durationInMicroSec = std::chrono::duration_cast<std::chrono::microseconds>(afterApply - beforeApply).count();
|
||||
if (durationInMicroSec < bestApplyTime){
|
||||
bestApplyTime = durationInMicroSec;
|
||||
bestApplyBlockSize = thrBlockSize;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
m_applyThreadBlockSize = bestApplyBlockSize;
|
||||
m_updateThreadBlockSize = bestUpdateBlockSize;
|
||||
detail::tuneThreadBlockSize(applyFunc, m_lowerSolveThreadBlockSize, tmpV, tmpD);
|
||||
detail::tuneThreadBlockSize(applyFunc, m_upperSolveThreadBlockSize, tmpV, tmpD);
|
||||
}
|
||||
|
||||
} // namespace Opm::cuistl
|
||||
|
|
|
@ -135,8 +135,10 @@ private:
|
|||
bool m_tuneThreadBlockSizes;
|
||||
//! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards
|
||||
//! The default value of -1 indicates that we have not calibrated and selected a value yet
|
||||
int m_applyThreadBlockSize = -1;
|
||||
int m_updateThreadBlockSize = -1;
|
||||
int m_upperSolveThreadBlockSize = -1;
|
||||
int m_lowerSolveThreadBlockSize = -1;
|
||||
int m_moveThreadBlockSize = -1;
|
||||
int m_LUThreadBlockSize = -1;
|
||||
};
|
||||
} // end namespace Opm::cuistl
|
||||
|
||||
|
|
|
@ -33,6 +33,8 @@
|
|||
#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
|
||||
#include <opm/simulators/linalg/matrixblock.hh>
|
||||
|
||||
#include <chrono>
|
||||
|
||||
// This file is based on the guide at https://docs.nvidia.com/cuda/cusparse/index.html#csrilu02_solve ,
|
||||
// it highly recommended to read that before proceeding.
|
||||
|
||||
|
@ -78,6 +80,9 @@ template <class M, class X, class Y, int l>
|
|||
void
|
||||
CuSeqILU0<M, X, Y, l>::apply(X& v, const Y& d)
|
||||
{
|
||||
cudaDeviceSynchronize();
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// We need to pass the solve routine a scalar to multiply.
|
||||
// In our case this scalar is 1.0
|
||||
const field_type one = 1.0;
|
||||
|
@ -128,6 +133,10 @@ CuSeqILU0<M, X, Y, l>::apply(X& v, const Y& d)
|
|||
|
||||
|
||||
v *= m_w;
|
||||
cudaDeviceSynchronize();
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
|
||||
printf("Apply duration %ldus\n", duration);
|
||||
}
|
||||
|
||||
template <class M, class X, class Y, int l>
|
||||
|
@ -147,8 +156,16 @@ template <class M, class X, class Y, int l>
|
|||
void
|
||||
CuSeqILU0<M, X, Y, l>::update()
|
||||
{
|
||||
cudaDeviceSynchronize();
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
m_LU.updateNonzeroValues(detail::makeMatrixWithNonzeroDiagonal(m_underlyingMatrix));
|
||||
createILU();
|
||||
|
||||
cudaDeviceSynchronize();
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
|
||||
printf("Update duration %ldus\n", duration);
|
||||
}
|
||||
|
||||
template <class M, class X, class Y, int l>
|
||||
|
|
91
opm/simulators/linalg/cuistl/detail/autotuner.hpp
Normal file
91
opm/simulators/linalg/cuistl/detail/autotuner.hpp
Normal file
|
@ -0,0 +1,91 @@
|
|||
/*
|
||||
Copyright 2024 SINTEF AS
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
#ifndef OPM_AUTOTUNER_HPP
|
||||
#define OPM_AUTOTUNER_HPP
|
||||
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
|
||||
#include <functional>
|
||||
#include <utility>
|
||||
#include <limits>
|
||||
|
||||
namespace Opm::cuistl::detail
|
||||
{
|
||||
|
||||
/// @brief Function that tests the best thread block size, assumes updating the reference will affect runtimes
|
||||
/// @tparam func function to tune
|
||||
/// @tparam ...Args types of the arguments needed to call the function
|
||||
/// @param f the function to tune
|
||||
/// @param threadBlockSize reference to the thread block size that will affect kernel executions
|
||||
/// @param ...args arguments needed by the function
|
||||
template <typename func, typename... Args>
|
||||
void tuneThreadBlockSize(func f, int& threadBlockSize, Args&&... args) {
|
||||
|
||||
// decide on a number of calibration runs and allocate space for the events
|
||||
constexpr const int runs = 2;
|
||||
cudaEvent_t events[runs+1];
|
||||
|
||||
// create the events
|
||||
for (int i = 0; i < runs + 1; ++i){
|
||||
OPM_CUDA_SAFE_CALL(cudaEventCreate(&events[i]));
|
||||
}
|
||||
|
||||
// Initialize helper variables
|
||||
float bestTime = std::numeric_limits<float>::max();
|
||||
int bestBlockSize = -1;
|
||||
int interval = 64;
|
||||
|
||||
// try each possible blocksize
|
||||
for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval){
|
||||
// update the blocksize
|
||||
threadBlockSize = thrBlockSize;
|
||||
|
||||
// record a first event, and then an event after each kernel
|
||||
OPM_CUDA_SAFE_CALL(cudaEventRecord(events[0]));
|
||||
for (int i = 0; i < runs; ++i){
|
||||
f(std::forward<Args>(args)...); // runs an arbitrary function with the provided arguments
|
||||
OPM_CUDA_SAFE_CALL(cudaEventRecord(events[i+1]));
|
||||
}
|
||||
|
||||
// make suret he runs are over
|
||||
OPM_CUDA_SAFE_CALL(cudaEventSynchronize(events[runs]));
|
||||
|
||||
// kernel launch was valid
|
||||
if (cudaSuccess == cudaGetLastError()){
|
||||
// check if we beat the record for the fastest kernel
|
||||
for (int i = 0; i < runs; ++i){
|
||||
float candidateBlockSizeTime;
|
||||
OPM_CUDA_SAFE_CALL(cudaEventElapsedTime(&candidateBlockSizeTime, events[i], events[i+1]));
|
||||
if (candidateBlockSizeTime < bestTime){
|
||||
bestTime = candidateBlockSizeTime;
|
||||
bestBlockSize = thrBlockSize;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
printf("best size: %d, best time %f\n", bestBlockSize, bestTime);
|
||||
|
||||
threadBlockSize = bestBlockSize;
|
||||
}
|
||||
|
||||
} // end namespace Opm::cuistl::detail
|
||||
|
||||
#endif
|
|
@ -41,7 +41,7 @@ extracting sparsity structures from dune matrices and creating cusparsematrix in
|
|||
namespace Opm::cuistl::detail
|
||||
{
|
||||
inline std::vector<int>
|
||||
createReorderedToNatural(Opm::SparseTable<size_t> levelSets)
|
||||
createReorderedToNatural(Opm::SparseTable<size_t>& levelSets)
|
||||
{
|
||||
auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
|
||||
int globCnt = 0;
|
||||
|
@ -56,7 +56,7 @@ namespace Opm::cuistl::detail
|
|||
}
|
||||
|
||||
inline std::vector<int>
|
||||
createNaturalToReordered(Opm::SparseTable<size_t> levelSets)
|
||||
createNaturalToReordered(Opm::SparseTable<size_t>& levelSets)
|
||||
{
|
||||
auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
|
||||
int globCnt = 0;
|
||||
|
@ -73,7 +73,7 @@ namespace Opm::cuistl::detail
|
|||
template <class M, class field_type, class GPUM>
|
||||
inline void
|
||||
createReorderedMatrix(const M& naturalMatrix,
|
||||
std::vector<int> reorderedToNatural,
|
||||
std::vector<int>& reorderedToNatural,
|
||||
std::unique_ptr<GPUM>& reorderedGpuMat)
|
||||
{
|
||||
M reorderedMatrix(naturalMatrix.N(), naturalMatrix.N(), naturalMatrix.nonzeroes(), M::row_wise);
|
||||
|
@ -91,7 +91,7 @@ namespace Opm::cuistl::detail
|
|||
template <class M, class field_type, class GPUM>
|
||||
inline void
|
||||
extractLowerAndUpperMatrices(const M& naturalMatrix,
|
||||
std::vector<int> reorderedToNatural,
|
||||
std::vector<int>& reorderedToNatural,
|
||||
std::unique_ptr<GPUM>& lower,
|
||||
std::unique_ptr<GPUM>& upper)
|
||||
{
|
||||
|
|
Loading…
Reference in New Issue
Block a user