Mirror of https://github.com/OPM/opm-simulators.git (synced 2025-02-25 18:55:30 -06:00)

Merge pull request #5433 from multitalentloes/useRecomendedBlockSize

Autotune thread block size

Commit f3b5e0d14d
@@ -347,10 +347,12 @@ struct StandardPreconditioners {
         });

         F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
-            const bool split_matrix = prm.get<double>("split_matrix", true);
+            const bool split_matrix = prm.get<bool>("split_matrix", true);
+            const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
+            // const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
             using field_type = typename V::field_type;
             using CuDILU = typename cuistl::CuDILU<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
-            auto cuDILU = std::make_shared<CuDILU>(op.getmat(), split_matrix);
+            auto cuDILU = std::make_shared<CuDILU>(op.getmat(), split_matrix, tune_gpu_kernels);

             auto adapted = std::make_shared<cuistl::PreconditionerAdapter<V, V, CuDILU>>(cuDILU);
             auto wrapped = std::make_shared<cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
@@ -605,12 +607,15 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {

         F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
             const bool split_matrix = prm.get<bool>("split_matrix", true);
+            const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
             using field_type = typename V::field_type;
             using CUDILU = typename cuistl::CuDILU<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
-            return std::make_shared<cuistl::PreconditionerAdapter<V, V, CUDILU>>(std::make_shared<CUDILU>(op.getmat(), split_matrix));
+            return std::make_shared<cuistl::PreconditionerAdapter<V, V, CUDILU>>(std::make_shared<CUDILU>(op.getmat(), split_matrix, tune_gpu_kernels));
         });

         F::addCreator("CUDILUFloat", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
+            const bool split_matrix = prm.get<bool>("split_matrix", true);
+            const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
             using block_type = typename V::block_type;
             using VTo = Dune::BlockVector<Dune::FieldVector<float, block_type::dimension>>;
             using matrix_type_to = typename Dune::BCRSMatrix<Dune::FieldMatrix<float, block_type::dimension, block_type::dimension>>;
@@ -618,7 +623,7 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
             using Adapter = typename cuistl::PreconditionerAdapter<VTo, VTo, CuDILU>;
             using Converter = typename cuistl::PreconditionerConvertFieldTypeAdapter<Adapter, M, V, V>;
             auto converted = std::make_shared<Converter>(op.getmat());
-            auto adapted = std::make_shared<Adapter>(std::make_shared<CuDILU>(converted->getConvertedMatrix()));
+            auto adapted = std::make_shared<Adapter>(std::make_shared<CuDILU>(converted->getConvertedMatrix(), split_matrix, tune_gpu_kernels));
             converted->setUnderlyingPreconditioner(adapted);
             return converted;
         });
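Each CUDILU creator above reads the new "tune_gpu_kernels" flag from the same parameter tree as "split_matrix", with tuning enabled by default. A minimal, self-contained sketch of that key/default behaviour follows; it is illustrative only and not part of this commit — it uses boost::property_tree directly, whereas the factory receives its own parameter-tree type P:

    #include <boost/property_tree/ptree.hpp>
    #include <iostream>

    int main()
    {
        boost::property_tree::ptree prm;
        prm.put("split_matrix", true);       // keep lower/upper/diagonal parts in separate GPU buffers
        prm.put("tune_gpu_kernels", false);  // opt out of the autotuning pass

        // Same defaulting behaviour as the creators above: a missing key falls back to true.
        const bool split_matrix = prm.get<bool>("split_matrix", true);
        const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
        std::cout << split_matrix << " " << tune_gpu_kernels << "\n";
        return 0;
    }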
@@ -30,11 +30,16 @@
 #include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
 #include <opm/simulators/linalg/matrixblock.hh>
 #include <vector>
+#include <config.h>
+#include <chrono>
+#include <limits>
+#include <tuple>
+

 namespace
 {
 std::vector<int>
-createReorderedToNatural(Opm::SparseTable<size_t> levelSets)
+createReorderedToNatural(Opm::SparseTable<size_t>& levelSets)
 {
     auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
     int globCnt = 0;
@@ -49,7 +54,7 @@ createReorderedToNatural(Opm::SparseTable<size_t> levelSets)
 }

 std::vector<int>
-createNaturalToReordered(Opm::SparseTable<size_t> levelSets)
+createNaturalToReordered(Opm::SparseTable<size_t>& levelSets)
 {
     auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
     int globCnt = 0;
@@ -66,7 +71,7 @@ createNaturalToReordered(Opm::SparseTable<size_t> levelSets)
 template <class M, class field_type, class GPUM>
 void
 createReorderedMatrix(const M& naturalMatrix,
-                      std::vector<int> reorderedToNatural,
+                      std::vector<int>& reorderedToNatural,
                       std::unique_ptr<GPUM>& reorderedGpuMat)
 {
     M reorderedMatrix(naturalMatrix.N(), naturalMatrix.N(), naturalMatrix.nonzeroes(), M::row_wise);
@@ -84,7 +89,7 @@ createReorderedMatrix(const M& naturalMatrix,
 template <class M, class field_type, class GPUM>
 void
 extractLowerAndUpperMatrices(const M& naturalMatrix,
-                             std::vector<int> reorderedToNatural,
+                             std::vector<int>& reorderedToNatural,
                              std::unique_ptr<GPUM>& lower,
                              std::unique_ptr<GPUM>& upper)
 {
@@ -119,7 +124,7 @@ namespace Opm::cuistl
 {

 template <class M, class X, class Y, int l>
-CuDILU<M, X, Y, l>::CuDILU(const M& A, bool split_matrix)
+CuDILU<M, X, Y, l>::CuDILU(const M& A, bool splitMatrix, bool tuneKernels)
     : m_cpuMatrix(A)
     , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER))
     , m_reorderedToNatural(createReorderedToNatural(m_levelSets))
@@ -128,7 +133,8 @@ CuDILU<M, X, Y, l>::CuDILU(const M& A, bool split_matrix)
     , m_gpuNaturalToReorder(m_naturalToReordered)
     , m_gpuReorderToNatural(m_reorderedToNatural)
     , m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())
-    , m_splitMatrix(split_matrix)
+    , m_splitMatrix(splitMatrix)
+    , m_tuneThreadBlockSizes(tuneKernels)

 {
     // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode?
@@ -156,6 +162,14 @@ CuDILU<M, X, Y, l>::CuDILU(const M& A, bool split_matrix)
             m_cpuMatrix, m_reorderedToNatural, m_gpuMatrixReordered);
     }
     computeDiagAndMoveReorderedData();
+
+    // HIP does currently not support automtically picking thread block sizes as well as CUDA
+    // So only when tuning and using hip should we do our own manual tuning
+#ifdef USE_HIP
+    if (m_tuneThreadBlockSizes){
+        tuneThreadBlockSizes();
+    }
+#endif
 }

 template <class M, class X, class Y, int l>
@@ -183,7 +197,8 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
                 numOfRowsInLevel,
                 m_gpuDInv.data(),
                 d.data(),
-                v.data());
+                v.data(),
+                m_applyThreadBlockSize);
         } else {
             detail::computeLowerSolveLevelSet<field_type, blocksize_>(
                 m_gpuMatrixReordered->getNonZeroValues().data(),
@@ -194,7 +209,8 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
                 numOfRowsInLevel,
                 m_gpuDInv.data(),
                 d.data(),
-                v.data());
+                v.data(),
+                m_applyThreadBlockSize);
         }
         levelStartIdx += numOfRowsInLevel;
     }
@@ -213,7 +229,8 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
                 levelStartIdx,
                 numOfRowsInLevel,
                 m_gpuDInv.data(),
-                v.data());
+                v.data(),
+                m_applyThreadBlockSize);
         } else {
             detail::computeUpperSolveLevelSet<field_type, blocksize_>(
                 m_gpuMatrixReordered->getNonZeroValues().data(),
@@ -223,7 +240,8 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
                 levelStartIdx,
                 numOfRowsInLevel,
                 m_gpuDInv.data(),
-                v.data());
+                v.data(),
+                m_applyThreadBlockSize);
         }
     }
 }
@@ -270,14 +288,16 @@ CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData()
             m_gpuMatrixReorderedUpper->getRowIndices().data(),
             m_gpuMatrixReorderedDiag->data(),
             m_gpuNaturalToReorder.data(),
-            m_gpuMatrixReorderedLower->N());
+            m_gpuMatrixReorderedLower->N(),
+            m_updateThreadBlockSize);
     } else {
         detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(),
             m_gpuMatrix.getRowIndices().data(),
             m_gpuMatrixReordered->getNonZeroValues().data(),
             m_gpuMatrixReordered->getRowIndices().data(),
             m_gpuNaturalToReorder.data(),
-            m_gpuMatrixReordered->N());
+            m_gpuMatrixReordered->N(),
+            m_updateThreadBlockSize);
     }

     int levelStartIdx = 0;
@@ -296,7 +316,8 @@ CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData()
                 m_gpuNaturalToReorder.data(),
                 levelStartIdx,
                 numOfRowsInLevel,
-                m_gpuDInv.data());
+                m_gpuDInv.data(),
+                m_updateThreadBlockSize);
         } else {
             detail::computeDiluDiagonal<field_type, blocksize_>(m_gpuMatrixReordered->getNonZeroValues().data(),
                 m_gpuMatrixReordered->getRowIndices().data(),
@@ -305,13 +326,66 @@ CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData()
                 m_gpuNaturalToReorder.data(),
                 levelStartIdx,
                 numOfRowsInLevel,
-                m_gpuDInv.data());
+                m_gpuDInv.data(),
+                m_updateThreadBlockSize);
         }
         levelStartIdx += numOfRowsInLevel;
     }
     }
 }

+template <class M, class X, class Y, int l>
+void
+CuDILU<M, X, Y, l>::tuneThreadBlockSizes()
+{
+    // TODO: generalize this code and put it somewhere outside of this class
+    long long bestApplyTime = std::numeric_limits<long long>::max();
+    long long bestUpdateTime = std::numeric_limits<long long>::max();
+    int bestApplyBlockSize = -1;
+    int bestUpdateBlockSize = -1;
+    int interval = 64;
+
+    //temporary buffers for the apply
+    CuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
+    CuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
+    tmpD = 1;
+
+    for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval){
+        // sometimes the first kernel launch kan be slower, so take the time twice
+        for (int i = 0; i < 2; ++i){
+
+            auto beforeUpdate = std::chrono::high_resolution_clock::now();
+            m_updateThreadBlockSize = thrBlockSize;
+            update();
+            std::ignore = cudaDeviceSynchronize();
+            auto afterUpdate = std::chrono::high_resolution_clock::now();
+            if (cudaSuccess == cudaGetLastError()){ // kernel launch was valid
+                long long durationInMicroSec = std::chrono::duration_cast<std::chrono::microseconds>(afterUpdate - beforeUpdate).count();
+                if (durationInMicroSec < bestUpdateTime){
+                    bestUpdateTime = durationInMicroSec;
+                    bestUpdateBlockSize = thrBlockSize;
+                }
+            }
+
+            auto beforeApply = std::chrono::high_resolution_clock::now();
+            m_applyThreadBlockSize = thrBlockSize;
+            apply(tmpV, tmpD);
+            std::ignore = cudaDeviceSynchronize();
+            auto afterApply = std::chrono::high_resolution_clock::now();
+            if (cudaSuccess == cudaGetLastError()){ // kernel launch was valid
+                long long durationInMicroSec = std::chrono::duration_cast<std::chrono::microseconds>(afterApply - beforeApply).count();
+                if (durationInMicroSec < bestApplyTime){
+                    bestApplyTime = durationInMicroSec;
+                    bestApplyBlockSize = thrBlockSize;
+                }
+            }
+        }
+    }
+
+    m_applyThreadBlockSize = bestApplyBlockSize;
+    m_updateThreadBlockSize = bestUpdateBlockSize;
+}
+
 } // namespace Opm::cuistl
 #define INSTANTIATE_CUDILU_DUNE(realtype, blockdim) \
     template class ::Opm::cuistl::CuDILU<Dune::BCRSMatrix<Dune::FieldMatrix<realtype, blockdim, blockdim>>, \
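The TODO at the top of tuneThreadBlockSizes() asks for this logic to be generalized. A minimal, self-contained sketch of the same pattern (sweep candidate block sizes in steps of 64 up to 1024, time each candidate twice so the warm-up launch does not dominate, keep the fastest) is shown below; it is host-only illustration, not OPM code — the callable stands in for a kernel launch followed by a device synchronization:

    #include <chrono>
    #include <functional>
    #include <iostream>
    #include <limits>

    // Returns the block size (64, 128, ..., 1024) for which runWithBlockSize completes fastest.
    int pickFastestBlockSize(const std::function<void(int)>& runWithBlockSize)
    {
        long long bestTime = std::numeric_limits<long long>::max();
        int bestBlockSize = -1;
        const int interval = 64;

        for (int blockSize = interval; blockSize <= 1024; blockSize += interval) {
            for (int repetition = 0; repetition < 2; ++repetition) { // first run may be slower
                const auto before = std::chrono::high_resolution_clock::now();
                runWithBlockSize(blockSize);
                const auto after = std::chrono::high_resolution_clock::now();
                const auto us = std::chrono::duration_cast<std::chrono::microseconds>(after - before).count();
                if (us < bestTime) {
                    bestTime = us;
                    bestBlockSize = blockSize;
                }
            }
        }
        return bestBlockSize;
    }

    int main()
    {
        // Dummy workload: pretend larger block sizes are cheaper.
        auto fakeKernel = [](int blockSize) {
            volatile long dummy = 0;
            for (int i = 0; i < 2000000 / blockSize; ++i) { dummy += i; }
        };
        std::cout << "fastest block size: " << pickFastestBlockSize(fakeKernel) << "\n";
        return 0;
    }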
@ -66,7 +66,7 @@ public:
|
|||||||
//! \param A The matrix to operate on.
|
//! \param A The matrix to operate on.
|
||||||
//! \param w The relaxation factor.
|
//! \param w The relaxation factor.
|
||||||
//!
|
//!
|
||||||
explicit CuDILU(const M& A, bool split_matrix = true);
|
explicit CuDILU(const M& A, bool splitMatrix, bool tuneKernels);
|
||||||
|
|
||||||
//! \brief Prepare the preconditioner.
|
//! \brief Prepare the preconditioner.
|
||||||
//! \note Does nothing at the time being.
|
//! \note Does nothing at the time being.
|
||||||
@ -88,6 +88,9 @@ public:
|
|||||||
//! \brief Compute the diagonal of the DILU, and update the data of the reordered matrix
|
//! \brief Compute the diagonal of the DILU, and update the data of the reordered matrix
|
||||||
void computeDiagAndMoveReorderedData();
|
void computeDiagAndMoveReorderedData();
|
||||||
|
|
||||||
|
//! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
|
||||||
|
void tuneThreadBlockSizes();
|
||||||
|
|
||||||
|
|
||||||
//! \returns false
|
//! \returns false
|
||||||
static constexpr bool shouldCallPre()
|
static constexpr bool shouldCallPre()
|
||||||
@ -130,6 +133,12 @@ private:
|
|||||||
CuVector<field_type> m_gpuDInv;
|
CuVector<field_type> m_gpuDInv;
|
||||||
//! \brief Bool storing whether or not we should store matrices in a split format
|
//! \brief Bool storing whether or not we should store matrices in a split format
|
||||||
bool m_splitMatrix;
|
bool m_splitMatrix;
|
||||||
|
//! \brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards
|
||||||
|
bool m_tuneThreadBlockSizes;
|
||||||
|
//! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards
|
||||||
|
//! The default value of -1 indicates that we have not calibrated and selected a value yet
|
||||||
|
int m_applyThreadBlockSize = -1;
|
||||||
|
int m_updateThreadBlockSize = -1;
|
||||||
};
|
};
|
||||||
} // end namespace Opm::cuistl
|
} // end namespace Opm::cuistl
|
||||||
|
|
||||||
|
@@ -19,6 +19,7 @@
 #include <opm/common/ErrorMacros.hpp>
 #include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
 #include <stdexcept>
+#include <config.h>

 namespace Opm::cuistl::detail
 {
@@ -505,25 +506,22 @@ namespace
 }
 }

-constexpr inline size_t getThreads([[maybe_unused]] size_t numberOfRows)
-{
-    return 1024;
-}
-
-inline size_t getBlocks(size_t numberOfRows)
-{
-    const auto threads = getThreads(numberOfRows);
-    return (numberOfRows + threads - 1) / threads;
-}
-
 // Kernel here is the function object of the cuda kernel
 template <class Kernel>
-inline int getCudaRecomendedThreadBlockSize(Kernel k)
+inline int getCudaRecomendedThreadBlockSize(Kernel k, int suggestedThrBlockSize=-1)
 {
-    int blockSize;
+    if (suggestedThrBlockSize != -1){
+        return suggestedThrBlockSize;
+    }
+    // Use cuda API to maximize occupancy, otherwise we just pick a thread block size if it is not tuned
+#if USE_HIP
+    return 512;
+#else
+    int blockSize = 1024;
     int tmpGridSize;
-    cudaOccupancyMaxPotentialBlockSize(&tmpGridSize, &blockSize, k, 0, 0);
+    std::ignore = cudaOccupancyMaxPotentialBlockSize(&tmpGridSize, &blockSize, k, 0, 0);
     return blockSize;
+#endif
 }

 inline int getNumberOfBlocks(int wantedThreads, int threadBlockSize)
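On the CUDA path above, the helper asks cudaOccupancyMaxPotentialBlockSize for a block size and getNumberOfBlocks then derives the grid size with a ceiling division. A stand-alone sketch of that pattern with a toy kernel follows; the kernel, names and sizes are illustrative and not part of this commit:

    #include <cuda_runtime.h>
    #include <cstdio>

    // Toy kernel used only to illustrate the occupancy query.
    __global__ void scaleKernel(double* x, int n, double a)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            x[i] *= a;
        }
    }

    int main()
    {
        const int n = 1 << 20;
        double* x = nullptr;
        cudaMalloc(&x, n * sizeof(double));

        // Let the runtime suggest a block size, then derive the grid with a ceiling division,
        // mirroring getCudaRecomendedThreadBlockSize/getNumberOfBlocks in the diff above.
        int minGridSize = 0;
        int blockSize = 1024;
        cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, scaleKernel, 0, 0);
        const int nBlocks = (n + blockSize - 1) / blockSize;

        scaleKernel<<<nBlocks, blockSize>>>(x, n, 2.0);
        cudaDeviceSynchronize();
        std::printf("suggested block size: %d, blocks: %d\n", blockSize, nBlocks);
        cudaFree(x);
        return 0;
    }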
@@ -538,8 +536,10 @@ void
 invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec)
 {
     if (blocksize <= 3) {
+        int threadBlockSize = getCudaRecomendedThreadBlockSize(cuInvertDiagonalAndFlatten<T, blocksize>);
+        int nThreadBlocks = getNumberOfBlocks(numberOfRows, threadBlockSize);
         cuInvertDiagonalAndFlatten<T, blocksize>
-            <<<getBlocks(numberOfRows), getThreads(numberOfRows)>>>(mat, rowIndices, colIndices, numberOfRows, vec);
+            <<<nThreadBlocks, threadBlockSize>>>(mat, rowIndices, colIndices, numberOfRows, vec);
     } else {
         OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
     }
@@ -556,9 +556,12 @@ computeLowerSolveLevelSet(T* reorderedMat,
                           int rowsInLevelSet,
                           const T* dInv,
                           const T* d,
-                          T* v)
+                          T* v,
+                          int thrBlockSize)
 {
-    cuComputeLowerSolveLevelSet<T, blocksize><<<getBlocks(rowsInLevelSet), getThreads(rowsInLevelSet)>>>(
+    int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSet<T, blocksize>, thrBlockSize);
+    int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
+    cuComputeLowerSolveLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
         reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
 }

@@ -573,9 +576,10 @@ computeLowerSolveLevelSetSplit(T* reorderedMat,
                               int rowsInLevelSet,
                               const T* dInv,
                               const T* d,
-                              T* v)
+                              T* v,
+                              int thrBlockSize)
 {
-    int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit<T, blocksize>);
+    int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit<T, blocksize>, thrBlockSize);
     int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
     cuComputeLowerSolveLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
         reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
@@ -590,9 +594,12 @@ computeUpperSolveLevelSet(T* reorderedMat,
                           int startIdx,
                           int rowsInLevelSet,
                           const T* dInv,
-                          T* v)
+                          T* v,
+                          int thrBlockSize)
 {
-    cuComputeUpperSolveLevelSet<T, blocksize><<<getBlocks(rowsInLevelSet), getThreads(rowsInLevelSet)>>>(
+    int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeUpperSolveLevelSet<T, blocksize>, thrBlockSize);
+    int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
+    cuComputeUpperSolveLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
         reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }

@@ -605,9 +612,10 @@ computeUpperSolveLevelSetSplit(T* reorderedMat,
                               int startIdx,
                               int rowsInLevelSet,
                               const T* dInv,
-                              T* v)
+                              T* v,
+                              int thrBlockSize)
 {
-    int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit<T, blocksize>);
+    int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeUpperSolveLevelSetSplit<T, blocksize>, thrBlockSize);
     int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
     cuComputeUpperSolveLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
         reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
@@ -622,11 +630,14 @@ computeDiluDiagonal(T* reorderedMat,
                     int* naturalToReordered,
                     const int startIdx,
                     int rowsInLevelSet,
-                    T* dInv)
+                    T* dInv,
+                    int thrBlockSize)
 {
     if (blocksize <= 3) {
+        int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeDiluDiagonal<T, blocksize>, thrBlockSize);
+        int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
         cuComputeDiluDiagonal<T, blocksize>
-            <<<getBlocks(rowsInLevelSet), getThreads(rowsInLevelSet)>>>(reorderedMat,
+            <<<nThreadBlocks, threadBlockSize>>>(reorderedMat,
                 rowIndices,
                 colIndices,
                 reorderedToNatural,
@@ -652,10 +663,11 @@ computeDiluDiagonalSplit(T* reorderedLowerMat,
                          int* naturalToReordered,
                          const int startIdx,
                          int rowsInLevelSet,
-                         T* dInv)
+                         T* dInv,
+                         int thrBlockSize)
 {
     if (blocksize <= 3) {
-        int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit<T, blocksize>);
+        int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeDiluDiagonalSplit<T, blocksize>, thrBlockSize);
         int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
         cuComputeDiluDiagonalSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(reorderedLowerMat,
             lowerRowIndices,
@@ -677,9 +689,12 @@ computeDiluDiagonalSplit(T* reorderedLowerMat,
 template <class T, int blocksize>
 void
 copyMatDataToReordered(
-    T* srcMatrix, int* srcRowIndices, T* dstMatrix, int* dstRowIndices, int* naturalToReordered, size_t numberOfRows)
+    T* srcMatrix, int* srcRowIndices, T* dstMatrix, int* dstRowIndices, int* naturalToReordered, size_t numberOfRows,
+    int thrBlockSize)
 {
-    cuMoveDataToReordered<T, blocksize><<<getBlocks(numberOfRows), getThreads(numberOfRows)>>>(
+    int threadBlockSize = getCudaRecomendedThreadBlockSize(cuMoveDataToReordered<T, blocksize>, thrBlockSize);
+    int nThreadBlocks = getNumberOfBlocks(numberOfRows, threadBlockSize);
+    cuMoveDataToReordered<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
         srcMatrix, srcRowIndices, dstMatrix, dstRowIndices, naturalToReordered, numberOfRows);
 }

@@ -694,9 +709,10 @@ copyMatDataToReorderedSplit(T* srcMatrix,
                             int* dstUpperRowIndices,
                             T* dstDiag,
                             int* naturalToReordered,
-                            size_t numberOfRows)
+                            size_t numberOfRows,
+                            int thrBlockSize)
 {
-    int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit<T, blocksize>);
+    int threadBlockSize = getCudaRecomendedThreadBlockSize(cuMoveDataToReorderedSplit<T, blocksize>, thrBlockSize);
     int nThreadBlocks = getNumberOfBlocks(numberOfRows, threadBlockSize);
     cuMoveDataToReorderedSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(srcMatrix,
         srcRowIndices,
@@ -712,15 +728,15 @@ copyMatDataToReorderedSplit(T* srcMatrix,

 #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
     template void invertDiagonalAndFlatten<T, blocksize>(T*, int*, int*, size_t, T*); \
-    template void copyMatDataToReordered<T, blocksize>(T*, int*, T*, int*, int*, size_t); \
+    template void copyMatDataToReordered<T, blocksize>(T*, int*, T*, int*, int*, size_t, int); \
-    template void copyMatDataToReorderedSplit<T, blocksize>(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t); \
+    template void copyMatDataToReorderedSplit<T, blocksize>(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t, int); \
-    template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*); \
+    template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*, int); \
     template void computeDiluDiagonalSplit<T, blocksize>( \
-        T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, T*); \
+        T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, T*, int); \
-    template void computeUpperSolveLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*); \
+    template void computeUpperSolveLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
-    template void computeLowerSolveLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*); \
+    template void computeLowerSolveLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); \
-    template void computeUpperSolveLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*); \
+    template void computeUpperSolveLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
-    template void computeLowerSolveLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*);
+    template void computeLowerSolveLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);

 INSTANTIATE_KERNEL_WRAPPERS(float, 1);
 INSTANTIATE_KERNEL_WRAPPERS(float, 2);
@@ -59,7 +59,8 @@ void computeLowerSolveLevelSet(T* reorderedMat,
                                int rowsInLevelSet,
                                const T* dInv,
                                const T* d,
-                               T* v);
+                               T* v,
+                               int threadBlockSize);

 /**
  * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
@@ -86,7 +87,8 @@ void computeLowerSolveLevelSetSplit(T* reorderedUpperMat,
                                     int rowsInLevelSet,
                                     const T* dInv,
                                     const T* d,
-                                    T* v);
+                                    T* v,
+                                    int threadBlockSize);

 /**
  * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
@@ -111,7 +113,8 @@ void computeUpperSolveLevelSet(T* reorderedMat,
                                int startIdx,
                                int rowsInLevelSet,
                                const T* dInv,
-                               T* v);
+                               T* v,
+                               int threadBlockSize);
 template <class T, int blocksize>

 /**
@@ -136,7 +139,8 @@ void computeUpperSolveLevelSetSplit(T* reorderedUpperMat,
                                     int startIdx,
                                     int rowsInLevelSet,
                                     const T* dInv,
-                                    T* v);
+                                    T* v,
+                                    int threadBlockSize);

 /**
  * @brief Computes the ILU0 of the diagonal elements of the reordered matrix and stores it in a reordered vector
@@ -162,7 +166,8 @@ void computeDiluDiagonal(T* reorderedMat,
                          int* naturalToReordered,
                          int startIdx,
                          int rowsInLevelSet,
-                         T* dInv);
+                         T* dInv,
+                         int threadBlockSize);
 template <class T, int blocksize>

 /**
@@ -197,7 +202,8 @@ void computeDiluDiagonalSplit(T* reorderedLowerMat,
                              int* naturalToReordered,
                              int startIdx,
                              int rowsInLevelSet,
-                             T* dInv);
+                             T* dInv,
+                             int threadBlockSize);

 /**
  * @brief Reorders the elements of a matrix by copying them from one matrix to another using a permutation list
@@ -211,7 +217,7 @@ void computeDiluDiagonalSplit(T* reorderedLowerMat,
 */
 template <class T, int blocksize>
 void copyMatDataToReordered(
-    T* srcMatrix, int* srcRowIndices, T* dstMatrix, int* dstRowIndices, int* naturalToReordered, size_t numberOfRows);
+    T* srcMatrix, int* srcRowIndices, T* dstMatrix, int* dstRowIndices, int* naturalToReordered, size_t numberOfRows, int threadBlockSize);

 /**
  * @brief Reorders the elements of a matrix by copying them from one matrix to a split matrix using a permutation list
@@ -229,7 +235,7 @@ void copyMatDataToReordered(
 */
 template <class T, int blocksize>
 void copyMatDataToReorderedSplit(
-    T* srcMatrix, int* srcRowIndices, int* srcColumnIndices, T* dstLowerMatrix, int* dstLowerRowIndices, T* dstUpperMatrix, int* dstUpperRowIndices, T* dstDiag, int* naturalToReordered, size_t numberOfRows);
+    T* srcMatrix, int* srcRowIndices, int* srcColumnIndices, T* dstLowerMatrix, int* dstLowerRowIndices, T* dstUpperMatrix, int* dstUpperRowIndices, T* dstDiag, int* naturalToReordered, size_t numberOfRows, int threadBlockSize);

 } // namespace Opm::cuistl::detail
 #endif
@@ -23,6 +23,7 @@
 #include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
 #include <opm/simulators/linalg/cuistl/CuVector.hpp>
 #include <stdexcept>
+#include <config.h>
 namespace Opm::cuistl::detail
 {

@@ -115,10 +116,31 @@ namespace
 }
 } // namespace

+// Kernel here is the function object of the cuda kernel
+template <class Kernel>
+inline int getCudaRecomendedThreadBlockSize(Kernel k)
+{
+#if USE_HIP
+    return 512;
+#else
+    int blockSize;
+    int tmpGridSize;
+    std::ignore = cudaOccupancyMaxPotentialBlockSize(&tmpGridSize, &blockSize, k, 0, 0);
+    return blockSize;
+#endif
+}
+
+inline int getNumberOfBlocks(int wantedThreads, int threadBlockSize)
+{
+    return (wantedThreads + threadBlockSize - 1) / threadBlockSize;
+}
+
 template <class T>
 void
 setVectorValue(T* deviceData, size_t numberOfElements, const T& value)
 {
+    int threadBlockSize = getCudaRecomendedThreadBlockSize(setVectorValueKernel<T>);
+    int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
     setVectorValueKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
         deviceData, numberOfElements, value);
 }
@@ -131,6 +153,8 @@ template <class T>
 void
 setZeroAtIndexSet(T* deviceData, size_t numberOfElements, const int* indices)
 {
+    int threadBlockSize = getCudaRecomendedThreadBlockSize(setZeroAtIndexSetKernel<T>);
+    int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
     setZeroAtIndexSetKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
         deviceData, numberOfElements, indices);
 }
@@ -142,6 +166,8 @@ template <class T>
 T
 innerProductAtIndices(cublasHandle_t cublasHandle, const T* deviceA, const T* deviceB, T* buffer, size_t numberOfElements, const int* indices)
 {
+    int threadBlockSize = getCudaRecomendedThreadBlockSize(elementWiseMultiplyKernel<T>);
+    int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
     elementWiseMultiplyKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
         deviceA, deviceB, buffer, numberOfElements, indices);

@@ -160,6 +186,8 @@ template int innerProductAtIndices(cublasHandle_t, const int*, const int*, int*
 template <class T>
 void prepareSendBuf(const T* deviceA, T* buffer, size_t numberOfElements, const int* indices)
 {
+    int threadBlockSize = getCudaRecomendedThreadBlockSize(prepareSendBufKernel<T>);
+    int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
     prepareSendBufKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(deviceA, buffer, numberOfElements, indices);
     OPM_CUDA_SAFE_CALL(cudaDeviceSynchronize()); // The buffers are prepared for MPI. Wait for them to finish.
 }
@@ -170,6 +198,8 @@ template void prepareSendBuf(const int* deviceA, int* buffer, size_t numberOfEle
 template <class T>
 void syncFromRecvBuf(T* deviceA, T* buffer, size_t numberOfElements, const int* indices)
 {
+    int threadBlockSize = getCudaRecomendedThreadBlockSize(syncFromRecvBufKernel<T>);
+    int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
     syncFromRecvBufKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(deviceA, buffer, numberOfElements, indices);
     //cudaDeviceSynchronize(); // Not needed, I guess...
 }
@@ -188,16 +218,28 @@ weightedDiagMV(const T* squareBlockVector,
 {
     switch (blocksize) {
     case 1:
+    {
+        int threadBlockSize = getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 1>);
+        int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
         weightedDiagMV<T, 1><<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
             squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
+    }
         break;
     case 2:
+    {
+        int threadBlockSize = getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 2>);
+        int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
         weightedDiagMV<T, 2><<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
             squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
+    }
         break;
     case 3:
+    {
+        int threadBlockSize = getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 3>);
+        int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
         weightedDiagMV<T, 3><<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
             squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
+    }
         break;
     default:
         OPM_THROW(std::invalid_argument, "blockvector Hadamard product not implemented for blocksize>3");
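The braces introduced around each case above are needed because C++ does not allow control flow to jump past the initialization of the new threadBlockSize/nThreadBlocks locals into a later case; giving each case its own scope makes the declarations legal. A stand-alone illustration of that rule (the values are placeholders, not OPM code):

    #include <iostream>

    // Without the per-case braces, `case 2:` would jump over the initialization of
    // threadBlockSize in `case 1:`, which is ill-formed.
    int pickBlockSize(int blocksize)
    {
        switch (blocksize) {
        case 1: {
            int threadBlockSize = 256; // scoped to this case only
            return threadBlockSize;
        }
        case 2: {
            int threadBlockSize = 512;
            return threadBlockSize;
        }
        default:
            return 1024;
        }
    }

    int main()
    {
        std::cout << pickBlockSize(1) << " " << pickBlockSize(2) << " " << pickBlockSize(3) << "\n";
        return 0;
    }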
@@ -211,7 +211,7 @@ BOOST_AUTO_TEST_CASE(TestDiluApply)

     // Initialize preconditioner objects
     Dune::MultithreadDILU<Sp1x1BlockMatrix, B1x1Vec, B1x1Vec> cpudilu(matA);
-    auto gpudilu = CuDilu1x1(matA);
+    auto gpudilu = CuDilu1x1(matA, true, true);

     // Use the apply
     gpudilu.apply(d_output, d_input);
@@ -235,7 +235,7 @@ BOOST_AUTO_TEST_CASE(TestDiluApplyBlocked)

     // init matrix with 2x2 blocks
     Sp2x2BlockMatrix matA = get2x2BlockTestMatrix();
-    auto gpudilu = CuDilu2x2(matA);
+    auto gpudilu = CuDilu2x2(matA, true, true);
     Dune::MultithreadDILU<Sp2x2BlockMatrix, B2x2Vec, B2x2Vec> cpudilu(matA);

     // create input/output buffers for the apply
@@ -275,7 +275,7 @@ BOOST_AUTO_TEST_CASE(TestDiluInitAndUpdateLarge)
 {
     // create gpu dilu preconditioner
     Sp1x1BlockMatrix matA = get1x1BlockTestMatrix();
-    auto gpudilu = CuDilu1x1(matA);
+    auto gpudilu = CuDilu1x1(matA, true, true);

     matA[0][0][0][0] = 11.0;
     matA[0][1][0][0] = 12.0;