mirror of https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
Use CUDA graphs in GPU DILU and ILU0 apply

Expected to give a performance gain on current NVIDIA cards and no significant change on AMD cards for cases of reasonable size. Uses the new GPU convenience objects.
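For orientation before the diff: the pattern this commit introduces is to record the many small kernel launches of a preconditioner apply into a CUDA graph once, then replay the instantiated graph on every later call, with events fencing a private stream against the default stream. The following minimal sketch illustrates that pattern. It is not OPM code; the kernel, helper, and variable names (axpy, applyOnce, cache) are made up for the example, and error checking is omitted for brevity.

#include <cuda_runtime.h>
#include <cstdio>
#include <map>
#include <utility>

__global__ void axpy(double a, const double* x, double* y, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        y[i] += a * x[i];
    }
}

// Stand-in for a preconditioner apply: many small launches on one stream.
void applyOnce(double* v, const double* d, int n, cudaStream_t s)
{
    for (int k = 0; k < 8; ++k) { // eight tiny kernels, like the level-set sweeps
        axpy<<<(n + 255) / 256, 256, 0, s>>>(0.125, d, v, n);
    }
}

int main()
{
    const int n = 1 << 20;
    double* v = nullptr;
    double* d = nullptr;
    cudaMalloc(&v, n * sizeof(double));
    cudaMalloc(&d, n * sizeof(double));

    cudaStream_t stream; // private stream, like m_stream in the diff
    cudaEvent_t before;  // like m_before
    cudaEvent_t after;   // like m_after
    cudaStreamCreate(&stream);
    cudaEventCreate(&before);
    cudaEventCreate(&after);

    // One executable graph per (output, input) pointer pair, like m_executableGraphs.
    std::map<std::pair<double*, const double*>, cudaGraphExec_t> cache;

    for (int call = 0; call < 100; ++call) {
        // The private stream must not run ahead of work already queued on the default stream.
        cudaEventRecord(before, 0);
        cudaStreamWaitEvent(stream, before, 0);

        const auto key = std::make_pair(v, static_cast<const double*>(d));
        auto it = cache.find(key);
        if (it == cache.end()) {
            // First call: record the launches into a graph instead of running them.
            cudaGraph_t graph;
            cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
            applyOnce(v, d, n, stream);
            cudaStreamEndCapture(stream, &graph);

            cudaGraphExec_t exec;
            cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0);
            cudaGraphDestroy(graph); // the executable graph keeps what it needs
            it = cache.emplace(key, exec).first;
        }
        // Later calls: one launch replays all eight kernels.
        cudaGraphLaunch(it->second, stream);

        // The default stream must not run ahead of the replayed graph.
        cudaEventRecord(after, stream);
        cudaStreamWaitEvent(0, after, 0);
    }

    cudaDeviceSynchronize();
    std::printf("done\n");

    cudaFree(v);
    cudaFree(d);
    return 0;
}

Replaying the instantiated graph turns the per-call launch overhead of many tiny kernels into a single cudaGraphLaunch, which is where the expected gain on NVIDIA cards comes from. The graphs are cached per (output, input) pointer pair because the device addresses are baked into the captured graph at capture time.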
This commit is contained in:
parent 2babeb848d
commit 66fa971079
@@ -188,6 +188,7 @@ GpuBuffer<T>::copyFromHost(const std::vector<T>& data)
{
    copyFromHost(data.data(), data.size());
}

template <class T>
void
GpuBuffer<T>::copyToHost(std::vector<T>& data) const
@@ -27,6 +27,7 @@
#include <opm/simulators/linalg/gpuistl/GpuView.hpp>
#include <vector>
#include <string>
#include <cuda_runtime.h>


namespace Opm::gpuistl
@@ -37,6 +37,7 @@
#include <functional>
#include <utility>
#include <string>

namespace Opm::gpuistl
{

@@ -93,7 +94,8 @@ GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int
        }
    }

-    computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
+    reorderAndSplitMatrix(m_moveThreadBlockSize);
+    computeDiagonal(m_DILUFactorizationThreadBlockSize);

    if (m_tuneThreadBlockSizes) {
        tuneThreadBlockSizes();
@@ -110,10 +112,31 @@ template <class M, class X, class Y, int l>
void
GpuDILU<M, X, Y, l>::apply(X& v, const Y& d)
{
+    // ensure that this stream only starts doing work when main stream is completed up to this point
+    OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0));
+    OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0));
+
    OPM_TIMEBLOCK(prec_apply);
    {
-        apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+        const auto ptrs = std::make_pair(v.data(), d.data());
+
+        auto it = m_apply_graphs.find(ptrs);
+
+        if (it == m_apply_graphs.end()) {
+            OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream.get(), cudaStreamCaptureModeGlobal));
+
+            // The apply function consists of many small function calls, each of which launches a kernel
+            apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+
+            OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream.get(), &m_apply_graphs[ptrs].get()));
+            OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs].get(), m_apply_graphs[ptrs].get(), nullptr, nullptr, 0));
+        }
+        OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs].get(), 0));
    }
+
+    // ensure that main stream only continues after this stream is completed
+    OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get()));
+    OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0));
}

template <class M, class X, class Y, int l>
@@ -135,7 +158,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                m_gpuDInvFloat->data(),
                d.data(),
                v.data(),
-                lowerSolveThreadBlockSize);
+                lowerSolveThreadBlockSize,
+                m_stream.get());
        } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
            detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float, field_type>(
                m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
@@ -147,7 +171,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                m_gpuDInv.data(),
                d.data(),
                v.data(),
-                lowerSolveThreadBlockSize);
+                lowerSolveThreadBlockSize,
+                m_stream.get());
        } else {
            detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, field_type, field_type>(
                m_gpuMatrixReorderedLower->getNonZeroValues().data(),
@@ -159,7 +184,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                m_gpuDInv.data(),
                d.data(),
                v.data(),
-                lowerSolveThreadBlockSize);
+                lowerSolveThreadBlockSize,
+                m_stream.get());
        }
    } else {
        detail::DILU::solveLowerLevelSet<field_type, blocksize_>(
@@ -172,7 +198,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
            m_gpuDInv.data(),
            d.data(),
            v.data(),
-            lowerSolveThreadBlockSize);
+            lowerSolveThreadBlockSize,
+            m_stream.get());
    }
    levelStartIdx += numOfRowsInLevel;
}
@@ -193,7 +220,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                numOfRowsInLevel,
                m_gpuDInvFloat->data(),
                v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream.get());
        } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
            detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, float>(
                m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
@@ -204,7 +232,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                numOfRowsInLevel,
                m_gpuDInv.data(),
                v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream.get());
        } else {
            detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, field_type>(
                m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
@@ -215,7 +244,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                numOfRowsInLevel,
                m_gpuDInv.data(),
                v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream.get());
        }
    } else {
        detail::DILU::solveUpperLevelSet<field_type, blocksize_>(
@@ -227,7 +257,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
            numOfRowsInLevel,
            m_gpuDInv.data(),
            v.data(),
-            upperSolveThreadBlockSize);
+            upperSolveThreadBlockSize,
+            m_stream.get());
        }
    }
}
@@ -252,7 +283,9 @@ GpuDILU<M, X, Y, l>::update()
{
    OPM_TIMEBLOCK(prec_update);
    {
-        update(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
+        m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
+        reorderAndSplitMatrix(m_moveThreadBlockSize);
+        computeDiagonal(m_DILUFactorizationThreadBlockSize);
    }
}

@@ -260,13 +293,22 @@ template <class M, class X, class Y, int l>
void
GpuDILU<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationBlockSize)
{
-    m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
-    computeDiagAndMoveReorderedData(moveThreadBlockSize, factorizationBlockSize);
+    // ensure that this stream only starts doing work when main stream is completed up to this point
+    OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0));
+    OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0));
+
+    m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
+    reorderAndSplitMatrix(moveThreadBlockSize);
+    computeDiagonal(factorizationBlockSize);
+
+    // ensure that main stream only continues after this stream is completed
+    OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get()));
+    OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0));
}

template <class M, class X, class Y, int l>
void
-GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationBlockSize)
+GpuDILU<M, X, Y, l>::reorderAndSplitMatrix(int moveThreadBlockSize)
{
    if (m_splitMatrix) {
        detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
@@ -290,7 +332,12 @@ GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in
            m_gpuMatrixReordered->N(),
            moveThreadBlockSize);
    }
+}

+template <class M, class X, class Y, int l>
+void
+GpuDILU<M, X, Y, l>::computeDiagonal(int factorizationBlockSize)
+{
    int levelStartIdx = 0;
    for (int level = 0; level < m_levelSets.size(); ++level) {
        const int numOfRowsInLevel = m_levelSets[level].size();
@@ -24,8 +24,10 @@
#include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
#include <opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp>
#include <opm/simulators/linalg/gpuistl/gpu_resources.hpp>
#include <vector>

#include <map>
#include <utility>


namespace Opm::gpuistl
@@ -82,8 +84,11 @@ public:
    //! \brief Updates the matrix data.
    void update() final;

+    //! \brief perform matrix splitting and reordering
+    void reorderAndSplitMatrix(int moveThreadBlockSize);
+
    //! \brief Compute the diagonal of the DILU, and update the data of the reordered matrix
-    void computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationThreadBlockSize);
+    void computeDiagonal(int factorizationThreadBlockSize);

    //! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
    void tuneThreadBlockSizes();
@@ -153,6 +158,16 @@ private:
    int m_lowerSolveThreadBlockSize = -1;
    int m_moveThreadBlockSize = -1;
    int m_DILUFactorizationThreadBlockSize = -1;

+    // Graphs for Apply
+    std::map<std::pair<field_type*, const field_type*>, GPUGraph> m_apply_graphs;
+    std::map<std::pair<field_type*, const field_type*>, GPUGraphExec> m_executableGraphs;
+
+    // Stream for the DILU operations on the GPU
+    GPUStream m_stream{};
+    // Events for synchronization with main stream
+    GPUEvent m_before{};
+    GPUEvent m_after{};
};
} // end namespace Opm::gpuistl

@@ -266,6 +266,19 @@ GpuVector<T>::copyFromHost(const T* dataPointer, size_t numberOfElements)
    OPM_GPU_SAFE_CALL(cudaMemcpy(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice));
}

+template <class T>
+void
+GpuVector<T>::copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream)
+{
+    if (numberOfElements > dim()) {
+        OPM_THROW(std::runtime_error,
+                  fmt::format("Requesting to copy too many elements. Vector has {} elements, while {} was requested.",
+                              dim(),
+                              numberOfElements));
+    }
+    OPM_GPU_SAFE_CALL(cudaMemcpyAsync(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice, stream));
+}
+
template <class T>
void
GpuVector<T>::copyToHost(T* dataPointer, size_t numberOfElements) const
@@ -203,6 +203,7 @@ public:
     * @note assumes that this vector has numberOfElements elements
     */
    void copyFromHost(const T* dataPointer, size_t numberOfElements);
+    void copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream);

    /**
     * @brief copyFromHost copies numberOfElements to the CPU memory dataPointer
@@ -37,6 +37,7 @@
#include <string>
#include <tuple>
#include <utility>

namespace Opm::gpuistl
{

@@ -58,7 +59,6 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
    , m_tuneThreadBlockSizes(tuneKernels)
    , m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme))
{
-
    // TODO: Should we somehow verify that this matrix is symmetric, and only do it in debug mode?
    // Some sanity check
    OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
@@ -98,7 +98,8 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
        }
    }

-    LUFactorizeAndMoveData(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
+    reorderAndSplitMatrix(m_moveThreadBlockSize);
+    LUFactorizeMatrix(m_ILU0FactorizationThreadBlockSize);

    if (m_tuneThreadBlockSizes) {
        tuneThreadBlockSizes();
@@ -117,7 +118,29 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d)
{
    OPM_TIMEBLOCK(prec_apply);
    {
-        apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+        // ensure that this stream only starts doing work when main stream is completed up to this point
+        OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0));
+        OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0));
+
+        const auto ptrs = std::make_pair(v.data(), d.data());
+
+        auto it = m_apply_graphs.find(ptrs);
+
+        if (it == m_apply_graphs.end()) {
+            OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream.get(), cudaStreamCaptureModeGlobal));
+
+            // The apply function consists of many small function calls, each of which launches a kernel
+            apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+
+            OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream.get(), &m_apply_graphs[ptrs].get()));
+            OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs].get(), m_apply_graphs[ptrs].get(), nullptr, nullptr, 0));
+        }
+        OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs].get(), 0));
+
+
+        // ensure that main stream only continues after this stream is completed
+        OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get()));
+        OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0));
    }
}

@@ -142,7 +165,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                numOfRowsInLevel,
                d.data(),
                v.data(),
-                lowerSolveThreadBlockSize);
+                lowerSolveThreadBlockSize,
+                m_stream.get());
            }
            else {
                detail::ILU0::solveLowerLevelSetSplit<blocksize_, field_type, field_type>(
@@ -154,7 +178,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                numOfRowsInLevel,
                d.data(),
                v.data(),
-                lowerSolveThreadBlockSize);
+                lowerSolveThreadBlockSize,
+                m_stream.get());
            }
        } else {
            detail::ILU0::solveLowerLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
@@ -165,7 +190,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
            numOfRowsInLevel,
            d.data(),
            v.data(),
-            lowerSolveThreadBlockSize);
+            lowerSolveThreadBlockSize,
+            m_stream.get());
        }
        levelStartIdx += numOfRowsInLevel;
    }
@@ -185,7 +211,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                numOfRowsInLevel,
                m_gpuMatrixReorderedDiagFloat.value().data(),
                v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream.get());
            }
            else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
                detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, float, field_type>(
@@ -197,7 +224,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                numOfRowsInLevel,
                m_gpuMatrixReorderedDiag.value().data(),
                v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream.get());
            }
            else {
                detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, field_type, field_type>(
@@ -209,7 +237,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                numOfRowsInLevel,
                m_gpuMatrixReorderedDiag.value().data(),
                v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream.get());
            }
        } else {
            detail::ILU0::solveUpperLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
@@ -219,7 +248,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
            levelStartIdx,
            numOfRowsInLevel,
            v.data(),
-            upperSolveThreadBlockSize);
+            upperSolveThreadBlockSize,
+            m_stream.get());
        }
    }
}
@@ -241,10 +271,9 @@ template <class M, class X, class Y, int l>
void
OpmGpuILU0<M, X, Y, l>::update()
{
    OPM_TIMEBLOCK(prec_update);
-    {
-        update(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
-    }
+    m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
+    reorderAndSplitMatrix(m_moveThreadBlockSize);
+    LUFactorizeMatrix(m_ILU0FactorizationThreadBlockSize);
}

template <class M, class X, class Y, int l>
@@ -253,13 +282,23 @@ OpmGpuILU0<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationThreadB
{
    OPM_TIMEBLOCK(prec_update);
    {
+        // ensure that this stream only starts doing work when main stream is completed up to this point
+        OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0));
+        OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0));
+
        m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
-        LUFactorizeAndMoveData(moveThreadBlockSize, factorizationThreadBlockSize);
+        reorderAndSplitMatrix(moveThreadBlockSize);
+        LUFactorizeMatrix(factorizationThreadBlockSize);
+
+        // ensure that main stream only continues after this stream is completed
+        OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get()));
+        OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0));
    }
}

template <class M, class X, class Y, int l>
void
-OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize)
+OpmGpuILU0<M, X, Y, l>::reorderAndSplitMatrix(int moveThreadBlockSize)
{
    if (m_splitMatrix) {
        detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
@@ -283,6 +322,12 @@ OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
            m_gpuReorderedLU->N(),
            moveThreadBlockSize);
    }
+}

+template <class M, class X, class Y, int l>
+void
+OpmGpuILU0<M, X, Y, l>::LUFactorizeMatrix(int factorizationThreadBlockSize)
+{
    int levelStartIdx = 0;
    for (int level = 0; level < m_levelSets.size(); ++level) {
        const int numOfRowsInLevel = m_levelSets[level].size();
@@ -24,6 +24,7 @@
#include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
+#include <opm/simulators/linalg/gpuistl/gpu_resources.hpp>
#include <opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp>
#include <optional>
#include <type_traits>
@@ -84,8 +85,11 @@ public:
    //! \brief Updates the matrix data.
    void update() final;

+    //! \brief perform matrix splitting and reordering
+    void reorderAndSplitMatrix(int moveThreadBlockSize);
+
    //! \brief Compute LU factorization, and update the data of the reordered matrix
-    void LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize);
+    void LUFactorizeMatrix(int factorizationThreadBlockSize);

    //! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
    void tuneThreadBlockSizes();
@@ -152,6 +156,16 @@ private:
    int m_lowerSolveThreadBlockSize = -1;
    int m_moveThreadBlockSize = -1;
    int m_ILU0FactorizationThreadBlockSize = -1;

+    // Graphs for Apply
+    std::map<std::pair<field_type*, const field_type*>, GPUGraph> m_apply_graphs;
+    std::map<std::pair<field_type*, const field_type*>, GPUGraphExec> m_executableGraphs;
+
+    // Stream for the ILU0 operations on the GPU
+    GPUStream m_stream{};
+    // Events for synchronization with main stream
+    GPUEvent m_before{};
+    GPUEvent m_after{};
};
} // end namespace Opm::gpuistl

@@ -85,10 +85,12 @@ namespace
        // TODO: remove the first condition in the for loop
        for (int block = nnzIdx; block < nnzIdxLim; ++block) {
            const int col = colIndices[block];
-            mmvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
+            mmvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
+                &mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }

-        mvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
+        mvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
+            &dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
    }
}

@@ -137,10 +139,12 @@ namespace
        LinearSolverScalar rhs[blocksize] = {0};
        for (int block = nnzIdx; block < nnzIdxLim; ++block) {
            const int col = colIndices[block];
-            umvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
+            umvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
+                &mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }

-        mmvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
+        mmvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
+            &dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
    }
}

@@ -211,8 +215,8 @@ namespace
        }
    }

-    // TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate results
-    // TODO: The important part is to only cast after that is fully computed
+    // TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate
+    // results. The important part is to only cast after that is fully computed
    template <int blocksize, class InputScalar, class OutputScalar, MatrixStorageMPScheme mixedPrecisionScheme>
    __global__ void cuComputeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
                                               int* lowerRowIndices,
@@ -239,7 +243,8 @@ namespace
        InputScalar dInvTmp[blocksize * blocksize];
        for (int i = 0; i < blocksize; ++i) {
            for (int j = 0; j < blocksize; ++j) {
-                dInvTmp[i * blocksize + j] = srcDiagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
+                dInvTmp[i * blocksize + j]
+                    = srcDiagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
            }
        }

@@ -257,21 +262,26 @@ namespace

            if constexpr (detail::storeOffDiagonalAsFloat(mixedPrecisionScheme)) {
                // TODO: think long and hard about whether this performs only the wanted memory transfers
-                moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedLowerMat[block * blocksize * blocksize], &dstLowerMat[block * blocksize * blocksize]);
-                moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize], &dstUpperMat[symOppositeBlock * blocksize * blocksize]);
+                moveBlock<blocksize, InputScalar, OutputScalar>(
+                    &srcReorderedLowerMat[block * blocksize * blocksize],
+                    &dstLowerMat[block * blocksize * blocksize]);
+                moveBlock<blocksize, InputScalar, OutputScalar>(
+                    &srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
+                    &dstUpperMat[symOppositeBlock * blocksize * blocksize]);
            }

            mmx2Subtraction<InputScalar, blocksize>(&srcReorderedLowerMat[block * blocksize * blocksize],
-                                                    &dInv[col * blocksize * blocksize],
-                                                    &srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
-                                                    dInvTmp);
+                &dInv[col * blocksize * blocksize],
+                &srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
+                dInvTmp);
        }

        invBlockInPlace<InputScalar, blocksize>(dInvTmp);
        moveBlock<blocksize, InputScalar, InputScalar>(dInvTmp, &dInv[reorderedRowIdx * blocksize * blocksize]);

        if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)) {
-            moveBlock<blocksize, InputScalar, OutputScalar>(dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important!
+            moveBlock<blocksize, InputScalar, OutputScalar>(
+                dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important!
        }
    }
}
@@ -289,12 +299,13 @@ solveLowerLevelSet(T* reorderedMat,
                   const T* dInv,
                   const T* d,
                   T* v,
-                   int thrBlockSize)
+                   int thrBlockSize,
+                   cudaStream_t stream)
{
    int threadBlockSize
        = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}

@@ -310,13 +321,15 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
                        const DiagonalScalar* dInv,
                        const LinearSolverScalar* d,
                        LinearSolverScalar* v,
-                        int thrBlockSize)
+                        int thrBlockSize,
+                        cudaStream_t stream)
{
    int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
        cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
-        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
+    cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>
+        <<<nThreadBlocks, threadBlockSize, 0, stream>>>(
+            reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}
// perform the upper solve for all rows in the same level set
template <class T, int blocksize>
@@ -329,12 +342,13 @@ solveUpperLevelSet(T* reorderedMat,
                   int rowsInLevelSet,
                   const T* dInv,
                   T* v,
-                   int thrBlockSize)
+                   int thrBlockSize,
+                   cudaStream_t stream)
{
    int threadBlockSize
        = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}

@@ -348,13 +362,15 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                        int rowsInLevelSet,
                        const DiagonalScalar* dInv,
                        LinearSolverScalar* v,
-                        int thrBlockSize)
+                        int thrBlockSize,
+                        cudaStream_t stream)
{
    int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
        cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
-        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
+    cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>
+        <<<nThreadBlocks, threadBlockSize, 0, stream>>>(
+            reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}

template <class T, int blocksize>
@@ -409,43 +425,134 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
        int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
            cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, scheme>, thrBlockSize);
        int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-        cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, scheme><<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
+        cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, scheme>
+            <<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
                                                 lowerRowIndices,
                                                 lowerColIndices,
                                                 srcReorderedUpperMat,
                                                 upperRowIndices,
                                                 upperColIndices,
                                                 srcDiagonal,
                                                 reorderedToNatural,
                                                 naturalToReordered,
                                                 startIdx,
                                                 rowsInLevelSet,
                                                 dInv,
                                                 dstDiag,
                                                 dstLowerMat,
                                                 dstUpperMat);
    } else {
        OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
    }
}

-// TODO: format
#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
    template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*, int); \
    template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
    template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
    template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
    template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
    template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
    template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
-    template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
-    template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);
+    template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \
+    template void solveLowerLevelSet<T, blocksize>( \
+        T*, int*, int*, int*, int, int, const T*, const T*, T*, int, cudaStream_t);

INSTANTIATE_KERNEL_WRAPPERS(float, 1);
INSTANTIATE_KERNEL_WRAPPERS(float, 2);
@@ -460,19 +567,29 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 4);
INSTANTIATE_KERNEL_WRAPPERS(double, 5);
INSTANTIATE_KERNEL_WRAPPERS(double, 6);

#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar) \
    template void solveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>( \
-        MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int); \
+        MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int, cudaStream_t); \
    template void solveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>( \
-        MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, const LinearSolverScalar*, LinearSolverScalar*, int);
+        MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, const LinearSolverScalar*, \
+        LinearSolverScalar*, int, cudaStream_t);

// TODO: be smarter about this... Surely this instantiates many more combinations than are actually needed
#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \
    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, float); \
    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, float); \
    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, float); \
    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, double); \
    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, double); \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, double);

INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(1);
@@ -54,7 +54,8 @@ void solveLowerLevelSet(T* reorderedMat,
                        const T* dInv,
                        const T* d,
                        T* v,
-                        int threadBlockSize);
+                        int threadBlockSize,
+                        cudaStream_t stream);

/**
 * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
@@ -82,7 +83,8 @@ void solveLowerLevelSetSplit(MatrixScalar* reorderedUpperMat,
                             const DiagonalScalar* dInv,
                             const LinearSolverScalar* d,
                             LinearSolverScalar* v,
-                             int threadBlockSize);
+                             int threadBlockSize,
+                             cudaStream_t stream);

/**
 * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
@@ -108,7 +110,8 @@ void solveUpperLevelSet(T* reorderedMat,
                        int rowsInLevelSet,
                        const T* dInv,
                        T* v,
-                        int threadBlockSize);
+                        int threadBlockSize,
+                        cudaStream_t stream);

/**
 * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
@@ -134,7 +137,8 @@ void solveUpperLevelSetSplit(MatrixScalar* reorderedUpperMat,
                             int rowsInLevelSet,
                             const DiagonalScalar* dInv,
                             LinearSolverScalar* v,
-                             int threadBlockSize);
+                             int threadBlockSize,
+                             cudaStream_t stream);

/**
 * @brief Computes the ILU0 of the diagonal elements of the reordered matrix and stores it in a reordered vector
@@ -215,9 +215,9 @@ namespace
        // as of now, if we are using mixed precision, then we are always storing the off-diagonals as floats,
        // and sometimes also the diagonal.
        if constexpr (detail::usingMixedPrecision(mixedPrecisionScheme)) {
-            // if we are want to store the entire matrix as a float then we must also move the diagonal block from double to float
-            // if not then we just use the double diagonal that is already now stored in srcDiagonal
-            if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)){
+            // if we want to store the entire matrix as a float then we must also move the diagonal block from
+            // double to float; if not, we just use the double diagonal that is already stored in srcDiagonal
+            if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)) {
                moveBlock<blocksize, InputScalar, OutputScalar>(&srcDiagonal[reorderedIdx * scalarsInBlock],
                                                                &dstDiagonal[reorderedIdx * scalarsInBlock]);
            }
@@ -362,12 +362,13 @@ solveLowerLevelSet(T* reorderedMat,
                   int rowsInLevelSet,
                   const T* d,
                   T* v,
-                   int thrBlockSize)
+                   int thrBlockSize,
+                   cudaStream_t stream)
{
    int threadBlockSize
        = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
}
// perform the upper solve for all rows in the same level set
@@ -380,12 +381,13 @@ solveUpperLevelSet(T* reorderedMat,
                   int startIdx,
                   int rowsInLevelSet,
                   T* v,
-                   int thrBlockSize)
+                   int thrBlockSize,
+                   cudaStream_t stream)
{
    int threadBlockSize
        = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, v);
}

@@ -399,13 +401,15 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
                        int rowsInLevelSet,
                        const LinearSolverScalar* d,
                        LinearSolverScalar* v,
-                        int thrBlockSize)
+                        int thrBlockSize,
+                        cudaStream_t stream)
{
    int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
        cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>, thrBlockSize);
    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar><<<nThreadBlocks, threadBlockSize>>>(
-        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
+    cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>
+        <<<nThreadBlocks, threadBlockSize, 0, stream>>>(
+            reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
}
// perform the upper solve for all rows in the same level set
template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
@@ -418,13 +422,15 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                        int rowsInLevelSet,
                        const DiagonalScalar* dInv,
                        LinearSolverScalar* v,
-                        int thrBlockSize)
+                        int thrBlockSize,
+                        cudaStream_t stream)
{
    int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
        cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
-        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
+    cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>
+        <<<nThreadBlocks, threadBlockSize, 0, stream>>>(
+            reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}

template <class T, int blocksize>
@@ -484,28 +490,28 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
}

#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
-    template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, T*, int); \
-    template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
+    template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, T*, int, cudaStream_t); \
+    template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \
    template void LUFactorization<T, blocksize>(T*, int*, int*, int*, int*, size_t, int, int); \
    template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
        T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
    template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
    template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
        T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
    template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
    template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
        T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
    template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);

#define INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(T) \
    INSTANTIATE_KERNEL_WRAPPERS(T, 1); \
    INSTANTIATE_KERNEL_WRAPPERS(T, 2); \
    INSTANTIATE_KERNEL_WRAPPERS(T, 3); \
    INSTANTIATE_KERNEL_WRAPPERS(T, 4); \
    INSTANTIATE_KERNEL_WRAPPERS(T, 5); \
    INSTANTIATE_KERNEL_WRAPPERS(T, 6);

INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(float)
@@ -514,32 +520,32 @@ INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(double)
#define INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(blocksize) \
    /* double preconditioner */ \
    template void solveLowerLevelSetSplit<blocksize, double, double>( \
-        double*, int*, int*, int*, int, int, const double*, double*, int); \
+        double*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
    /* float matrix, double compute preconditioner */ \
    template void solveLowerLevelSetSplit<blocksize, double, float>( \
-        float*, int*, int*, int*, int, int, const double*, double*, int); \
+        float*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
    /* float preconditioner */ \
    template void solveLowerLevelSetSplit<blocksize, float, float>( \
-        float*, int*, int*, int*, int, int, const float*, float*, int); \
+        float*, int*, int*, int*, int, int, const float*, float*, int, cudaStream_t); \
    \
    /* double preconditioner */ \
    template void solveUpperLevelSetSplit<blocksize, double, double, double>( \
-        double*, int*, int*, int*, int, int, const double*, double*, int); \
+        double*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
    /* float matrix, double compute preconditioner */ \
    template void solveUpperLevelSetSplit<blocksize, double, float, double>( \
-        float*, int*, int*, int*, int, int, const double*, double*, int); \
+        float*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
    /* float preconditioner */ \
    template void solveUpperLevelSetSplit<blocksize, float, float, double>( \
-        float*, int*, int*, int*, int, int, const double*, float*, int); \
+        float*, int*, int*, int*, int, int, const double*, float*, int, cudaStream_t); \
    /* double preconditioner */ \
    template void solveUpperLevelSetSplit<blocksize, double, double, float>( \
-        double*, int*, int*, int*, int, int, const float*, double*, int); \
+        double*, int*, int*, int*, int, int, const float*, double*, int, cudaStream_t); \
    /* float matrix, double compute preconditioner */ \
    template void solveUpperLevelSetSplit<blocksize, double, float, float>( \
-        float*, int*, int*, int*, int, int, const float*, double*, int); \
+        float*, int*, int*, int*, int, int, const float*, double*, int, cudaStream_t); \
    /* float preconditioner */ \
    template void solveUpperLevelSetSplit<blocksize, float, float, float>( \
-        float*, int*, int*, int*, int, int, const float*, float*, int);
+        float*, int*, int*, int*, int, int, const float*, float*, int, cudaStream_t);


INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(1);
@@ -46,7 +46,8 @@ void solveUpperLevelSet(T* reorderedMat,
                        int startIdx,
                        int rowsInLevelSet,
                        T* v,
-                        int threadBlockSize);
+                        int threadBlockSize,
+                        cudaStream_t stream);

/**
 * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
@@ -72,7 +73,8 @@ void solveLowerLevelSet(T* reorderedMat,
                        int rowsInLevelSet,
                        const T* d,
                        T* v,
-                        int threadBlockSize);
+                        int threadBlockSize,
+                        cudaStream_t stream);

/**
 * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
@@ -99,7 +101,8 @@ void solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                             int rowsInLevelSet,
                             const DiagonalScalar* dInv,
                             LinearSolverScalar* v,
-                             int threadBlockSize);
+                             int threadBlockSize,
+                             cudaStream_t stream);

/**
 * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
@@ -127,7 +130,8 @@ void solveLowerLevelSetSplit(MatrixScalar* reorderedLowerMat,
                             int rowsInLevelSet,
                             const LinearSolverScalar* d,
                             LinearSolverScalar* v,
-                             int threadBlockSize);
+                             int threadBlockSize,
+                             cudaStream_t stream);

/**
 * @brief Computes the ILU Factorization of the input bcsr matrix, which is stored in a reordered way. The diagonal