mirror of https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00

Merge pull request #5852 from multitalentloes/cudagraph_experiment

Improve gpuistl using cudaGraphs

commit b99b606575
@@ -188,6 +188,7 @@ GpuBuffer<T>::copyFromHost(const std::vector<T>& data)
 {
     copyFromHost(data.data(), data.size());
 }

 template <class T>
 void
 GpuBuffer<T>::copyToHost(std::vector<T>& data) const
@@ -27,6 +27,7 @@
 #include <opm/simulators/linalg/gpuistl/GpuView.hpp>
 #include <vector>
 #include <string>
 #include <cuda_runtime.h>


 namespace Opm::gpuistl
@@ -37,6 +37,7 @@
 #include <functional>
 #include <utility>
 #include <string>

 namespace Opm::gpuistl
 {
@@ -93,7 +94,8 @@ GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int
         }
     }

-    computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
+    reorderAndSplitMatrix(m_moveThreadBlockSize);
+    computeDiagonal(m_DILUFactorizationThreadBlockSize);

     if (m_tuneThreadBlockSizes) {
         tuneThreadBlockSizes();
@@ -110,10 +112,31 @@ template <class M, class X, class Y, int l>
 void
 GpuDILU<M, X, Y, l>::apply(X& v, const Y& d)
 {
+    // ensure that this stream only starts doing work when main stream is completed up to this point
+    OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0));
+    OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0));
+
     OPM_TIMEBLOCK(prec_apply);
     {
-        apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+        const auto ptrs = std::make_pair(v.data(), d.data());
+
+        auto it = m_apply_graphs.find(ptrs);
+
+        if (it == m_apply_graphs.end()) {
+            OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream.get(), cudaStreamCaptureModeGlobal));
+
+            // The apply functions contains lots of small function calls which call a kernel each
+            apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+
+            OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream.get(), &m_apply_graphs[ptrs].get()));
+            OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs].get(), m_apply_graphs[ptrs].get(), nullptr, nullptr, 0));
+        }
+        OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs].get(), 0));
    }

+    // ensure that main stream only continues after this stream is completed
+    OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get()));
+    OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0));
 }

 template <class M, class X, class Y, int l>
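Aside: the capture/instantiate/launch sequence above is the standard CUDA graph idiom — record the many small kernel launches once, then replay them all with a single cudaGraphLaunch. A minimal self-contained sketch of the same pattern, with illustrative names that are not part of this PR:

#include <cuda_runtime.h>

__global__ void scaleKernel(float* x, float a, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] *= a;
    }
}

// Capture two launches into a graph the first time, replay the graph afterwards.
void applyWithGraph(cudaStream_t stream, float* x, int n)
{
    static cudaGraphExec_t exec = nullptr;
    if (exec == nullptr) {
        cudaGraph_t graph;
        // While capturing, launches on `stream` are recorded into `graph` instead of executing.
        cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
        scaleKernel<<<(n + 255) / 256, 256, 0, stream>>>(x, 2.0f, n);
        scaleKernel<<<(n + 255) / 256, 256, 0, stream>>>(x, 0.5f, n);
        cudaStreamEndCapture(stream, &graph);
        // One-time instantiation cost; every replay afterwards is a single cheap launch.
        cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0);
        cudaGraphDestroy(graph);
    }
    cudaGraphLaunch(exec, stream);
}

Because kernel arguments are baked into the captured graph, a replay is only valid for the pointers seen at capture time — which is why the preconditioner keys its graphs on the (v.data(), d.data()) pair.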
@@ -135,7 +158,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                 m_gpuDInvFloat->data(),
                 d.data(),
                 v.data(),
-                lowerSolveThreadBlockSize);
+                lowerSolveThreadBlockSize,
+                m_stream.get());
         } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
             detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float, field_type>(
                 m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
@@ -147,7 +171,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                 m_gpuDInv.data(),
                 d.data(),
                 v.data(),
-                lowerSolveThreadBlockSize);
+                lowerSolveThreadBlockSize,
+                m_stream.get());
         } else {
             detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, field_type, field_type>(
                 m_gpuMatrixReorderedLower->getNonZeroValues().data(),
@@ -159,7 +184,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                 m_gpuDInv.data(),
                 d.data(),
                 v.data(),
-                lowerSolveThreadBlockSize);
+                lowerSolveThreadBlockSize,
+                m_stream.get());
         }
     } else {
         detail::DILU::solveLowerLevelSet<field_type, blocksize_>(
@@ -172,7 +198,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
             m_gpuDInv.data(),
             d.data(),
             v.data(),
-            lowerSolveThreadBlockSize);
+            lowerSolveThreadBlockSize,
+            m_stream.get());
     }
     levelStartIdx += numOfRowsInLevel;
 }
@@ -193,7 +220,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                 numOfRowsInLevel,
                 m_gpuDInvFloat->data(),
                 v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream.get());
         } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG){
             detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, float>(
                 m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
@@ -204,7 +232,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                 numOfRowsInLevel,
                 m_gpuDInv.data(),
                 v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream.get());
         } else {
             detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, field_type>(
                 m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
@@ -215,7 +244,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                 numOfRowsInLevel,
                 m_gpuDInv.data(),
                 v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream.get());
         }
     } else {
         detail::DILU::solveUpperLevelSet<field_type, blocksize_>(
@@ -227,7 +257,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
             numOfRowsInLevel,
             m_gpuDInv.data(),
             v.data(),
-            upperSolveThreadBlockSize);
+            upperSolveThreadBlockSize,
+            m_stream.get());
     }
 }
@@ -252,7 +283,9 @@ GpuDILU<M, X, Y, l>::update()
 {
     OPM_TIMEBLOCK(prec_update);
     {
-        update(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
+        m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
+        reorderAndSplitMatrix(m_moveThreadBlockSize);
+        computeDiagonal(m_DILUFactorizationThreadBlockSize);
     }
 }

@@ -260,13 +293,22 @@ template <class M, class X, class Y, int l>
 void
 GpuDILU<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationBlockSize)
 {
-    m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
-    computeDiagAndMoveReorderedData(moveThreadBlockSize, factorizationBlockSize);
+    // ensure that this stream only starts doing work when main stream is completed up to this point
+    OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0));
+    OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0));
+
+    m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
+    reorderAndSplitMatrix(moveThreadBlockSize);
+    computeDiagonal(factorizationBlockSize);
+
+    // ensure that main stream only continues after this stream is completed
+    OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get()));
+    OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0));
 }

 template <class M, class X, class Y, int l>
 void
-GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationBlockSize)
+GpuDILU<M, X, Y, l>::reorderAndSplitMatrix(int moveThreadBlockSize)
 {
     if (m_splitMatrix) {
         detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
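Aside: both apply() and update() bracket their work with the same event handshake, so the private stream behaves as a fork–join region relative to the default stream. The idiom in isolation (hypothetical helper, not PR code):

#include <cuda_runtime.h>

// Fork: worker waits for everything already enqueued on the default stream.
// Join: default stream waits for everything enqueued on worker.
template <class F>
void runOnSideStream(cudaStream_t worker, cudaEvent_t before, cudaEvent_t after, F&& enqueueWork)
{
    cudaEventRecord(before, 0);             // mark the default stream's current position
    cudaStreamWaitEvent(worker, before, 0); // worker starts only after that point
    enqueueWork(worker);                    // enqueue kernels/copies on the worker stream
    cudaEventRecord(after, worker);         // mark the worker's completion
    cudaStreamWaitEvent(0, after, 0);       // default stream resumes only afterwards
}

All four calls are asynchronous; the ordering is enforced on the device, so the host never blocks.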
@@ -290,7 +332,12 @@ GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in
             m_gpuMatrixReordered->N(),
             moveThreadBlockSize);
     }
 }
+
+template <class M, class X, class Y, int l>
+void
+GpuDILU<M, X, Y, l>::computeDiagonal(int factorizationBlockSize)
+{
     int levelStartIdx = 0;
     for (int level = 0; level < m_levelSets.size(); ++level) {
         const int numOfRowsInLevel = m_levelSets[level].size();
@@ -24,8 +24,10 @@
 #include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
 #include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
 #include <opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp>
+#include <opm/simulators/linalg/gpuistl/gpu_resources.hpp>
 #include <vector>

+#include <map>
 #include <utility>


 namespace Opm::gpuistl
@@ -82,8 +84,11 @@ public:
     //! \brief Updates the matrix data.
     void update() final;

+    //! \brief perform matrix splitting and reordering
+    void reorderAndSplitMatrix(int moveThreadBlockSize);
+
     //! \brief Compute the diagonal of the DILU, and update the data of the reordered matrix
-    void computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationThreadBlockSize);
+    void computeDiagonal(int factorizationThreadBlockSize);

     //! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
     void tuneThreadBlockSizes();
@@ -153,6 +158,16 @@ private:
     int m_lowerSolveThreadBlockSize = -1;
     int m_moveThreadBlockSize = -1;
     int m_DILUFactorizationThreadBlockSize = -1;
+
+    // Graphs for Apply
+    std::map<std::pair<field_type*, const field_type*>, GPUGraph> m_apply_graphs;
+    std::map<std::pair<field_type*, const field_type*>, GPUGraphExec> m_executableGraphs;
+
+    // Stream for the DILU operations on the GPU
+    GPUStream m_stream{};
+    // Events for synchronization with main stream
+    GPUEvent m_before{};
+    GPUEvent m_after{};
 };
 } // end namespace Opm::gpuistl
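Aside: the graph cache is keyed on raw device pointers, so a previously captured graph is replayed only when apply() sees the same input and output buffers again. A stripped-down sketch of the cache shape (illustrative types, not PR code):

#include <map>
#include <utility>

// One captured graph per (output, input) pointer pair; `int` stands in
// for the RAII graph handle used by the real class.
using GraphKey = std::pair<double*, const double*>;
std::map<GraphKey, int> applyGraphs;

bool needsCapture(double* v, const double* d)
{
    // std::pair compares lexicographically, here by raw pointer value
    return applyGraphs.find(std::make_pair(v, d)) == applyGraphs.end();
}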
@@ -266,6 +266,19 @@ GpuVector<T>::copyFromHost(const T* dataPointer, size_t numberOfElements)
     OPM_GPU_SAFE_CALL(cudaMemcpy(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice));
 }

+template <class T>
+void
+GpuVector<T>::copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream)
+{
+    if (numberOfElements > dim()) {
+        OPM_THROW(std::runtime_error,
+                  fmt::format("Requesting to copy too many elements. Vector has {} elements, while {} was requested.",
+                              dim(),
+                              numberOfElements));
+    }
+    OPM_GPU_SAFE_CALL(cudaMemcpyAsync(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice, stream));
+}
+
 template <class T>
 void
 GpuVector<T>::copyToHost(T* dataPointer, size_t numberOfElements) const
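Aside: the new overload swaps cudaMemcpy for cudaMemcpyAsync on a caller-supplied stream. The copy only truly overlaps with other work when the host buffer is pinned, and the buffer must stay alive until the stream has drained — a sketch under those assumptions, not PR code:

#include <cstddef>
#include <cuda_runtime.h>

void asyncUploadExample(float* devPtr, std::size_t n, cudaStream_t stream)
{
    float* hostPinned = nullptr;
    // Page-locked host memory is what makes the copy genuinely asynchronous.
    cudaMallocHost(&hostPinned, n * sizeof(float));
    for (std::size_t i = 0; i < n; ++i) {
        hostPinned[i] = static_cast<float>(i);
    }
    cudaMemcpyAsync(devPtr, hostPinned, n * sizeof(float), cudaMemcpyHostToDevice, stream);
    // The host buffer may not be modified or freed before the copy completes.
    cudaStreamSynchronize(stream);
    cudaFreeHost(hostPinned);
}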
@@ -203,6 +203,7 @@ public:
      * @note assumes that this vector has numberOfElements elements
      */
     void copyFromHost(const T* dataPointer, size_t numberOfElements);
+    void copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream);

     /**
      * @brief copyFromHost copies numberOfElements to the CPU memory dataPointer
@@ -37,6 +37,7 @@
 #include <string>
 #include <tuple>
 #include <utility>

 namespace Opm::gpuistl
 {
@@ -58,7 +59,6 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
     , m_tuneThreadBlockSizes(tuneKernels)
     , m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme))
 {
-
     // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode?
     // Some sanity check
     OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
@@ -98,7 +98,8 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
         }
     }

-    LUFactorizeAndMoveData(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
+    reorderAndSplitMatrix(m_moveThreadBlockSize);
+    LUFactorizeMatrix(m_ILU0FactorizationThreadBlockSize);

     if (m_tuneThreadBlockSizes) {
         tuneThreadBlockSizes();
@@ -117,7 +118,29 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d)
 {
     OPM_TIMEBLOCK(prec_apply);
     {
-        apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+        // ensure that this stream only starts doing work when main stream is completed up to this point
+        OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0));
+        OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0));
+
+        const auto ptrs = std::make_pair(v.data(), d.data());
+
+        auto it = m_apply_graphs.find(ptrs);
+
+        if (it == m_apply_graphs.end()) {
+            OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream.get(), cudaStreamCaptureModeGlobal));
+
+            // The apply functions contains lots of small function calls which call a kernel each
+            apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
+
+            OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream.get(), &m_apply_graphs[ptrs].get()));
+            OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs].get(), m_apply_graphs[ptrs].get(), nullptr, nullptr, 0));
+        }
+        OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs].get(), 0));
+
+        // ensure that main stream only continues after this stream is completed
+        OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get()));
+        OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0));
     }
 }

@@ -142,7 +165,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                 numOfRowsInLevel,
                 d.data(),
                 v.data(),
-                lowerSolveThreadBlockSize);
+                lowerSolveThreadBlockSize,
+                m_stream.get());
         }
         else{
             detail::ILU0::solveLowerLevelSetSplit<blocksize_, field_type, field_type>(
@@ -154,7 +178,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                 numOfRowsInLevel,
                 d.data(),
                 v.data(),
-                lowerSolveThreadBlockSize);
+                lowerSolveThreadBlockSize,
+                m_stream.get());
         }
     } else {
         detail::ILU0::solveLowerLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
@@ -165,7 +190,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                 numOfRowsInLevel,
                 d.data(),
                 v.data(),
-                lowerSolveThreadBlockSize);
+                lowerSolveThreadBlockSize,
+                m_stream.get());
     }
     levelStartIdx += numOfRowsInLevel;
 }
@@ -185,7 +211,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                 numOfRowsInLevel,
                 m_gpuMatrixReorderedDiagFloat.value().data(),
                 v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream.get());
         }
         else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
             detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, float, field_type>(
@@ -197,7 +224,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                 numOfRowsInLevel,
                 m_gpuMatrixReorderedDiag.value().data(),
                 v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream.get());
         }
         else{
             detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, field_type, field_type>(
@@ -209,7 +237,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                 numOfRowsInLevel,
                 m_gpuMatrixReorderedDiag.value().data(),
                 v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream.get());
         }
     } else {
         detail::ILU0::solveUpperLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
@@ -219,7 +248,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
             levelStartIdx,
             numOfRowsInLevel,
             v.data(),
-            upperSolveThreadBlockSize);
+            upperSolveThreadBlockSize,
+            m_stream.get());
     }
 }
@@ -241,10 +271,9 @@ template <class M, class X, class Y, int l>
 void
 OpmGpuILU0<M, X, Y, l>::update()
 {
     OPM_TIMEBLOCK(prec_update);
-    {
-        update(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
-    }
+    m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
+    reorderAndSplitMatrix(m_moveThreadBlockSize);
+    LUFactorizeMatrix(m_ILU0FactorizationThreadBlockSize);
 }

 template <class M, class X, class Y, int l>
@@ -253,13 +282,23 @@ OpmGpuILU0<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationThreadB
 {
     OPM_TIMEBLOCK(prec_update);
     {
+        // ensure that this stream only starts doing work when main stream is completed up to this point
+        OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0));
+        OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0));
+
         m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
-        LUFactorizeAndMoveData(moveThreadBlockSize, factorizationThreadBlockSize);
+        reorderAndSplitMatrix(moveThreadBlockSize);
+        LUFactorizeMatrix(factorizationThreadBlockSize);
+
+        // ensure that main stream only continues after this stream is completed
+        OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get()));
+        OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0));
     }
 }

 template <class M, class X, class Y, int l>
 void
-OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize)
+OpmGpuILU0<M, X, Y, l>::reorderAndSplitMatrix(int moveThreadBlockSize)
 {
     if (m_splitMatrix) {
         detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
@@ -283,6 +322,12 @@ OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
             m_gpuReorderedLU->N(),
             moveThreadBlockSize);
     }
 }
+
+template <class M, class X, class Y, int l>
+void
+OpmGpuILU0<M, X, Y, l>::LUFactorizeMatrix(int factorizationThreadBlockSize)
+{
     int levelStartIdx = 0;
     for (int level = 0; level < m_levelSets.size(); ++level) {
         const int numOfRowsInLevel = m_levelSets[level].size();
@@ -24,6 +24,7 @@
 #include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
 #include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
 #include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
+#include <opm/simulators/linalg/gpuistl/gpu_resources.hpp>
 #include <opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp>
 #include <optional>
 #include <type_traits>
@@ -84,8 +85,11 @@ public:
     //! \brief Updates the matrix data.
     void update() final;

+    //! \brief perform matrix splitting and reordering
+    void reorderAndSplitMatrix(int moveThreadBlockSize);
+
     //! \brief Compute LU factorization, and update the data of the reordered matrix
-    void LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize);
+    void LUFactorizeMatrix(int factorizationThreadBlockSize);

     //! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
     void tuneThreadBlockSizes();
@@ -152,6 +156,16 @@ private:
     int m_lowerSolveThreadBlockSize = -1;
     int m_moveThreadBlockSize = -1;
     int m_ILU0FactorizationThreadBlockSize = -1;
+
+    // Graphs for Apply
+    std::map<std::pair<field_type*, const field_type*>, GPUGraph> m_apply_graphs;
+    std::map<std::pair<field_type*, const field_type*>, GPUGraphExec> m_executableGraphs;
+
+    // Stream for the DILU operations on the GPU
+    GPUStream m_stream{};
+    // Events for synchronization with main stream
+    GPUEvent m_before{};
+    GPUEvent m_after{};
 };
 } // end namespace Opm::gpuistl
@@ -85,10 +85,12 @@ namespace
             // TODO: removce the first condition in the for loop
             for (int block = nnzIdx; block < nnzIdxLim; ++block) {
                 const int col = colIndices[block];
-                mmvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
+                mmvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
+                    &mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
             }

-            mvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
+            mvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
+                &dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
         }
     }

@@ -137,10 +139,12 @@ namespace
             LinearSolverScalar rhs[blocksize] = {0};
             for (int block = nnzIdx; block < nnzIdxLim; ++block) {
                 const int col = colIndices[block];
-                umvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
+                umvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
+                    &mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
             }

-            mmvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
+            mmvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
+                &dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
         }
     }

@@ -211,8 +215,8 @@ namespace
         }
     }

-    // TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate results
-    // TOOD: The important part is to only cast after that is fully computed
+    // TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate
+    // results TOOD: The important part is to only cast after that is fully computed
     template <int blocksize, class InputScalar, class OutputScalar, MatrixStorageMPScheme mixedPrecisionScheme>
     __global__ void cuComputeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
                                                int* lowerRowIndices,
@@ -239,7 +243,8 @@ namespace
             InputScalar dInvTmp[blocksize * blocksize];
             for (int i = 0; i < blocksize; ++i) {
                 for (int j = 0; j < blocksize; ++j) {
-                    dInvTmp[i * blocksize + j] = srcDiagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
+                    dInvTmp[i * blocksize + j]
+                        = srcDiagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
                 }
             }

@@ -257,21 +262,26 @@ namespace

                 if constexpr (detail::storeOffDiagonalAsFloat(mixedPrecisionScheme)) {
                     // TODO: think long and hard about whether this performs only the wanted memory transfers
-                    moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedLowerMat[block * blocksize * blocksize], &dstLowerMat[block * blocksize * blocksize]);
-                    moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize], &dstUpperMat[symOppositeBlock * blocksize * blocksize]);
+                    moveBlock<blocksize, InputScalar, OutputScalar>(
+                        &srcReorderedLowerMat[block * blocksize * blocksize],
+                        &dstLowerMat[block * blocksize * blocksize]);
+                    moveBlock<blocksize, InputScalar, OutputScalar>(
+                        &srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
+                        &dstUpperMat[symOppositeBlock * blocksize * blocksize]);
                 }

                 mmx2Subtraction<InputScalar, blocksize>(&srcReorderedLowerMat[block * blocksize * blocksize],
                                                         &dInv[col * blocksize * blocksize],
                                                         &srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
                                                         dInvTmp);
             }

             invBlockInPlace<InputScalar, blocksize>(dInvTmp);
             moveBlock<blocksize, InputScalar, InputScalar>(dInvTmp, &dInv[reorderedRowIdx * blocksize * blocksize]);

             if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)) {
-                moveBlock<blocksize, InputScalar, OutputScalar>(dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important!
+                moveBlock<blocksize, InputScalar, OutputScalar>(
+                    dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important!
             }
         }
     }
@@ -289,12 +299,13 @@ solveLowerLevelSet(T* reorderedMat,
                    const T* dInv,
                    const T* d,
                    T* v,
-                   int thrBlockSize)
+                   int thrBlockSize,
+                   cudaStream_t stream)
 {
     int threadBlockSize
         = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet<T, blocksize>, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
         reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
 }

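Aside: every wrapper in this file changes the same way — the stream is threaded through to the fourth launch-configuration parameter. The mechanism in isolation (illustrative kernel, not PR code):

__global__ void axpyKernel(int n, float a, const float* x, float* y)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        y[i] += a * x[i];
    }
}

void launchAxpy(int n, float a, const float* x, float* y, int threadBlockSize, cudaStream_t stream)
{
    const int nBlocks = (n + threadBlockSize - 1) / threadBlockSize;
    // <<<grid, block, dynamic shared memory, stream>>>: passing the stream here is
    // what allows the caller to capture the launch into a CUDA graph or overlap it
    // with work on other streams.
    axpyKernel<<<nBlocks, threadBlockSize, 0, stream>>>(n, a, x, y);
}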
@@ -310,13 +321,15 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
                         const DiagonalScalar* dInv,
                         const LinearSolverScalar* d,
                         LinearSolverScalar* v,
-                        int thrBlockSize)
+                        int thrBlockSize,
+                        cudaStream_t stream)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
         cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
-        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
+    cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>
+        <<<nThreadBlocks, threadBlockSize, 0, stream>>>(
+            reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
 }
 // perform the upper solve for all rows in the same level set
 template <class T, int blocksize>
@@ -329,12 +342,13 @@ solveUpperLevelSet(T* reorderedMat,
                    int rowsInLevelSet,
                    const T* dInv,
                    T* v,
-                   int thrBlockSize)
+                   int thrBlockSize,
+                   cudaStream_t stream)
 {
     int threadBlockSize
         = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet<T, blocksize>, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
         reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }

@@ -348,13 +362,15 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                         int rowsInLevelSet,
                         const DiagonalScalar* dInv,
                         LinearSolverScalar* v,
-                        int thrBlockSize)
+                        int thrBlockSize,
+                        cudaStream_t stream)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
         cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
-        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
+    cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>
+        <<<nThreadBlocks, threadBlockSize, 0, stream>>>(
+            reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }

 template <class T, int blocksize>
@@ -409,43 +425,134 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
         cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, scheme>, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, scheme><<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
-                                                                                                                 lowerRowIndices,
-                                                                                                                 lowerColIndices,
-                                                                                                                 srcReorderedUpperMat,
-                                                                                                                 upperRowIndices,
-                                                                                                                 upperColIndices,
-                                                                                                                 srcDiagonal,
-                                                                                                                 reorderedToNatural,
-                                                                                                                 naturalToReordered,
-                                                                                                                 startIdx,
-                                                                                                                 rowsInLevelSet,
-                                                                                                                 dInv,
-                                                                                                                 dstDiag,
-                                                                                                                 dstLowerMat,
-                                                                                                                 dstUpperMat);
+    cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, scheme>
+        <<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
+                                             lowerRowIndices,
+                                             lowerColIndices,
+                                             srcReorderedUpperMat,
+                                             upperRowIndices,
+                                             upperColIndices,
+                                             srcDiagonal,
+                                             reorderedToNatural,
+                                             naturalToReordered,
+                                             startIdx,
+                                             rowsInLevelSet,
+                                             dInv,
+                                             dstDiag,
+                                             dstLowerMat,
+                                             dstUpperMat);
     } else {
         OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
     }
 }

 // TODO: format
 #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
     template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*, int); \
     template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
-        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const int, \
+        int, \
+        T*, \
+        double*, \
+        double*, \
+        double*, \
+        int); \
     template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
-        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const int, \
+        int, \
+        T*, \
+        float*, \
+        float*, \
+        float*, \
+        int); \
     template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
-        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const int, \
+        int, \
+        T*, \
+        float*, \
+        float*, \
+        float*, \
+        int); \
     template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
-        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const int, \
+        int, \
+        T*, \
+        double*, \
+        double*, \
+        double*, \
+        int); \
     template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
-        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const int, \
+        int, \
+        T*, \
+        float*, \
+        float*, \
+        float*, \
+        int); \
     template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
-        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const T*, \
+        int*, \
+        int*, \
+        const int, \
+        int, \
+        T*, \
+        double*, \
+        double*, \
+        double*, \
+        int); \
-    template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
-    template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);
+    template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \
+    template void solveLowerLevelSet<T, blocksize>( \
+        T*, int*, int*, int*, int, int, const T*, const T*, T*, int, cudaStream_t);

 INSTANTIATE_KERNEL_WRAPPERS(float, 1);
 INSTANTIATE_KERNEL_WRAPPERS(float, 2);
@@ -460,19 +567,29 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 4);
 INSTANTIATE_KERNEL_WRAPPERS(double, 5);
 INSTANTIATE_KERNEL_WRAPPERS(double, 6);

 #define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar) \
     template void solveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>( \
-        MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int); \
+        MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int, cudaStream_t); \
     template void solveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>( \
-        MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, const LinearSolverScalar*, LinearSolverScalar*, int);
+        MatrixScalar*, \
+        int*, \
+        int*, \
+        int*, \
+        int, \
+        int, \
+        const DiagonalScalar*, \
+        const LinearSolverScalar*, \
+        LinearSolverScalar*, \
+        int, \
+        cudaStream_t);

 // TODO: be smarter about this... Surely this instantiates many more combinations that are actually needed
 #define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \
     INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, float); \
     INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, float); \
     INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, float); \
     INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, double); \
     INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, double); \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, double);

 INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(1);
@@ -54,7 +54,8 @@ void solveLowerLevelSet(T* reorderedMat,
                         const T* dInv,
                         const T* d,
                         T* v,
-                        int threadBlockSize);
+                        int threadBlockSize,
+                        cudaStream_t stream);

 /**
  * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
@@ -82,7 +83,8 @@ void solveLowerLevelSetSplit(MatrixScalar* reorderedUpperMat,
                              const DiagonalScalar* dInv,
                              const LinearSolverScalar* d,
                              LinearSolverScalar* v,
-                             int threadBlockSize);
+                             int threadBlockSize,
+                             cudaStream_t stream);

 /**
  * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
@@ -108,7 +110,8 @@ void solveUpperLevelSet(T* reorderedMat,
                         int rowsInLevelSet,
                         const T* dInv,
                         T* v,
-                        int threadBlockSize);
+                        int threadBlockSize,
+                        cudaStream_t stream);

 /**
  * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
@@ -134,7 +137,8 @@ void solveUpperLevelSetSplit(MatrixScalar* reorderedUpperMat,
                              int rowsInLevelSet,
                              const DiagonalScalar* dInv,
                              LinearSolverScalar* v,
-                             int threadBlockSize);
+                             int threadBlockSize,
+                             cudaStream_t stream);

 /**
  * @brief Computes the ILU0 of the diagonal elements of the reordered matrix and stores it in a reordered vector
@@ -215,9 +215,9 @@ namespace
             // as of now, if we are using mixed precision, then we are always storing the off-diagonals as floats,
             // and sometimes also the diagonal.
             if constexpr (detail::usingMixedPrecision(mixedPrecisionScheme)) {
-                // if we are want to store the entire matrix as a float then we must also move the diagonal block from double to float
-                // if not then we just use the double diagonal that is already now stored in srcDiagonal
-                if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)){
+                // if we are want to store the entire matrix as a float then we must also move the diagonal block from
+                // double to float if not then we just use the double diagonal that is already now stored in srcDiagonal
+                if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)) {
                     moveBlock<blocksize, InputScalar, OutputScalar>(&srcDiagonal[reorderedIdx * scalarsInBlock],
                                                                     &dstDiagonal[reorderedIdx * scalarsInBlock]);
                 }
@@ -362,12 +362,13 @@ solveLowerLevelSet(T* reorderedMat,
                    int rowsInLevelSet,
                    const T* d,
                    T* v,
-                   int thrBlockSize)
+                   int thrBlockSize,
+                   cudaStream_t stream)
 {
     int threadBlockSize
         = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet<T, blocksize>, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
         reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
 }
 // perform the upper solve for all rows in the same level set
@@ -380,12 +381,13 @@ solveUpperLevelSet(T* reorderedMat,
                    int startIdx,
                    int rowsInLevelSet,
                    T* v,
-                   int thrBlockSize)
+                   int thrBlockSize,
+                   cudaStream_t stream)
 {
     int threadBlockSize
         = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet<T, blocksize>, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
         reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, v);
 }

@@ -399,13 +401,15 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
                         int rowsInLevelSet,
                         const LinearSolverScalar* d,
                         LinearSolverScalar* v,
-                        int thrBlockSize)
+                        int thrBlockSize,
+                        cudaStream_t stream)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
         cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar><<<nThreadBlocks, threadBlockSize>>>(
-        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
+    cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>
+        <<<nThreadBlocks, threadBlockSize, 0, stream>>>(
+            reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
 }
 // perform the upper solve for all rows in the same level set
 template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
@@ -418,13 +422,15 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                         int rowsInLevelSet,
                         const DiagonalScalar* dInv,
                         LinearSolverScalar* v,
-                        int thrBlockSize)
+                        int thrBlockSize,
+                        cudaStream_t stream)
 {
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
         cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
-        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
+    cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>
+        <<<nThreadBlocks, threadBlockSize, 0, stream>>>(
+            reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }

 template <class T, int blocksize>
@@ -484,28 +490,28 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
 }

 #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
-    template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, T*, int); \
-    template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
+    template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, T*, int, cudaStream_t); \
+    template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \
     template void LUFactorization<T, blocksize>(T*, int*, int*, int*, int*, size_t, int, int); \
     template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
         T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
     template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
         T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
     template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
         T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
     template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
         T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
     template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
         T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
     template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
         T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);

 #define INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(T) \
     INSTANTIATE_KERNEL_WRAPPERS(T, 1); \
     INSTANTIATE_KERNEL_WRAPPERS(T, 2); \
     INSTANTIATE_KERNEL_WRAPPERS(T, 3); \
     INSTANTIATE_KERNEL_WRAPPERS(T, 4); \
     INSTANTIATE_KERNEL_WRAPPERS(T, 5); \
     INSTANTIATE_KERNEL_WRAPPERS(T, 6);

 INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(float)
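Aside: these macros are ordinary explicit template instantiations, so every signature — including the new trailing cudaStream_t — must match the declarations in the header exactly, or the symbols will be missing at link time. The mechanism in miniature (illustrative, not PR code):

// definition lives in one translation unit (e.g. a .cu file)
template <class T>
void scaleOnHost(T* data, int n, T factor)
{
    for (int i = 0; i < n; ++i) {
        data[i] *= factor;
    }
}

// explicit instantiations emit linkable symbols for the types actually used,
// letting other translation units include only a declaration
template void scaleOnHost<float>(float*, int, float);
template void scaleOnHost<double>(double*, int, double);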
@@ -514,32 +520,32 @@ INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(double)
 #define INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(blocksize) \
     /* double preconditioner */ \
     template void solveLowerLevelSetSplit<blocksize, double, double>( \
-        double*, int*, int*, int*, int, int, const double*, double*, int); \
+        double*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
     /* float matrix, double compute preconditioner */ \
     template void solveLowerLevelSetSplit<blocksize, double, float>( \
-        float*, int*, int*, int*, int, int, const double*, double*, int); \
+        float*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
     /* float preconditioner */ \
     template void solveLowerLevelSetSplit<blocksize, float, float>( \
-        float*, int*, int*, int*, int, int, const float*, float*, int); \
+        float*, int*, int*, int*, int, int, const float*, float*, int, cudaStream_t); \
     \
     /* double preconditioner */ \
     template void solveUpperLevelSetSplit<blocksize, double, double, double>( \
-        double*, int*, int*, int*, int, int, const double*, double*, int); \
+        double*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
     /* float matrix, double compute preconditioner */ \
     template void solveUpperLevelSetSplit<blocksize, double, float, double>( \
-        float*, int*, int*, int*, int, int, const double*, double*, int); \
+        float*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
     /* float preconditioner */ \
     template void solveUpperLevelSetSplit<blocksize, float, float, double>( \
-        float*, int*, int*, int*, int, int, const double*, float*, int); \
+        float*, int*, int*, int*, int, int, const double*, float*, int, cudaStream_t); \
     /* double preconditioner */ \
     template void solveUpperLevelSetSplit<blocksize, double, double, float>( \
-        double*, int*, int*, int*, int, int, const float*, double*, int); \
+        double*, int*, int*, int*, int, int, const float*, double*, int, cudaStream_t); \
     /* float matrix, double compute preconditioner */ \
     template void solveUpperLevelSetSplit<blocksize, double, float, float>( \
-        float*, int*, int*, int*, int, int, const float*, double*, int); \
+        float*, int*, int*, int*, int, int, const float*, double*, int, cudaStream_t); \
     /* float preconditioner */ \
     template void solveUpperLevelSetSplit<blocksize, float, float, float>( \
-        float*, int*, int*, int*, int, int, const float*, float*, int);
+        float*, int*, int*, int*, int, int, const float*, float*, int, cudaStream_t);


 INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(1);
@@ -46,7 +46,8 @@ void solveUpperLevelSet(T* reorderedMat,
                         int startIdx,
                         int rowsInLevelSet,
                         T* v,
-                        int threadBlockSize);
+                        int threadBlockSize,
+                        cudaStream_t stream);

 /**
  * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
@@ -72,7+73,8 @@ void solveLowerLevelSet(T* reorderedMat,
                         int rowsInLevelSet,
                         const T* d,
                         T* v,
-                        int threadBlockSize);
+                        int threadBlockSize,
+                        cudaStream_t stream);

 /**
  * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
@@ -99,7 +101,8 @@ void solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                              int rowsInLevelSet,
                              const DiagonalScalar* dInv,
                              LinearSolverScalar* v,
-                             int threadBlockSize);
+                             int threadBlockSize,
+                             cudaStream_t stream);

 /**
  * @brief Perform an lower solve on certain rows in a matrix that can safely be computed in parallel
@@ -127,7 +130,8 @@ void solveLowerLevelSetSplit(MatrixScalar* reorderedLowerMat,
                              int rowsInLevelSet,
                              const LinearSolverScalar* d,
                              LinearSolverScalar* v,
-                             int threadBlockSize);
+                             int threadBlockSize,
+                             cudaStream_t stream);

 /**
  * @brief Computes the ILU Factorization of the input bcsr matrix, which is stored in a reordered way. The diagonal