Merge pull request #5852 from multitalentloes/cudagraph_experiment

Improve gpuistl using cudaGraphs
Kjetil Olsen Lye 2025-02-12 15:01:20 +01:00 committed by GitHub
commit b99b606575
12 changed files with 417 additions and 149 deletions

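The core of this change: the first time a preconditioner's apply() is called for a given pair of output/input device pointers, the whole sequence of small kernel launches is recorded into a CUDA graph on a dedicated stream, instantiated, and cached; later calls replay the cached executable graph with a single launch. A minimal, self-contained sketch of that pattern follows (smallKernel, CHECK and applyWithGraph are illustrative placeholders, not OPM code; the cudaGraphInstantiate call mirrors the five-argument form used in the diff below):

#include <cuda_runtime.h>
#include <cstdio>
#include <map>
#include <utility>

#define CHECK(call)                                                              \
    do {                                                                         \
        const cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                               \
            std::printf("CUDA error: %s\n", cudaGetErrorString(err_));           \
        }                                                                        \
    } while (false)

__global__ void smallKernel(double* v, const double* d, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        v[i] += d[i]; // stand-in for one level-set solve step
    }
}

using GraphKey = std::pair<double*, const double*>;

// Capture the many small launches once per (v, d) pointer pair, then replay them
// as a single graph launch on every later call.
void applyWithGraph(double* v, const double* d, int n, cudaStream_t stream,
                    std::map<GraphKey, cudaGraphExec_t>& cache)
{
    const GraphKey key{v, d};
    auto it = cache.find(key);
    if (it == cache.end()) {
        cudaGraph_t graph;
        CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
        for (int level = 0; level < 8; ++level) { // stands in for the per-level-set launches
            smallKernel<<<(n + 255) / 256, 256, 0, stream>>>(v, d, n);
        }
        CHECK(cudaStreamEndCapture(stream, &graph));
        cudaGraphExec_t exec;
        CHECK(cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0));
        CHECK(cudaGraphDestroy(graph)); // the executable graph keeps what it needs
        it = cache.emplace(key, exec).first;
    }
    CHECK(cudaGraphLaunch(it->second, stream));
}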
View File

@ -188,6 +188,7 @@ GpuBuffer<T>::copyFromHost(const std::vector<T>& data)
{
copyFromHost(data.data(), data.size());
}
template <class T>
void
GpuBuffer<T>::copyToHost(std::vector<T>& data) const

View File

@ -27,6 +27,7 @@
#include <opm/simulators/linalg/gpuistl/GpuView.hpp>
#include <vector>
#include <string>
#include <cuda_runtime.h>
namespace Opm::gpuistl

View File

@ -37,6 +37,7 @@
#include <functional>
#include <utility>
#include <string>
namespace Opm::gpuistl
{
@ -93,7 +94,8 @@ GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int
}
}
computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
reorderAndSplitMatrix(m_moveThreadBlockSize);
computeDiagonal(m_DILUFactorizationThreadBlockSize);
if (m_tuneThreadBlockSizes) {
tuneThreadBlockSizes();
@ -110,10 +112,31 @@ template <class M, class X, class Y, int l>
void
GpuDILU<M, X, Y, l>::apply(X& v, const Y& d)
{
// ensure that this stream only starts doing work when main stream is completed up to this point
OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0));
OPM_TIMEBLOCK(prec_apply);
{
apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
const auto ptrs = std::make_pair(v.data(), d.data());
auto it = m_apply_graphs.find(ptrs);
if (it == m_apply_graphs.end()) {
OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream.get(), cudaStreamCaptureModeGlobal));
// The apply function consists of many small function calls, each of which launches a kernel
apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream.get(), &m_apply_graphs[ptrs].get()));
OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs].get(), m_apply_graphs[ptrs].get(), nullptr, nullptr, 0));
}
OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs].get(), 0));
}
// ensure that main stream only continues after this stream is completed
OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get()));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0));
}
template <class M, class X, class Y, int l>
@ -135,7 +158,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
m_gpuDInvFloat->data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
} else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float, field_type>(
m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
@ -147,7 +171,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
m_gpuDInv.data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
} else {
detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, field_type, field_type>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
@ -159,7 +184,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
m_gpuDInv.data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
}
} else {
detail::DILU::solveLowerLevelSet<field_type, blocksize_>(
@ -172,7 +198,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
m_gpuDInv.data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
}
levelStartIdx += numOfRowsInLevel;
}
@ -193,7 +220,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
numOfRowsInLevel,
m_gpuDInvFloat->data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
} else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG){
detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, float>(
m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
@ -204,7 +232,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
numOfRowsInLevel,
m_gpuDInv.data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
} else {
detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, field_type>(
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
@ -215,7 +244,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
numOfRowsInLevel,
m_gpuDInv.data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
}
} else {
detail::DILU::solveUpperLevelSet<field_type, blocksize_>(
@ -227,7 +257,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
numOfRowsInLevel,
m_gpuDInv.data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
}
}
}
@ -252,7 +283,9 @@ GpuDILU<M, X, Y, l>::update()
{
OPM_TIMEBLOCK(prec_update);
{
update(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
reorderAndSplitMatrix(m_moveThreadBlockSize);
computeDiagonal(m_DILUFactorizationThreadBlockSize);
}
}
@ -260,13 +293,22 @@ template <class M, class X, class Y, int l>
void
GpuDILU<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationBlockSize)
{
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
computeDiagAndMoveReorderedData(moveThreadBlockSize, factorizationBlockSize);
// ensure that this stream only starts doing work when main stream is completed up to this point
OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0));
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
reorderAndSplitMatrix(moveThreadBlockSize);
computeDiagonal(factorizationBlockSize);
// ensure that main stream only continues after this stream is completed
OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get()));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0));
}
template <class M, class X, class Y, int l>
void
GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationBlockSize)
GpuDILU<M, X, Y, l>::reorderAndSplitMatrix(int moveThreadBlockSize)
{
if (m_splitMatrix) {
detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
@ -290,7 +332,12 @@ GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in
m_gpuMatrixReordered->N(),
moveThreadBlockSize);
}
}
template <class M, class X, class Y, int l>
void
GpuDILU<M, X, Y, l>::computeDiagonal(int factorizationBlockSize)
{
int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();

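The apply() and update() hunks above fence the preconditioner's private stream against the legacy default stream with a pair of events: record m_before on stream 0, make m_stream wait on it, do the work, then record m_after on m_stream and make stream 0 wait on that. A standalone sketch of the same fencing (doWork and fencedLaunch are placeholders; the PR keeps the events as long-lived members rather than creating them per call):

#include <cuda_runtime.h>

__global__ void doWork(float* x, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] *= 2.0f;
    }
}

// Run doWork on a private stream while keeping it ordered with respect to
// everything already queued on the legacy default stream (stream 0).
void fencedLaunch(float* x, int n, cudaStream_t worker)
{
    cudaEvent_t before;
    cudaEvent_t after;
    cudaEventCreate(&before);
    cudaEventCreate(&after);

    // the worker stream must not start before stream 0 has reached this point
    cudaEventRecord(before, 0);
    cudaStreamWaitEvent(worker, before, 0);

    doWork<<<(n + 127) / 128, 128, 0, worker>>>(x, n);

    // stream 0 must not continue until the worker stream has finished this work
    cudaEventRecord(after, worker);
    cudaStreamWaitEvent(0, after, 0);

    cudaEventDestroy(before);
    cudaEventDestroy(after);
}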
View File

@ -24,8 +24,10 @@
#include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
#include <opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp>
#include <opm/simulators/linalg/gpuistl/gpu_resources.hpp>
#include <vector>
#include <map>
#include <utility>
namespace Opm::gpuistl
@ -82,8 +84,11 @@ public:
//! \brief Updates the matrix data.
void update() final;
//! \brief perform matrix splitting and reordering
void reorderAndSplitMatrix(int moveThreadBlockSize);
//! \brief Compute the diagonal of the DILU, and update the data of the reordered matrix
void computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationThreadBlockSize);
void computeDiagonal(int factorizationThreadBlockSize);
//! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
void tuneThreadBlockSizes();
@ -153,6 +158,16 @@ private:
int m_lowerSolveThreadBlockSize = -1;
int m_moveThreadBlockSize = -1;
int m_DILUFactorizationThreadBlockSize = -1;
// Graphs for Apply
std::map<std::pair<field_type*, const field_type*>, GPUGraph> m_apply_graphs;
std::map<std::pair<field_type*, const field_type*>, GPUGraphExec> m_executableGraphs;
// Stream for the DILU operations on the GPU
GPUStream m_stream{};
// Events for synchronization with main stream
GPUEvent m_before{};
GPUEvent m_after{};
};
} // end namespace Opm::gpuistl

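GPUStream, GPUEvent, GPUGraph and GPUGraphExec come from the newly included gpu_resources.hpp. Judging only from the .get() calls in the hunks above, they act as RAII owners of the raw CUDA handles; the actual implementation is not shown in this diff, but an equivalent wrapper could look roughly like this (an assumption, not the OPM code):

#include <cuda_runtime.h>

// Assumed shape of an RAII stream owner: create on construction, destroy on
// destruction, hand out the raw handle (by reference) through get().
class GPUStreamSketch
{
public:
    GPUStreamSketch() { cudaStreamCreate(&m_stream); }
    ~GPUStreamSketch() { cudaStreamDestroy(m_stream); }
    GPUStreamSketch(const GPUStreamSketch&) = delete;
    GPUStreamSketch& operator=(const GPUStreamSketch&) = delete;

    cudaStream_t& get() { return m_stream; }

private:
    cudaStream_t m_stream{};
};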
View File

@ -266,6 +266,19 @@ GpuVector<T>::copyFromHost(const T* dataPointer, size_t numberOfElements)
OPM_GPU_SAFE_CALL(cudaMemcpy(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice));
}
template <class T>
void
GpuVector<T>::copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream)
{
if (numberOfElements > dim()) {
OPM_THROW(std::runtime_error,
fmt::format("Requesting to copy too many elements. Vector has {} elements, while {} was requested.",
dim(),
numberOfElements));
}
OPM_GPU_SAFE_CALL(cudaMemcpyAsync(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice, stream));
}
template <class T>
void
GpuVector<T>::copyToHost(T* dataPointer, size_t numberOfElements) const

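The new copyFromHost overload enqueues the host-to-device transfer on the caller's stream with cudaMemcpyAsync instead of the blocking cudaMemcpy, so it can be ordered (and captured) together with the kernels that consume the data. A hedged usage sketch (asyncUpload is a placeholder); note, as a general CUDA caveat rather than something stated in this diff, that the copy only overlaps with other GPU work when the host buffer is pinned via cudaMallocHost or cudaHostRegister:

#include <cuda_runtime.h>
#include <vector>

// Enqueue a host-to-device copy on 'stream'; it executes after work already
// queued on that stream and before anything enqueued on it later.
void asyncUpload(double* devicePtr, const std::vector<double>& host, cudaStream_t stream)
{
    cudaMemcpyAsync(devicePtr,
                    host.data(),
                    host.size() * sizeof(double),
                    cudaMemcpyHostToDevice,
                    stream);
    // The data is only guaranteed to be on the device once the stream has been
    // synchronized or later work on the same stream has observed it.
}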
View File

@ -203,6 +203,7 @@ public:
* @note assumes that this vector has numberOfElements elements
*/
void copyFromHost(const T* dataPointer, size_t numberOfElements);
void copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream);
/**
* @brief copyFromHost copies numberOfElements to the CPU memory dataPointer

View File

@ -37,6 +37,7 @@
#include <string>
#include <tuple>
#include <utility>
namespace Opm::gpuistl
{
@ -58,7 +59,6 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
, m_tuneThreadBlockSizes(tuneKernels)
, m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme))
{
// TODO: Should in some way verify that this matrix is symmetric, only do it in debug mode?
// Some sanity check
OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
@ -98,7 +98,8 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
}
}
LUFactorizeAndMoveData(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
reorderAndSplitMatrix(m_moveThreadBlockSize);
LUFactorizeMatrix(m_ILU0FactorizationThreadBlockSize);
if (m_tuneThreadBlockSizes) {
tuneThreadBlockSizes();
@ -117,7 +118,29 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d)
{
OPM_TIMEBLOCK(prec_apply);
{
apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
// ensure that this stream only starts doing work when main stream is completed up to this point
OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0));
const auto ptrs = std::make_pair(v.data(), d.data());
auto it = m_apply_graphs.find(ptrs);
if (it == m_apply_graphs.end()) {
OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream.get(), cudaStreamCaptureModeGlobal));
// The apply function consists of many small function calls, each of which launches a kernel
apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream.get(), &m_apply_graphs[ptrs].get()));
OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs].get(), m_apply_graphs[ptrs].get(), nullptr, nullptr, 0));
}
OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs].get(), 0));
// ensure that main stream only continues after this stream is completed
OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get()));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0));
}
}
@ -142,7 +165,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
numOfRowsInLevel,
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
}
else{
detail::ILU0::solveLowerLevelSetSplit<blocksize_, field_type, field_type>(
@ -154,7 +178,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
numOfRowsInLevel,
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
}
} else {
detail::ILU0::solveLowerLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
@ -165,7 +190,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
numOfRowsInLevel,
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
}
levelStartIdx += numOfRowsInLevel;
}
@ -185,7 +211,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
numOfRowsInLevel,
m_gpuMatrixReorderedDiagFloat.value().data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
}
else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, float, field_type>(
@ -197,7 +224,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
numOfRowsInLevel,
m_gpuMatrixReorderedDiag.value().data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
}
else{
detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, field_type, field_type>(
@ -209,7 +237,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
numOfRowsInLevel,
m_gpuMatrixReorderedDiag.value().data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
}
} else {
detail::ILU0::solveUpperLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
@ -219,7 +248,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
levelStartIdx,
numOfRowsInLevel,
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
}
}
}
@ -241,10 +271,9 @@ template <class M, class X, class Y, int l>
void
OpmGpuILU0<M, X, Y, l>::update()
{
OPM_TIMEBLOCK(prec_update);
{
update(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
}
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
reorderAndSplitMatrix(m_moveThreadBlockSize);
LUFactorizeMatrix(m_ILU0FactorizationThreadBlockSize);
}
template <class M, class X, class Y, int l>
@ -253,13 +282,23 @@ OpmGpuILU0<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationThreadB
{
OPM_TIMEBLOCK(prec_update);
{
// ensure that this stream only starts doing work when main stream is completed up to this point
OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0));
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
LUFactorizeAndMoveData(moveThreadBlockSize, factorizationThreadBlockSize);
reorderAndSplitMatrix(moveThreadBlockSize);
LUFactorizeMatrix(factorizationThreadBlockSize);
// ensure that main stream only continues after this stream is completed
OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get()));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0));
}
}
template <class M, class X, class Y, int l>
void
OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize)
OpmGpuILU0<M, X, Y, l>::reorderAndSplitMatrix(int moveThreadBlockSize)
{
if (m_splitMatrix) {
detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
@ -283,6 +322,12 @@ OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
m_gpuReorderedLU->N(),
moveThreadBlockSize);
}
}
template <class M, class X, class Y, int l>
void
OpmGpuILU0<M, X, Y, l>::LUFactorizeMatrix(int factorizationThreadBlockSize)
{
int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();

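Every CUDA runtime call in these hunks is wrapped in OPM_GPU_SAFE_CALL. Its definition is not part of this diff; the usual shape of such a macro is to check the returned cudaError_t and fail loudly, roughly as in the hypothetical stand-in below (an assumption, not the actual OPM macro):

#include <cuda_runtime.h>
#include <stdexcept>
#include <string>

// Hypothetical stand-in: evaluate a CUDA runtime call and throw on failure.
#define GPU_SAFE_CALL_SKETCH(call)                                               \
    do {                                                                          \
        const cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                                \
            throw std::runtime_error(std::string("CUDA call failed: ")            \
                                     + cudaGetErrorString(err_));                 \
        }                                                                         \
    } while (false)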
View File

@ -24,6 +24,7 @@
#include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
#include <opm/simulators/linalg/gpuistl/gpu_resources.hpp>
#include <opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp>
#include <optional>
#include <type_traits>
@ -84,8 +85,11 @@ public:
//! \brief Updates the matrix data.
void update() final;
//! \brief perform matrix splitting and reordering
void reorderAndSplitMatrix(int moveThreadBlockSize);
//! \brief Compute LU factorization, and update the data of the reordered matrix
void LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize);
void LUFactorizeMatrix(int factorizationThreadBlockSize);
//! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
void tuneThreadBlockSizes();
@ -152,6 +156,16 @@ private:
int m_lowerSolveThreadBlockSize = -1;
int m_moveThreadBlockSize = -1;
int m_ILU0FactorizationThreadBlockSize = -1;
// Graphs for Apply
std::map<std::pair<field_type*, const field_type*>, GPUGraph> m_apply_graphs;
std::map<std::pair<field_type*, const field_type*>, GPUGraphExec> m_executableGraphs;
// Stream for the ILU0 operations on the GPU
GPUStream m_stream{};
// Events for synchronization with main stream
GPUEvent m_before{};
GPUEvent m_after{};
};
} // end namespace Opm::gpuistl

View File

@ -85,10 +85,12 @@ namespace
// TODO: remove the first condition in the for loop
for (int block = nnzIdx; block < nnzIdxLim; ++block) {
const int col = colIndices[block];
mmvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
mmvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
mvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
@ -137,10 +139,12 @@ namespace
LinearSolverScalar rhs[blocksize] = {0};
for (int block = nnzIdx; block < nnzIdxLim; ++block) {
const int col = colIndices[block];
umvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
umvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mmvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
mmvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
@ -211,8 +215,8 @@ namespace
}
}
// TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate results
// TOOD: The important part is to only cast after that is fully computed
// TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate
// results. TODO: The important part is to only cast after that is fully computed
template <int blocksize, class InputScalar, class OutputScalar, MatrixStorageMPScheme mixedPrecisionScheme>
__global__ void cuComputeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
int* lowerRowIndices,
@ -239,7 +243,8 @@ namespace
InputScalar dInvTmp[blocksize * blocksize];
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
dInvTmp[i * blocksize + j] = srcDiagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
dInvTmp[i * blocksize + j]
= srcDiagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
}
}
@ -257,21 +262,26 @@ namespace
if constexpr (detail::storeOffDiagonalAsFloat(mixedPrecisionScheme)) {
// TODO: think long and hard about whether this performs only the wanted memory transfers
moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedLowerMat[block * blocksize * blocksize], &dstLowerMat[block * blocksize * blocksize]);
moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize], &dstUpperMat[symOppositeBlock * blocksize * blocksize]);
moveBlock<blocksize, InputScalar, OutputScalar>(
&srcReorderedLowerMat[block * blocksize * blocksize],
&dstLowerMat[block * blocksize * blocksize]);
moveBlock<blocksize, InputScalar, OutputScalar>(
&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
&dstUpperMat[symOppositeBlock * blocksize * blocksize]);
}
mmx2Subtraction<InputScalar, blocksize>(&srcReorderedLowerMat[block * blocksize * blocksize],
&dInv[col * blocksize * blocksize],
&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
dInvTmp);
&dInv[col * blocksize * blocksize],
&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
dInvTmp);
}
invBlockInPlace<InputScalar, blocksize>(dInvTmp);
moveBlock<blocksize, InputScalar, InputScalar>(dInvTmp, &dInv[reorderedRowIdx * blocksize * blocksize]);
if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)) {
moveBlock<blocksize, InputScalar, OutputScalar>(dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important!
moveBlock<blocksize, InputScalar, OutputScalar>(
dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important!
}
}
}
@ -289,12 +299,13 @@ solveLowerLevelSet(T* reorderedMat,
const T* dInv,
const T* d,
T* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize
= ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}
@ -310,13 +321,15 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
const DiagonalScalar* dInv,
const LinearSolverScalar* d,
LinearSolverScalar* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>
<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}
// perform the upper solve for all rows in the same level set
template <class T, int blocksize>
@ -329,12 +342,13 @@ solveUpperLevelSet(T* reorderedMat,
int rowsInLevelSet,
const T* dInv,
T* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize
= ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}
@ -348,13 +362,15 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
int rowsInLevelSet,
const DiagonalScalar* dInv,
LinearSolverScalar* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>
<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}
template <class T, int blocksize>
@ -409,43 +425,134 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, scheme>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, scheme><<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
lowerRowIndices,
lowerColIndices,
srcReorderedUpperMat,
upperRowIndices,
upperColIndices,
srcDiagonal,
reorderedToNatural,
naturalToReordered,
startIdx,
rowsInLevelSet,
dInv,
dstDiag,
dstLowerMat,
dstUpperMat);
cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, scheme>
<<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
lowerRowIndices,
lowerColIndices,
srcReorderedUpperMat,
upperRowIndices,
upperColIndices,
srcDiagonal,
reorderedToNatural,
naturalToReordered,
startIdx,
rowsInLevelSet,
dInv,
dstDiag,
dstLowerMat,
dstUpperMat);
} else {
OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
}
}
// TODO: format
#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*, int); \
template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);
template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const int, \
int, \
T*, \
double*, \
double*, \
double*, \
int); \
template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const int, \
int, \
T*, \
float*, \
float*, \
float*, \
int); \
template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const int, \
int, \
T*, \
float*, \
float*, \
float*, \
int); \
template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const int, \
int, \
T*, \
double*, \
double*, \
double*, \
int); \
template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const int, \
int, \
T*, \
float*, \
float*, \
float*, \
int); \
template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const int, \
int, \
T*, \
double*, \
double*, \
double*, \
int); \
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \
template void solveLowerLevelSet<T, blocksize>( \
T*, int*, int*, int*, int, int, const T*, const T*, T*, int, cudaStream_t);
INSTANTIATE_KERNEL_WRAPPERS(float, 1);
INSTANTIATE_KERNEL_WRAPPERS(float, 2);
@ -460,19 +567,29 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 4);
INSTANTIATE_KERNEL_WRAPPERS(double, 5);
INSTANTIATE_KERNEL_WRAPPERS(double, 6);
#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar) \
template void solveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>( \
MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int); \
template void solveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>( \
MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, const LinearSolverScalar*, LinearSolverScalar*, int);
#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar) \
template void solveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>( \
MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int, cudaStream_t); \
template void solveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>( \
MatrixScalar*, \
int*, \
int*, \
int*, \
int, \
int, \
const DiagonalScalar*, \
const LinearSolverScalar*, \
LinearSolverScalar*, \
int, \
cudaStream_t);
// TODO: be smarter about this... Surely this instantiates many more combinations than are actually needed
#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, double); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, double); \
#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, double); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, double); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(1);

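Every kernel wrapper in this file gained a trailing cudaStream_t parameter and forwards it as the fourth launch-configuration argument. That is what makes the graph capture in apply() possible: while a stream is being captured, the work has to be enqueued on that capturing stream to be recorded (and touching the legacy default stream during a global-mode capture is invalid), so the stream must be threaded down to every <<<...>>> launch. A minimal sketch of the launch form (axpy and launchAxpy are placeholders):

#include <cuda_runtime.h>

__global__ void axpy(double a, const double* x, double* y, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        y[i] += a * x[i];
    }
}

void launchAxpy(double a, const double* x, double* y, int n, cudaStream_t stream)
{
    const int threadBlockSize = 256;
    const int nThreadBlocks = (n + threadBlockSize - 1) / threadBlockSize;
    // <<<grid, block, dynamic shared memory, stream>>>: the fourth argument
    // selects the stream the kernel is enqueued on (0 bytes of shared memory here).
    axpy<<<nThreadBlocks, threadBlockSize, 0, stream>>>(a, x, y, n);
}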
View File

@ -54,7 +54,8 @@ void solveLowerLevelSet(T* reorderedMat,
const T* dInv,
const T* d,
T* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
@ -82,7 +83,8 @@ void solveLowerLevelSetSplit(MatrixScalar* reorderedUpperMat,
const DiagonalScalar* dInv,
const LinearSolverScalar* d,
LinearSolverScalar* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
@ -108,7 +110,8 @@ void solveUpperLevelSet(T* reorderedMat,
int rowsInLevelSet,
const T* dInv,
T* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
@ -134,7 +137,8 @@ void solveUpperLevelSetSplit(MatrixScalar* reorderedUpperMat,
int rowsInLevelSet,
const DiagonalScalar* dInv,
LinearSolverScalar* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Computes the ILU0 of the diagonal elements of the reordered matrix and stores it in a reordered vector

View File

@ -215,9 +215,9 @@ namespace
// as of now, if we are using mixed precision, then we are always storing the off-diagonals as floats,
// and sometimes also the diagonal.
if constexpr (detail::usingMixedPrecision(mixedPrecisionScheme)) {
// if we are want to store the entire matrix as a float then we must also move the diagonal block from double to float
// if not then we just use the double diagonal that is already now stored in srcDiagonal
if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)){
// If we want to store the entire matrix as a float then we must also move the diagonal block from
// double to float; if not, we just use the double diagonal that is already stored in srcDiagonal
if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)) {
moveBlock<blocksize, InputScalar, OutputScalar>(&srcDiagonal[reorderedIdx * scalarsInBlock],
&dstDiagonal[reorderedIdx * scalarsInBlock]);
}
@ -362,12 +362,13 @@ solveLowerLevelSet(T* reorderedMat,
int rowsInLevelSet,
const T* d,
T* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize
= ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
}
// perform the upper solve for all rows in the same level set
@ -380,12 +381,13 @@ solveUpperLevelSet(T* reorderedMat,
int startIdx,
int rowsInLevelSet,
T* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize
= ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, v);
}
@ -399,13 +401,15 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
int rowsInLevelSet,
const LinearSolverScalar* d,
LinearSolverScalar* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>
<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
}
// perform the upper solve for all rows in the same level set
template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
@ -418,13 +422,15 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
int rowsInLevelSet,
const DiagonalScalar* dInv,
LinearSolverScalar* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>
<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}
template <class T, int blocksize>
@ -484,28 +490,28 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
}
#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, T*, int); \
template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, T*, int, cudaStream_t); \
template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \
template void LUFactorization<T, blocksize>(T*, int*, int*, int*, int*, size_t, int, int); \
template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);
template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);
#define INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(T) \
INSTANTIATE_KERNEL_WRAPPERS(T, 1); \
INSTANTIATE_KERNEL_WRAPPERS(T, 2); \
INSTANTIATE_KERNEL_WRAPPERS(T, 3); \
INSTANTIATE_KERNEL_WRAPPERS(T, 4); \
INSTANTIATE_KERNEL_WRAPPERS(T, 5); \
#define INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(T) \
INSTANTIATE_KERNEL_WRAPPERS(T, 1); \
INSTANTIATE_KERNEL_WRAPPERS(T, 2); \
INSTANTIATE_KERNEL_WRAPPERS(T, 3); \
INSTANTIATE_KERNEL_WRAPPERS(T, 4); \
INSTANTIATE_KERNEL_WRAPPERS(T, 5); \
INSTANTIATE_KERNEL_WRAPPERS(T, 6);
INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(float)
@ -514,32 +520,32 @@ INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(double)
#define INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(blocksize) \
/* double preconditioner */ \
template void solveLowerLevelSetSplit<blocksize, double, double>( \
double*, int*, int*, int*, int, int, const double*, double*, int); \
double*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
/* float matrix, double compute preconditioner */ \
template void solveLowerLevelSetSplit<blocksize, double, float>( \
float*, int*, int*, int*, int, int, const double*, double*, int); \
float*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
/* float preconditioner */ \
template void solveLowerLevelSetSplit<blocksize, float, float>( \
float*, int*, int*, int*, int, int, const float*, float*, int); \
float*, int*, int*, int*, int, int, const float*, float*, int, cudaStream_t); \
\
/* double preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, double, double, double>( \
double*, int*, int*, int*, int, int, const double*, double*, int); \
template void solveUpperLevelSetSplit<blocksize, double, double, double>( \
double*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
/* float matrix, double compute preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, double, float, double>( \
float*, int*, int*, int*, int, int, const double*, double*, int); \
template void solveUpperLevelSetSplit<blocksize, double, float, double>( \
float*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
/* float preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, float, float, double>( \
float*, int*, int*, int*, int, int, const double*, float*, int); \
template void solveUpperLevelSetSplit<blocksize, float, float, double>( \
float*, int*, int*, int*, int, int, const double*, float*, int, cudaStream_t); \
/* double preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, double, double, float>( \
double*, int*, int*, int*, int, int, const float*, double*, int); \
template void solveUpperLevelSetSplit<blocksize, double, double, float>( \
double*, int*, int*, int*, int, int, const float*, double*, int, cudaStream_t); \
/* float matrix, double compute preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, double, float, float>( \
float*, int*, int*, int*, int, int, const float*, double*, int); \
template void solveUpperLevelSetSplit<blocksize, double, float, float>( \
float*, int*, int*, int*, int, int, const float*, double*, int, cudaStream_t); \
/* float preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, float, float, float>( \
float*, int*, int*, int*, int, int, const float*, float*, int);
template void solveUpperLevelSetSplit<blocksize, float, float, float>( \
float*, int*, int*, int*, int, int, const float*, float*, int, cudaStream_t);
INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(1);

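The INSTANTIATE_* macros in this file exist because the kernel wrappers are function templates defined in a .cu translation unit: explicit instantiation emits the concrete (scalar type, block size, storage scheme) combinations so that other translation units can link against the declarations in the header. Stripped of the CUDA specifics, the mechanism is just explicit template instantiation (scaleVector is an illustrative placeholder):

// In the header: only the declaration is visible to other translation units.
template <class T>
void scaleVector(T* x, int n, T factor);

// In one .cu/.cpp file: the definition plus the instantiations we want emitted.
template <class T>
void scaleVector(T* x, int n, T factor)
{
    for (int i = 0; i < n; ++i) {
        x[i] *= factor;
    }
}

template void scaleVector<float>(float*, int, float);
template void scaleVector<double>(double*, int, double);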
View File

@ -46,7 +46,8 @@ void solveUpperLevelSet(T* reorderedMat,
int startIdx,
int rowsInLevelSet,
T* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
@ -72,7 +73,8 @@ void solveLowerLevelSet(T* reorderedMat,
int rowsInLevelSet,
const T* d,
T* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
@ -99,7 +101,8 @@ void solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
int rowsInLevelSet,
const DiagonalScalar* dInv,
LinearSolverScalar* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
@ -127,7 +130,8 @@ void solveLowerLevelSetSplit(MatrixScalar* reorderedLowerMat,
int rowsInLevelSet,
const LinearSolverScalar* d,
LinearSolverScalar* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Computes the ILU Factorization of the input bcsr matrix, which is stored in a reordered way. The diagonal