Use CUDA graphs in GPU DILU and ILU0 apply

Expected to give a performance gain on current
NVIDIA cards and no significant change on AMD cards
for cases of reasonable size.

Uses the new GPU convenience objects.
Tobias Meyer Andersen 2024-12-05 10:47:05 +01:00
parent 2babeb848d
commit 66fa971079
12 changed files with 417 additions and 149 deletions
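
For orientation, here is a minimal sketch (not part of the diff) of the capture/instantiate/launch pattern the apply() methods below use; enqueueAllKernels is a hypothetical stand-in for the preconditioner's sequence of small kernel launches, and error checking is omitted.

#include <cuda_runtime.h>

void enqueueAllKernels(cudaStream_t stream); // placeholder for the many small kernel launches

void applyViaGraph(cudaStream_t stream, cudaGraph_t& graph, cudaGraphExec_t& exec, bool& captured)
{
    if (!captured) {
        // Record the whole kernel sequence once on the dedicated stream
        cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
        enqueueAllKernels(stream);
        cudaStreamEndCapture(stream, &graph);
        cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0);
        captured = true;
    }
    // Replay the recorded sequence with a single launch, avoiding per-kernel launch overhead
    cudaGraphLaunch(exec, stream);
}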

View File

@ -188,6 +188,7 @@ GpuBuffer<T>::copyFromHost(const std::vector<T>& data)
{
copyFromHost(data.data(), data.size());
}
template <class T>
void
GpuBuffer<T>::copyToHost(std::vector<T>& data) const

View File

@ -27,6 +27,7 @@
#include <opm/simulators/linalg/gpuistl/GpuView.hpp>
#include <vector>
#include <string>
#include <cuda_runtime.h>
namespace Opm::gpuistl

View File

@ -37,6 +37,7 @@
#include <functional>
#include <utility>
#include <string>
namespace Opm::gpuistl
{
@ -93,7 +94,8 @@ GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int
}
}
computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
reorderAndSplitMatrix(m_moveThreadBlockSize);
computeDiagonal(m_DILUFactorizationThreadBlockSize);
if (m_tuneThreadBlockSizes) {
tuneThreadBlockSizes();
@ -110,10 +112,31 @@ template <class M, class X, class Y, int l>
void
GpuDILU<M, X, Y, l>::apply(X& v, const Y& d)
{
// ensure that this stream only starts doing work when main stream is completed up to this point
OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0));
OPM_TIMEBLOCK(prec_apply);
{
apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
const auto ptrs = std::make_pair(v.data(), d.data());
auto it = m_apply_graphs.find(ptrs);
if (it == m_apply_graphs.end()) {
OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream.get(), cudaStreamCaptureModeGlobal));
// The apply function consists of many small function calls, each of which launches a kernel
apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream.get(), &m_apply_graphs[ptrs].get()));
OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs].get(), m_apply_graphs[ptrs].get(), nullptr, nullptr, 0));
}
OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs].get(), 0));
}
// ensure that main stream only continues after this stream is completed
OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get()));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0));
}
template <class M, class X, class Y, int l>
@ -135,7 +158,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
m_gpuDInvFloat->data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
} else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float, field_type>(
m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
@ -147,7 +171,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
m_gpuDInv.data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
} else {
detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, field_type, field_type>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
@ -159,7 +184,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
m_gpuDInv.data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
}
} else {
detail::DILU::solveLowerLevelSet<field_type, blocksize_>(
@ -172,7 +198,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
m_gpuDInv.data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
}
levelStartIdx += numOfRowsInLevel;
}
@ -193,7 +220,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
numOfRowsInLevel,
m_gpuDInvFloat->data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
} else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG){
detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, float>(
m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
@ -204,7 +232,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
numOfRowsInLevel,
m_gpuDInv.data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
} else {
detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, field_type>(
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
@ -215,7 +244,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
numOfRowsInLevel,
m_gpuDInv.data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
}
} else {
detail::DILU::solveUpperLevelSet<field_type, blocksize_>(
@ -227,7 +257,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
numOfRowsInLevel,
m_gpuDInv.data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
}
}
}
@ -252,7 +283,9 @@ GpuDILU<M, X, Y, l>::update()
{
OPM_TIMEBLOCK(prec_update);
{
update(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
reorderAndSplitMatrix(m_moveThreadBlockSize);
computeDiagonal(m_DILUFactorizationThreadBlockSize);
}
}
@ -260,13 +293,22 @@ template <class M, class X, class Y, int l>
void
GpuDILU<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationBlockSize)
{
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
computeDiagAndMoveReorderedData(moveThreadBlockSize, factorizationBlockSize);
// ensure that this stream only starts doing work when main stream is completed up to this point
OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0));
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
reorderAndSplitMatrix(moveThreadBlockSize);
computeDiagonal(factorizationBlockSize);
// ensure that main stream only continues after this stream is completed
OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get()));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0));
}
template <class M, class X, class Y, int l>
void
GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationBlockSize)
GpuDILU<M, X, Y, l>::reorderAndSplitMatrix(int moveThreadBlockSize)
{
if (m_splitMatrix) {
detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
@ -290,7 +332,12 @@ GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in
m_gpuMatrixReordered->N(),
moveThreadBlockSize);
}
}
template <class M, class X, class Y, int l>
void
GpuDILU<M, X, Y, l>::computeDiagonal(int factorizationBlockSize)
{
int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
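The apply() and update() hunks above fence the dedicated stream against the main (default) stream with two events. A minimal stand-alone sketch of that fencing, assuming raw CUDA handles rather than the GPUStream/GPUEvent wrappers (launchWorkOn is a placeholder):

#include <cuda_runtime.h>

void launchWorkOn(cudaStream_t stream); // placeholder for the preconditioner kernels

void fencedAgainstDefaultStream(cudaStream_t worker, cudaEvent_t before, cudaEvent_t after)
{
    // Worker stream starts only once the default stream has reached this point
    cudaEventRecord(before, 0);
    cudaStreamWaitEvent(worker, before, 0);

    launchWorkOn(worker);

    // Default stream resumes only after the worker stream has finished this work
    cudaEventRecord(after, worker);
    cudaStreamWaitEvent(0, after, 0);
}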

View File

@ -24,8 +24,10 @@
#include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
#include <opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp>
#include <opm/simulators/linalg/gpuistl/gpu_resources.hpp>
#include <vector>
#include <map>
#include <utility>
namespace Opm::gpuistl
@ -82,8 +84,11 @@ public:
//! \brief Updates the matrix data.
void update() final;
//! \brief perform matrix splitting and reordering
void reorderAndSplitMatrix(int moveThreadBlockSize);
//! \brief Compute the diagonal of the DILU, and update the data of the reordered matrix
void computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationThreadBlockSize);
void computeDiagonal(int factorizationThreadBlockSize);
//! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
void tuneThreadBlockSizes();
@ -153,6 +158,16 @@ private:
int m_lowerSolveThreadBlockSize = -1;
int m_moveThreadBlockSize = -1;
int m_DILUFactorizationThreadBlockSize = -1;
// Graphs for Apply
std::map<std::pair<field_type*, const field_type*>, GPUGraph> m_apply_graphs;
std::map<std::pair<field_type*, const field_type*>, GPUGraphExec> m_executableGraphs;
// Stream for the DILU operations on the GPU
GPUStream m_stream{};
// Events for synchronization with main stream
GPUEvent m_before{};
GPUEvent m_after{};
};
} // end namespace Opm::gpuistl
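A note on the new members: a captured graph replays the exact device addresses it saw during capture, so the cache is keyed on the (v.data(), d.data()) pointer pair and a new graph is captured whenever a new pair appears. A small illustrative sketch of that lookup (not part of the diff), assuming field_type is double:

#include <cuda_runtime.h>
#include <map>
#include <utility>

using Key = std::pair<double*, const double*>;   // (output, input) device pointers
std::map<Key, cudaGraphExec_t> g_execCache;      // hypothetical stand-in for m_executableGraphs

bool replayIfCached(double* v, const double* d, cudaStream_t stream)
{
    auto it = g_execCache.find(Key{v, d});
    if (it == g_execCache.end()) {
        return false;                            // caller captures and instantiates a new graph
    }
    cudaGraphLaunch(it->second, stream);         // safe: same addresses as at capture time
    return true;
}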

View File

@ -266,6 +266,19 @@ GpuVector<T>::copyFromHost(const T* dataPointer, size_t numberOfElements)
OPM_GPU_SAFE_CALL(cudaMemcpy(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice));
}
template <class T>
void
GpuVector<T>::copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream)
{
if (numberOfElements > dim()) {
OPM_THROW(std::runtime_error,
fmt::format("Requesting to copy too many elements. Vector has {} elements, while {} was requested.",
dim(),
numberOfElements));
}
OPM_GPU_SAFE_CALL(cudaMemcpyAsync(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice, stream));
}
template <class T>
void
GpuVector<T>::copyToHost(T* dataPointer, size_t numberOfElements) const
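One caveat for the new stream-taking overload: cudaMemcpyAsync only overlaps with other work when the host buffer is page-locked; from pageable memory the runtime stages the copy and it behaves much like a synchronous one. A minimal sketch with a pinned host buffer (illustrative, not part of the diff):

#include <cuda_runtime.h>

void uploadAsync(double* devicePtr, size_t n, cudaStream_t stream)
{
    double* hostPinned = nullptr;
    cudaMallocHost(reinterpret_cast<void**>(&hostPinned), n * sizeof(double)); // page-locked allocation enables true async copies
    // ... fill hostPinned on the CPU ...
    cudaMemcpyAsync(devicePtr, hostPinned, n * sizeof(double), cudaMemcpyHostToDevice, stream);
    cudaStreamSynchronize(stream);  // the buffer must stay valid until the copy completes
    cudaFreeHost(hostPinned);
}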

View File

@ -203,6 +203,7 @@ public:
* @note assumes that this vector has numberOfElements elements
*/
void copyFromHost(const T* dataPointer, size_t numberOfElements);
void copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream);
/**
* @brief copyFromHost copies numberOfElements to the CPU memory dataPointer

View File

@ -37,6 +37,7 @@
#include <string>
#include <tuple>
#include <utility>
namespace Opm::gpuistl
{
@ -58,7 +59,6 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
, m_tuneThreadBlockSizes(tuneKernels)
, m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme))
{
// TODO: Should in some way verify that this matrix is symmetric, only do it in debug mode?
// Some sanity check
OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
@ -98,7 +98,8 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
}
}
LUFactorizeAndMoveData(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
reorderAndSplitMatrix(m_moveThreadBlockSize);
LUFactorizeMatrix(m_ILU0FactorizationThreadBlockSize);
if (m_tuneThreadBlockSizes) {
tuneThreadBlockSizes();
@ -117,7 +118,29 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d)
{
OPM_TIMEBLOCK(prec_apply);
{
apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
// ensure that this stream only starts doing work when main stream is completed up to this point
OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0));
const auto ptrs = std::make_pair(v.data(), d.data());
auto it = m_apply_graphs.find(ptrs);
if (it == m_apply_graphs.end()) {
OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream.get(), cudaStreamCaptureModeGlobal));
// The apply function consists of many small function calls, each of which launches a kernel
apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream.get(), &m_apply_graphs[ptrs].get()));
OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs].get(), m_apply_graphs[ptrs].get(), nullptr, nullptr, 0));
}
OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs].get(), 0));
// ensure that main stream only continues after this stream is completed
OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get()));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0));
}
}
@ -142,7 +165,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
numOfRowsInLevel,
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
}
else{
detail::ILU0::solveLowerLevelSetSplit<blocksize_, field_type, field_type>(
@ -154,7 +178,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
numOfRowsInLevel,
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
}
} else {
detail::ILU0::solveLowerLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
@ -165,7 +190,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
numOfRowsInLevel,
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
}
levelStartIdx += numOfRowsInLevel;
}
@ -185,7 +211,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
numOfRowsInLevel,
m_gpuMatrixReorderedDiagFloat.value().data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
}
else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, float, field_type>(
@ -197,7 +224,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
numOfRowsInLevel,
m_gpuMatrixReorderedDiag.value().data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
}
else{
detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, field_type, field_type>(
@ -209,7 +237,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
numOfRowsInLevel,
m_gpuMatrixReorderedDiag.value().data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
}
} else {
detail::ILU0::solveUpperLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
@ -219,7 +248,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
levelStartIdx,
numOfRowsInLevel,
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
}
}
}
@ -241,10 +271,9 @@ template <class M, class X, class Y, int l>
void
OpmGpuILU0<M, X, Y, l>::update()
{
OPM_TIMEBLOCK(prec_update);
{
update(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
}
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
reorderAndSplitMatrix(m_moveThreadBlockSize);
LUFactorizeMatrix(m_ILU0FactorizationThreadBlockSize);
}
template <class M, class X, class Y, int l>
@ -253,13 +282,23 @@ OpmGpuILU0<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationThreadB
{
OPM_TIMEBLOCK(prec_update);
{
// ensure that this stream only starts doing work when main stream is completed up to this point
OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0));
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
LUFactorizeAndMoveData(moveThreadBlockSize, factorizationThreadBlockSize);
reorderAndSplitMatrix(moveThreadBlockSize);
LUFactorizeMatrix(factorizationThreadBlockSize);
// ensure that main stream only continues after this stream is completed
OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get()));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0));
}
}
template <class M, class X, class Y, int l>
void
OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize)
OpmGpuILU0<M, X, Y, l>::reorderAndSplitMatrix(int moveThreadBlockSize)
{
if (m_splitMatrix) {
detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
@ -283,6 +322,12 @@ OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
m_gpuReorderedLU->N(),
moveThreadBlockSize);
}
}
template <class M, class X, class Y, int l>
void
OpmGpuILU0<M, X, Y, l>::LUFactorizeMatrix(int factorizationThreadBlockSize)
{
int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();

View File

@ -24,6 +24,7 @@
#include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
#include <opm/simulators/linalg/gpuistl/gpu_resources.hpp>
#include <opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp>
#include <optional>
#include <type_traits>
@ -84,8 +85,11 @@ public:
//! \brief Updates the matrix data.
void update() final;
//! \brief perform matrix splitting and reordering
void reorderAndSplitMatrix(int moveThreadBlockSize);
//! \brief Compute LU factorization, and update the data of the reordered matrix
void LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize);
void LUFactorizeMatrix(int factorizationThreadBlockSize);
//! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
void tuneThreadBlockSizes();
@ -152,6 +156,16 @@ private:
int m_lowerSolveThreadBlockSize = -1;
int m_moveThreadBlockSize = -1;
int m_ILU0FactorizationThreadBlockSize = -1;
// Graphs for Apply
std::map<std::pair<field_type*, const field_type*>, GPUGraph> m_apply_graphs;
std::map<std::pair<field_type*, const field_type*>, GPUGraphExec> m_executableGraphs;
// Stream for the ILU0 operations on the GPU
GPUStream m_stream{};
// Events for synchronization with main stream
GPUEvent m_before{};
GPUEvent m_after{};
};
} // end namespace Opm::gpuistl

View File

@ -85,10 +85,12 @@ namespace
// TODO: remove the first condition in the for loop
for (int block = nnzIdx; block < nnzIdxLim; ++block) {
const int col = colIndices[block];
mmvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
mmvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
mvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
@ -137,10 +139,12 @@ namespace
LinearSolverScalar rhs[blocksize] = {0};
for (int block = nnzIdx; block < nnzIdxLim; ++block) {
const int col = colIndices[block];
umvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
umvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mmvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
mmvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
@ -211,8 +215,8 @@ namespace
}
}
// TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate results
// TOOD: The important part is to only cast after that is fully computed
// TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate
// results. TODO: The important part is to only cast after that is fully computed
template <int blocksize, class InputScalar, class OutputScalar, MatrixStorageMPScheme mixedPrecisionScheme>
__global__ void cuComputeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
int* lowerRowIndices,
@ -239,7 +243,8 @@ namespace
InputScalar dInvTmp[blocksize * blocksize];
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
dInvTmp[i * blocksize + j] = srcDiagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
dInvTmp[i * blocksize + j]
= srcDiagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
}
}
@ -257,21 +262,26 @@ namespace
if constexpr (detail::storeOffDiagonalAsFloat(mixedPrecisionScheme)) {
// TODO: think long and hard about whether this performs only the wanted memory transfers
moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedLowerMat[block * blocksize * blocksize], &dstLowerMat[block * blocksize * blocksize]);
moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize], &dstUpperMat[symOppositeBlock * blocksize * blocksize]);
moveBlock<blocksize, InputScalar, OutputScalar>(
&srcReorderedLowerMat[block * blocksize * blocksize],
&dstLowerMat[block * blocksize * blocksize]);
moveBlock<blocksize, InputScalar, OutputScalar>(
&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
&dstUpperMat[symOppositeBlock * blocksize * blocksize]);
}
mmx2Subtraction<InputScalar, blocksize>(&srcReorderedLowerMat[block * blocksize * blocksize],
&dInv[col * blocksize * blocksize],
&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
dInvTmp);
&dInv[col * blocksize * blocksize],
&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
dInvTmp);
}
invBlockInPlace<InputScalar, blocksize>(dInvTmp);
moveBlock<blocksize, InputScalar, InputScalar>(dInvTmp, &dInv[reorderedRowIdx * blocksize * blocksize]);
if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)) {
moveBlock<blocksize, InputScalar, OutputScalar>(dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important!
moveBlock<blocksize, InputScalar, OutputScalar>(
dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important!
}
}
}
@ -289,12 +299,13 @@ solveLowerLevelSet(T* reorderedMat,
const T* dInv,
const T* d,
T* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize
= ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}
@ -310,13 +321,15 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
const DiagonalScalar* dInv,
const LinearSolverScalar* d,
LinearSolverScalar* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>
<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}
// perform the upper solve for all rows in the same level set
template <class T, int blocksize>
@ -329,12 +342,13 @@ solveUpperLevelSet(T* reorderedMat,
int rowsInLevelSet,
const T* dInv,
T* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize
= ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}
@ -348,13 +362,15 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
int rowsInLevelSet,
const DiagonalScalar* dInv,
LinearSolverScalar* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>
<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}
template <class T, int blocksize>
@ -409,43 +425,134 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, scheme>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, scheme><<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
lowerRowIndices,
lowerColIndices,
srcReorderedUpperMat,
upperRowIndices,
upperColIndices,
srcDiagonal,
reorderedToNatural,
naturalToReordered,
startIdx,
rowsInLevelSet,
dInv,
dstDiag,
dstLowerMat,
dstUpperMat);
cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, scheme>
<<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
lowerRowIndices,
lowerColIndices,
srcReorderedUpperMat,
upperRowIndices,
upperColIndices,
srcDiagonal,
reorderedToNatural,
naturalToReordered,
startIdx,
rowsInLevelSet,
dInv,
dstDiag,
dstLowerMat,
dstUpperMat);
} else {
OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
}
}
// TODO: format
#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*, int); \
template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);
template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const int, \
int, \
T*, \
double*, \
double*, \
double*, \
int); \
template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const int, \
int, \
T*, \
float*, \
float*, \
float*, \
int); \
template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const int, \
int, \
T*, \
float*, \
float*, \
float*, \
int); \
template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const int, \
int, \
T*, \
double*, \
double*, \
double*, \
int); \
template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const int, \
int, \
T*, \
float*, \
float*, \
float*, \
int); \
template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const int, \
int, \
T*, \
double*, \
double*, \
double*, \
int); \
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \
template void solveLowerLevelSet<T, blocksize>( \
T*, int*, int*, int*, int, int, const T*, const T*, T*, int, cudaStream_t);
INSTANTIATE_KERNEL_WRAPPERS(float, 1);
INSTANTIATE_KERNEL_WRAPPERS(float, 2);
@ -460,19 +567,29 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 4);
INSTANTIATE_KERNEL_WRAPPERS(double, 5);
INSTANTIATE_KERNEL_WRAPPERS(double, 6);
#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar) \
template void solveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>( \
MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int); \
template void solveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>( \
MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, const LinearSolverScalar*, LinearSolverScalar*, int);
#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar) \
template void solveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>( \
MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int, cudaStream_t); \
template void solveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>( \
MatrixScalar*, \
int*, \
int*, \
int*, \
int, \
int, \
const DiagonalScalar*, \
const LinearSolverScalar*, \
LinearSolverScalar*, \
int, \
cudaStream_t);
// TODO: be smarter about this... Surely this instantiates many more combinations than are actually needed
#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, double); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, double); \
#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, double); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, double); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(1);
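The recurring change in these wrappers is the fourth launch-configuration argument: a kernel launched without it goes to the default stream and would not be recorded by a capture running on m_stream. A tiny illustrative sketch of a stream-targeted launch (not part of the diff):

#include <cuda_runtime.h>

__global__ void addOne(double* x)
{
    if (threadIdx.x == 0) {
        x[0] += 1.0;
    }
}

void launchOnStream(double* x, cudaStream_t stream)
{
    // <<<grid, block, sharedMemBytes, stream>>> -- the stream argument routes the launch
    addOne<<<1, 32, 0, stream>>>(x);
}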

View File

@ -54,7 +54,8 @@ void solveLowerLevelSet(T* reorderedMat,
const T* dInv,
const T* d,
T* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
@ -82,7 +83,8 @@ void solveLowerLevelSetSplit(MatrixScalar* reorderedUpperMat,
const DiagonalScalar* dInv,
const LinearSolverScalar* d,
LinearSolverScalar* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
@ -108,7 +110,8 @@ void solveUpperLevelSet(T* reorderedMat,
int rowsInLevelSet,
const T* dInv,
T* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
@ -134,7 +137,8 @@ void solveUpperLevelSetSplit(MatrixScalar* reorderedUpperMat,
int rowsInLevelSet,
const DiagonalScalar* dInv,
LinearSolverScalar* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Computes the ILU0 of the diagonal elements of the reordered matrix and stores it in a reordered vector

View File

@ -215,9 +215,9 @@ namespace
// as of now, if we are using mixed precision, then we are always storing the off-diagonals as floats,
// and sometimes also the diagonal.
if constexpr (detail::usingMixedPrecision(mixedPrecisionScheme)) {
// if we are want to store the entire matrix as a float then we must also move the diagonal block from double to float
// if not then we just use the double diagonal that is already now stored in srcDiagonal
if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)){
// if we want to store the entire matrix as a float, then we must also move the diagonal block from
// double to float; if not, we just use the double diagonal that is already stored in srcDiagonal
if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)) {
moveBlock<blocksize, InputScalar, OutputScalar>(&srcDiagonal[reorderedIdx * scalarsInBlock],
&dstDiagonal[reorderedIdx * scalarsInBlock]);
}
@ -362,12 +362,13 @@ solveLowerLevelSet(T* reorderedMat,
int rowsInLevelSet,
const T* d,
T* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize
= ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
}
// perform the upper solve for all rows in the same level set
@ -380,12 +381,13 @@ solveUpperLevelSet(T* reorderedMat,
int startIdx,
int rowsInLevelSet,
T* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize
= ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, v);
}
@ -399,13 +401,15 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
int rowsInLevelSet,
const LinearSolverScalar* d,
LinearSolverScalar* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>
<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
}
// perform the upper solve for all rows in the same level set
template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
@ -418,13 +422,15 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
int rowsInLevelSet,
const DiagonalScalar* dInv,
LinearSolverScalar* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>
<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}
template <class T, int blocksize>
@ -484,28 +490,28 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
}
#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, T*, int); \
template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, T*, int, cudaStream_t); \
template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \
template void LUFactorization<T, blocksize>(T*, int*, int*, int*, int*, size_t, int, int); \
template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);
template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);
#define INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(T) \
INSTANTIATE_KERNEL_WRAPPERS(T, 1); \
INSTANTIATE_KERNEL_WRAPPERS(T, 2); \
INSTANTIATE_KERNEL_WRAPPERS(T, 3); \
INSTANTIATE_KERNEL_WRAPPERS(T, 4); \
INSTANTIATE_KERNEL_WRAPPERS(T, 5); \
#define INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(T) \
INSTANTIATE_KERNEL_WRAPPERS(T, 1); \
INSTANTIATE_KERNEL_WRAPPERS(T, 2); \
INSTANTIATE_KERNEL_WRAPPERS(T, 3); \
INSTANTIATE_KERNEL_WRAPPERS(T, 4); \
INSTANTIATE_KERNEL_WRAPPERS(T, 5); \
INSTANTIATE_KERNEL_WRAPPERS(T, 6);
INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(float)
@ -514,32 +520,32 @@ INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(double)
#define INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(blocksize) \
/* double preconditioner */ \
template void solveLowerLevelSetSplit<blocksize, double, double>( \
double*, int*, int*, int*, int, int, const double*, double*, int); \
double*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
/* float matrix, double compute preconditioner */ \
template void solveLowerLevelSetSplit<blocksize, double, float>( \
float*, int*, int*, int*, int, int, const double*, double*, int); \
float*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
/* float preconditioner */ \
template void solveLowerLevelSetSplit<blocksize, float, float>( \
float*, int*, int*, int*, int, int, const float*, float*, int); \
float*, int*, int*, int*, int, int, const float*, float*, int, cudaStream_t); \
\
/* double preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, double, double, double>( \
double*, int*, int*, int*, int, int, const double*, double*, int); \
template void solveUpperLevelSetSplit<blocksize, double, double, double>( \
double*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
/* float matrix, double compute preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, double, float, double>( \
float*, int*, int*, int*, int, int, const double*, double*, int); \
template void solveUpperLevelSetSplit<blocksize, double, float, double>( \
float*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
/* float preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, float, float, double>( \
float*, int*, int*, int*, int, int, const double*, float*, int); \
template void solveUpperLevelSetSplit<blocksize, float, float, double>( \
float*, int*, int*, int*, int, int, const double*, float*, int, cudaStream_t); \
/* double preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, double, double, float>( \
double*, int*, int*, int*, int, int, const float*, double*, int); \
template void solveUpperLevelSetSplit<blocksize, double, double, float>( \
double*, int*, int*, int*, int, int, const float*, double*, int, cudaStream_t); \
/* float matrix, double compute preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, double, float, float>( \
float*, int*, int*, int*, int, int, const float*, double*, int); \
template void solveUpperLevelSetSplit<blocksize, double, float, float>( \
float*, int*, int*, int*, int, int, const float*, double*, int, cudaStream_t); \
/* float preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, float, float, float>( \
float*, int*, int*, int*, int, int, const float*, float*, int);
template void solveUpperLevelSetSplit<blocksize, float, float, float>( \
float*, int*, int*, int*, int, int, const float*, float*, int, cudaStream_t);
INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(1);

View File

@ -46,7 +46,8 @@ void solveUpperLevelSet(T* reorderedMat,
int startIdx,
int rowsInLevelSet,
T* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
@ -72,7 +73,8 @@ void solveLowerLevelSet(T* reorderedMat,
int rowsInLevelSet,
const T* d,
T* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
@ -99,7 +101,8 @@ void solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
int rowsInLevelSet,
const DiagonalScalar* dInv,
LinearSolverScalar* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Perform an lower solve on certain rows in a matrix that can safely be computed in parallel
@ -127,7 +130,8 @@ void solveLowerLevelSetSplit(MatrixScalar* reorderedLowerMat,
int rowsInLevelSet,
const LinearSolverScalar* d,
LinearSolverScalar* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Computes the ILU Factorization of the input bcsr matrix, which is stored in a reordered way. The diagonal