Working mixed precision (MP) for GPU DILU

This commit is contained in:
Tobias Meyer Andersen 2024-10-15 13:22:55 +02:00
parent 1d49eadd15
commit 3d67191d49
5 changed files with 231 additions and 108 deletions

View File

@@ -83,10 +83,13 @@ GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, boo
}
if (m_storeFactorizationAsFloat) {
if (!m_splitMatrix) {
OPM_THROW(std::runtime_error, "Matrix must be split when storing as float.");
}
m_gpuMatrixReorderedLowerFloat = std::make_unique<FloatMat>(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_);
m_gpuMatrixReorderedUpperFloat = std::make_unique<FloatMat>(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_);
m_gpuMatrixReorderedDiagFloat = std::make_unique<FloatVec>(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize());
m_gpuDInvFloat = std::make_unique<FloatVec>(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize());
}
computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
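For orientation: the new behavior is opt-in via the constructor shown in the hunk header above. A minimal usage sketch (not part of this commit; the truncated fourth parameter is assumed to be the store-factorization-as-float flag, and M, X, Y, l are whatever the caller already instantiates):

// Hypothetical sketch: store the DILU factorization in float while solving in the
// matrix field type. Requires the split-matrix layout, otherwise the constructor throws.
auto dilu = Opm::gpuistl::GpuDILU<M, X, Y, l>(
    A,     // system matrix
    true,  // splitMatrix: must be true when storing the factorization as float
    true,  // tuneKernels
    true); // storeFactorizationAsFloat (assumed name, truncated in the hunk header above)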
@@ -120,17 +123,31 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
if (m_splitMatrix) {
if (m_storeFactorizationAsFloat) {
detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float>(
m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
m_gpuMatrixReorderedLowerFloat->getRowIndices().data(),
m_gpuMatrixReorderedLowerFloat->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuDInvFloat->data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
} else {
detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, field_type>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuDInv.data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
}
} else {
detail::DILU::solveLowerLevelSet<field_type, blocksize_>(
m_gpuMatrixReordered->getNonZeroValues().data(),
@@ -153,16 +170,29 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
const int numOfRowsInLevel = m_levelSets[level].size();
levelStartIdx -= numOfRowsInLevel;
if (m_splitMatrix) {
if (m_storeFactorizationAsFloat) {
detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, float>(
m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
m_gpuMatrixReorderedUpperFloat->getRowIndices().data(),
m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuDInvFloat->data(),
v.data(),
upperSolveThreadBlockSize);
} else {
detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, field_type>(
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuDInv.data(),
v.data(),
upperSolveThreadBlockSize);
}
} else {
detail::DILU::solveUpperLevelSet<field_type, blocksize_>(
m_gpuMatrixReordered->getNonZeroValues().data(),
@@ -241,20 +271,44 @@ GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
if (m_splitMatrix) {
if (m_storeFactorizationAsFloat) {
detail::DILU::computeDiluDiagonalSplit<blocksize_, field_type, float, true>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
m_gpuMatrixReorderedDiag->data(),
m_gpuReorderToNatural.data(),
m_gpuNaturalToReorder.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuDInv.data(),
m_gpuDInvFloat->data(),
m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
factorizationBlockSize);
} else {
// TODO: should the template take field_type twice here, or field_type and then float?
detail::DILU::computeDiluDiagonalSplit<blocksize_, field_type, float, false>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
m_gpuMatrixReorderedDiag->data(),
m_gpuReorderToNatural.data(),
m_gpuNaturalToReorder.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuDInv.data(),
nullptr,
nullptr,
nullptr,
factorizationBlockSize);
}
} else {
detail::DILU::computeDiluDiagonal<field_type, blocksize_>(
m_gpuMatrixReordered->getNonZeroValues().data(),

View File

@@ -133,6 +133,7 @@ private:
std::unique_ptr<FloatMat> m_gpuMatrixReorderedLowerFloat;
std::unique_ptr<FloatMat> m_gpuMatrixReorderedUpperFloat;
std::unique_ptr<FloatVec> m_gpuMatrixReorderedDiagFloat;
std::unique_ptr<FloatVec> m_gpuDInvFloat;
//! row conversion from natural to reordered matrix indices stored on the GPU
GpuVector<int> m_gpuNaturalToReorder;
//! row conversion from reordered to natural matrix indices stored on the GPU

View File

@@ -159,7 +159,7 @@ mmv(const T* a, const T* b, T* c)
// dst -= A*B*C
template <class T, int blocksize>
__device__ __forceinline__ void
mmx2Subtraction(const T* A, const T* B, const T* C, T* dst)
{
T tmp[blocksize * blocksize] = {0};
@@ -256,6 +256,19 @@ mvMixedGeneral(const MatrixScalar* A, const VectorScalar* b, ResultScalar* c)
}
}
// TODO: consider merging with existing block operations
// Mixed precision general version of c += Ab
template <int blocksize, class MatrixScalar, class VectorScalar, class ResultScalar, class ComputeScalar>
__device__ __forceinline__ void
umvMixedGeneral(const MatrixScalar* A, const VectorScalar* b, ResultScalar* c)
{
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
c[i] += ResultScalar(ComputeScalar(A[i * blocksize + j]) * ComputeScalar(b[j]));
}
}
}
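The split kernels below also rely on moveBlock, which is not part of this diff. A minimal sketch consistent with its call sites, assuming it is a plain element-wise cast-and-copy of one block:

// Assumed shape of moveBlock (presumably defined elsewhere): cast-and-copy one
// blocksize x blocksize block from InputScalar to OutputScalar storage.
template <int blocksize, class InputScalar, class OutputScalar>
__device__ __forceinline__ void
moveBlock(const InputScalar* src, OutputScalar* dst)
{
    for (int i = 0; i < blocksize * blocksize; ++i) {
        dst[i] = OutputScalar(src[i]);
    }
}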
// TODO: consider merging with existing block operations
// Mixed precision general version of c -= Ab
template <int blocksize, class MatrixScalar, class VectorScalar, class ResultScalar, class ComputeScalar>

View File

@@ -59,16 +59,16 @@ namespace
}
}
template <int blocksize, class LinearSolverScalar, class MatrixScalar>
__global__ void cuSolveLowerLevelSetSplit(MatrixScalar* mat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const MatrixScalar* dInv,
const LinearSolverScalar* d,
LinearSolverScalar* v)
{
const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
@@ -77,7 +77,7 @@ namespace
const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
const int naturalRowIdx = indexConversion[reorderedRowIdx];
LinearSolverScalar rhs[blocksize];
for (int i = 0; i < blocksize; i++) {
rhs[i] = d[naturalRowIdx * blocksize + i];
}
@@ -85,10 +85,10 @@ namespace
// TODO: remove the first condition in the for loop
for (int block = nnzIdx; block < nnzIdxLim; ++block) {
const int col = colIndices[block];
mmvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
@@ -118,15 +118,15 @@ namespace
}
}
template <int blocksize, class LinearSolverScalar, class MatrixScalar>
__global__ void cuSolveUpperLevelSetSplit(MatrixScalar* mat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const MatrixScalar* dInv,
LinearSolverScalar* v)
{
const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
@@ -134,13 +134,13 @@ namespace
const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
const int naturalRowIdx = indexConversion[reorderedRowIdx];
LinearSolverScalar rhs[blocksize] = {0};
for (int block = nnzIdx; block < nnzIdxLim; ++block) {
const int col = colIndices[block];
umvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mmvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
@@ -211,19 +211,24 @@ namespace
}
}
// TODO: rewrite so that during the factorization there is a dInv of InputScalar type holding intermediate results,
// and the cast to OutputScalar happens only after it is fully computed
template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix>
__global__ void cuComputeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
const InputScalar* srcReorderedUpperMat,
int* upperRowIndices,
int* upperColIndices,
const InputScalar* srcDiagonal,
int* reorderedToNatural,
int* naturalToReordered,
const int startIdx,
int rowsInLevelSet,
InputScalar* dInv,
OutputScalar* dstDiag, // TODO: should this be diag or dInv?
OutputScalar* dstLowerMat,
OutputScalar* dstUpperMat)
{
const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
@@ -231,10 +236,10 @@ namespace
const size_t lowerRowStart = lowerRowIndices[reorderedRowIdx];
const size_t lowerRowEnd = lowerRowIndices[reorderedRowIdx + 1];
InputScalar dInvTmp[blocksize * blocksize];
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
dInvTmp[i * blocksize + j] = srcDiagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
}
}
@@ -250,18 +255,28 @@ namespace
const int symOppositeBlock = symOppositeIdx;
if constexpr (copyResultToOtherMatrix) {
// TODO: verify that this performs only the intended memory transfers
moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedLowerMat[block * blocksize * blocksize], &dstLowerMat[block * blocksize * blocksize]);
moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize], &dstUpperMat[symOppositeBlock * blocksize * blocksize]);
}
mmx2Subtraction<InputScalar, blocksize>(&srcReorderedLowerMat[block * blocksize * blocksize],
&dInv[col * blocksize * blocksize],
&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
dInvTmp);
}
invBlockInPlace<InputScalar, blocksize>(dInvTmp);
moveBlock<blocksize, InputScalar, InputScalar>(dInvTmp, &dInv[reorderedRowIdx * blocksize * blocksize]);
if constexpr (copyResultToOtherMatrix) {
moveBlock<blocksize, InputScalar, OutputScalar>(dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important: cast the fully computed inverse, not intermediate values
}
}
}
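Since copyResultToOtherMatrix is a compile-time template parameter, the <..., false> instantiation contains no copy code at all, which is why the wrapper in GpuDILU.cpp can safely pass nullptr for dstDiag, dstLowerMat and dstUpperMat. A minimal illustration of the pattern, with hypothetical names:

// Sketch only: the false instantiation never reads dst, so nullptr is safe there.
template <bool copyResult>
__global__ void copyIfRequested(const double* src, float* dst, int n)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        if constexpr (copyResult) {
            dst[i] = float(src[i]); // compiled only into copyIfRequested<true>
        }
    }
}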
@@ -289,23 +304,23 @@ solveLowerLevelSet(T* reorderedMat,
}
template <int blocksize, class LinearSolverScalar, class MatrixScalar>
void
solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const MatrixScalar* dInv,
const LinearSolverScalar* d,
LinearSolverScalar* v,
int thrBlockSize)
{
int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}
// perform the upper solve for all rows in the same level set
@@ -328,22 +343,22 @@ solveUpperLevelSet(T* reorderedMat,
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}
template <int blocksize, class LinearSolverScalar, class MatrixScalar>
void
solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const MatrixScalar* dInv,
LinearSolverScalar* v,
int thrBlockSize)
{
int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}
@@ -376,51 +391,62 @@ computeDiluDiagonal(T* reorderedMat,
}
}
template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix>
void
computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
const InputScalar* srcReorderedUpperMat,
int* upperRowIndices,
int* upperColIndices,
const InputScalar* srcDiagonal,
int* reorderedToNatural,
int* naturalToReordered,
const int startIdx,
int rowsInLevelSet,
InputScalar* dInv,
OutputScalar* dstDiag,
OutputScalar* dstLowerMat,
OutputScalar* dstUpperMat,
int thrBlockSize)
{
if (blocksize <= 3) {
int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, copyResultToOtherMatrix>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, copyResultToOtherMatrix><<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
lowerRowIndices,
lowerColIndices,
srcReorderedUpperMat,
upperRowIndices,
upperColIndices,
srcDiagonal,
reorderedToNatural,
naturalToReordered,
startIdx,
rowsInLevelSet,
dInv,
dstDiag,
dstLowerMat,
dstUpperMat);
} else {
OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
}
}
// TODO: format
#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*, int); \
template void computeDiluDiagonalSplit<blocksize, T, double, false>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
template void computeDiluDiagonalSplit<blocksize, T, float, false>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
template void computeDiluDiagonalSplit<blocksize, T, float, true>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
template void computeDiluDiagonalSplit<blocksize, T, double, true>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);
INSTANTIATE_KERNEL_WRAPPERS(float, 1);
INSTANTIATE_KERNEL_WRAPPERS(float, 2);
@@ -434,4 +460,30 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 3);
INSTANTIATE_KERNEL_WRAPPERS(double, 4);
INSTANTIATE_KERNEL_WRAPPERS(double, 5);
INSTANTIATE_KERNEL_WRAPPERS(double, 6);
#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar) \
template void solveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>( \
MatrixScalar*, int*, int*, int*, int, int, const MatrixScalar*, LinearSolverScalar*, int); \
template void solveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>( \
MatrixScalar*, int*, int*, int*, int, int, const MatrixScalar*, const LinearSolverScalar*, LinearSolverScalar*, int);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, float, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, float, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, float, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, float, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, float, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, float, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, float);
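For reference, each invocation expands to one upper and one lower explicit instantiation; e.g. INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, float), the combination exercised when m_storeFactorizationAsFloat is set, expands to:

template void solveUpperLevelSetSplit<3, double, float>(
    float*, int*, int*, int*, int, int, const float*, double*, int);
template void solveLowerLevelSetSplit<3, double, float>(
    float*, int*, int*, int*, int, int, const float*, const double*, double*, int);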
} // namespace Opm::gpuistl::detail::DILU

View File

@@ -71,16 +71,16 @@ void solveLowerLevelSet(T* reorderedMat,
* @param d Stores the defect
* @param [out] v Will store the results of the lower solve
*/
template <int blocksize, class LinearSolverScalar, class MatrixScalar>
void solveLowerLevelSetSplit(MatrixScalar* reorderedUpperMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const MatrixScalar* dInv,
const LinearSolverScalar* d,
LinearSolverScalar* v,
int threadBlockSize);
/**
@@ -124,15 +124,15 @@ void solveUpperLevelSet(T* reorderedMat,
* @param [out] v Will store the results of the upper solve. To begin with it should store the output from the lower
* solve
*/
template <int blocksize, class LinearSolverScalar, class MatrixScalar>
void solveUpperLevelSetSplit(MatrixScalar* reorderedUpperMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const MatrixScalar* dInv,
LinearSolverScalar* v,
int threadBlockSize);
/**
@@ -161,7 +161,6 @@ void computeDiluDiagonal(T* reorderedMat,
int rowsInLevelSet,
T* dInv,
int threadBlockSize);
/**
* @brief Computes the ILU0 of the diagonal elements of the split reordered matrix and stores it in a reordered vector
@@ -184,18 +183,22 @@ template <class T, int blocksize>
* function
* @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner
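* @param [out] dstDiagonal OutputScalar copy of the inverted diagonal blocks, written only when copyResultToOtherMatrix is true
* @param [out] dstLowerMat OutputScalar copy of the lower factor, written only when copyResultToOtherMatrix is true
* @param [out] dstUpperMat OutputScalar copy of the upper factor, written only when copyResultToOtherMatrix is true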
*/
template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix>
void computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
const InputScalar* srcReorderedUpperMat,
int* upperRowIndices,
int* upperColIndices,
const InputScalar* srcDiagonal,
int* reorderedToNatural,
int* naturalToReordered,
int startIdx,
int rowsInLevelSet,
InputScalar* dInv,
OutputScalar* dstDiagonal,
OutputScalar* dstLowerMat,
OutputScalar* dstUpperMat,
int threadBlockSize);
} // namespace Opm::gpuistl::detail::DILU