clang-format ILUKernels.cu

This commit is contained in:
Tobias Meyer Andersen 2024-10-02 14:59:29 +02:00
parent e82a9fa56c
commit 89df54e379

View File

@ -159,10 +159,11 @@ namespace
// the DUNE implementation is inplace, and I need to take care to make sure // the DUNE implementation is inplace, and I need to take care to make sure
// this out of place implementation is equivalent // this out of place implementation is equivalent
mmOverlap<InputScalar, blocksize>(&srcReorderedLowerMat[ij * scalarsInBlock], mmOverlap<InputScalar, blocksize>(&srcReorderedLowerMat[ij * scalarsInBlock],
&srcDiagonal[j * scalarsInBlock], &srcDiagonal[j * scalarsInBlock],
&srcReorderedLowerMat[ij * scalarsInBlock]); &srcReorderedLowerMat[ij * scalarsInBlock]);
if (copyResultToOtherMatrix){ if (copyResultToOtherMatrix) {
moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedLowerMat[ij * scalarsInBlock], &dstReorderedLowerMat[ij * scalarsInBlock]); moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedLowerMat[ij * scalarsInBlock],
&dstReorderedLowerMat[ij * scalarsInBlock]);
} }
// we have now accounted for an element under the diagonal and the diagonal element above it // we have now accounted for an element under the diagonal and the diagonal element above it
@ -202,8 +203,9 @@ namespace
// A_jk = A_ij * A_jk // A_jk = A_ij * A_jk
InputScalar tmp[scalarsInBlock] = {0}; InputScalar tmp[scalarsInBlock] = {0};
mmNoOverlap<InputScalar, blocksize>( mmNoOverlap<InputScalar, blocksize>(&srcReorderedLowerMat[ij * scalarsInBlock],
&srcReorderedLowerMat[ij * scalarsInBlock], &srcReorderedUpperMat[jk * scalarsInBlock], tmp); &srcReorderedUpperMat[jk * scalarsInBlock],
tmp);
matrixSubtraction<InputScalar, blocksize>(ikBlockPtr, tmp); matrixSubtraction<InputScalar, blocksize>(ikBlockPtr, tmp);
incrementAcrossSplitStructure(ik, ikState, endOfRowILower, startOfRowIUpper); incrementAcrossSplitStructure(ik, ikState, endOfRowILower, startOfRowIUpper);
++jk; ++jk;
@ -217,12 +219,14 @@ namespace
} }
} }
invBlockInPlace<InputScalar, blocksize>(&srcDiagonal[reorderedIdx * scalarsInBlock]); invBlockInPlace<InputScalar, blocksize>(&srcDiagonal[reorderedIdx * scalarsInBlock]);
if (copyResultToOtherMatrix){ if (copyResultToOtherMatrix) {
moveBlock<blocksize, InputScalar, OutputScalar>(&srcDiagonal[reorderedIdx * scalarsInBlock], &dstDiagonal[reorderedIdx * scalarsInBlock]); moveBlock<blocksize, InputScalar, OutputScalar>(&srcDiagonal[reorderedIdx * scalarsInBlock],
&dstDiagonal[reorderedIdx * scalarsInBlock]);
// also move all values above the diagonal on this row // also move all values above the diagonal on this row
for (int block = startOfRowIUpper; block < endOfRowIUpper; ++block) { for (int block = startOfRowIUpper; block < endOfRowIUpper; ++block) {
moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedUpperMat[block * scalarsInBlock], &dstReorderedUpperMat[block * scalarsInBlock]); moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedUpperMat[block * scalarsInBlock],
&dstReorderedUpperMat[block * scalarsInBlock]);
} }
} }
} }
@ -340,10 +344,12 @@ namespace
for (int block = nnzIdx; block < nnzIdxLim; ++block) { for (int block = nnzIdx; block < nnzIdxLim; ++block) {
const int col = colIndices[block]; const int col = colIndices[block];
mmvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); mmvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
} }
mvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); mvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
&dInv[reorderedIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
} }
} }
} // namespace } // namespace
@ -459,36 +465,37 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
int rowsInLevelSet, int rowsInLevelSet,
int thrBlockSize) int thrBlockSize)
{ {
int threadBlockSize int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
= ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar, copyResultToOtherMatrix>, thrBlockSize); cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar, copyResultToOtherMatrix>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar, copyResultToOtherMatrix><<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat, cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar, copyResultToOtherMatrix>
lowerRowIndices, <<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
lowerColIndices, lowerRowIndices,
reorderedUpperMat, lowerColIndices,
upperRowIndices, reorderedUpperMat,
upperColIndices, upperRowIndices,
srcDiagonal, upperColIndices,
dstReorderedLowerMat, srcDiagonal,
dstReorderedUpperMat, dstReorderedLowerMat,
dstDiagonal, dstReorderedUpperMat,
reorderedToNatural, dstDiagonal,
naturalToReordered, reorderedToNatural,
startIdx, naturalToReordered,
rowsInLevelSet); startIdx,
rowsInLevelSet);
} }
#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \ #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, T*, int); \ template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, T*, int); \
template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \ template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
template void LUFactorization<T, blocksize>(T*, int*, int*, int*, int*, size_t, int, int); \ template void LUFactorization<T, blocksize>(T*, int*, int*, int*, int*, size_t, int, int); \
template void LUFactorizationSplit<blocksize, T, float, true>( \ template void LUFactorizationSplit<blocksize, T, float, true>( \
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \ T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, double, true>( \ template void LUFactorizationSplit<blocksize, T, double, true>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \ T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, float, false>( \ template void LUFactorizationSplit<blocksize, T, float, false>( \
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \ T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, double, false>( \ template void LUFactorizationSplit<blocksize, T, double, false>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);
INSTANTIATE_KERNEL_WRAPPERS(float, 1); INSTANTIATE_KERNEL_WRAPPERS(float, 1);
@ -504,20 +511,26 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 4);
INSTANTIATE_KERNEL_WRAPPERS(double, 5); INSTANTIATE_KERNEL_WRAPPERS(double, 5);
INSTANTIATE_KERNEL_WRAPPERS(double, 6); INSTANTIATE_KERNEL_WRAPPERS(double, 6);
#define INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(blocksize) \ #define INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(blocksize) \
/* double preconditioner */\ /* double preconditioner */ \
template void solveLowerLevelSetSplit<blocksize, double, double>(double*, int*, int*, int*, int, int, const double*, double*, int); \ template void solveLowerLevelSetSplit<blocksize, double, double>( \
/* float matrix, double compute preconditioner */\ double*, int*, int*, int*, int, int, const double*, double*, int); \
template void solveLowerLevelSetSplit<blocksize, double, float>(float*, int*, int*, int*, int, int, const double*, double*, int); \ /* float matrix, double compute preconditioner */ \
/* float preconditioner */\ template void solveLowerLevelSetSplit<blocksize, double, float>( \
template void solveLowerLevelSetSplit<blocksize, float, float>(float*, int*, int*, int*, int, int, const float*, float*, int); \ float*, int*, int*, int*, int, int, const double*, double*, int); \
\ /* float preconditioner */ \
/* double preconditioner */\ template void solveLowerLevelSetSplit<blocksize, float, float>( \
template void solveUpperLevelSetSplit<blocksize, double, double>(double*, int*, int*, int*, int, int, const double*, double*, int); \ float*, int*, int*, int*, int, int, const float*, float*, int); \
/* float matrix, double compute preconditioner */\ \
template void solveUpperLevelSetSplit<blocksize, double, float>(float*, int*, int*, int*, int, int, const float*, double*, int); \ /* double preconditioner */ \
/* float preconditioner */\ template void solveUpperLevelSetSplit<blocksize, double, double>( \
template void solveUpperLevelSetSplit<blocksize, float, float>(float*, int*, int*, int*, int, int, const float*, float*, int); \ double*, int*, int*, int*, int, int, const double*, double*, int); \
/* float matrix, double compute preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, double, float>( \
float*, int*, int*, int*, int, int, const float*, double*, int); \
/* float preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, float, float>( \
float*, int*, int*, int*, int, int, const float*, float*, int);
INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(1); INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(1);
INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(2); INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(2);