Implement PR feedback

This commit is contained in:
Tobias Meyer Andersen 2024-10-02 14:57:34 +02:00
parent 54c2f2df30
commit e82a9fa56c
6 changed files with 58 additions and 40 deletions

View File

@ -82,8 +82,8 @@ GpuSparseMatrix<T>::GpuSparseMatrix(const T* nonZeroElements,
}
template <class T>
GpuSparseMatrix<T>::GpuSparseMatrix(const GpuVector<int> rowIndices,
const GpuVector<int> columnIndices,
GpuSparseMatrix<T>::GpuSparseMatrix(const GpuVector<int>& rowIndices,
const GpuVector<int>& columnIndices,
size_t blockSize)
: m_nonZeroElements(columnIndices.dim() * blockSize * blockSize)
, m_columnIndices(columnIndices)

View File

@ -77,8 +77,8 @@ public:
//!
//! \note We assume numberOfNonzeroBlocks, blockSize and numberOfRows all are representable as int due to
//! restrictions in the current version of cusparse. This might change in future versions.
GpuSparseMatrix(const GpuVector<int> rowIndices,
const GpuVector<int> columnIndices,
GpuSparseMatrix(const GpuVector<int>& rowIndices,
const GpuVector<int>& columnIndices,
size_t blockSize);
/**

View File

@ -86,10 +86,11 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
}
if (m_storeFactorizationAsFloat){
assert(m_splitMatrix && "Mixed precision GpuILU0 is currently only supported when using split_matrix=true");
OPM_ERROR_IF(!m_splitMatrix, "Mixed precision GpuILU0 is currently only supported when using split_matrix=true");
// initialize mixed precision datastructures
m_gpuMatrixReorderedLowerFloat = std::unique_ptr<FloatMat>(new auto(FloatMat(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_)));
m_gpuMatrixReorderedUpperFloat = std::unique_ptr<FloatMat>(new auto(FloatMat(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_)));
m_gpuMatrixReorderedLowerFloat = std::make_unique<FloatMat>(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_);
m_gpuMatrixReorderedUpperFloat = std::make_unique<FloatMat>(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_);
m_gpuMatrixReorderedDiagFloat.emplace(GpuVector<float>(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()));
}
@ -271,23 +272,42 @@ OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
const int numOfRowsInLevel = m_levelSets[level].size();
if (m_splitMatrix) {
detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
m_gpuMatrixReorderedDiag.value().data(),
m_storeFactorizationAsFloat ? m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data() : nullptr,
m_storeFactorizationAsFloat ? m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data() : nullptr,
m_storeFactorizationAsFloat ? m_gpuMatrixReorderedDiagFloat.value().data() : nullptr,
m_gpuReorderToNatural.data(),
m_gpuNaturalToReorder.data(),
levelStartIdx,
numOfRowsInLevel,
m_storeFactorizationAsFloat,
factorizationThreadBlockSize);
if (m_storeFactorizationAsFloat){
detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, true>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
m_gpuMatrixReorderedDiag.value().data(),
m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
m_gpuMatrixReorderedDiagFloat.value().data(),
m_gpuReorderToNatural.data(),
m_gpuNaturalToReorder.data(),
levelStartIdx,
numOfRowsInLevel,
factorizationThreadBlockSize);
}
else{
detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, false>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
m_gpuMatrixReorderedDiag.value().data(),
nullptr,
nullptr,
nullptr,
m_gpuReorderToNatural.data(),
m_gpuNaturalToReorder.data(),
levelStartIdx,
numOfRowsInLevel,
factorizationThreadBlockSize);
}
} else {
detail::ILU0::LUFactorization<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),

View File

@ -93,7 +93,7 @@ invBlockInPlace(T* __restrict__ block)
T t14 = block[2] * block[6];
T det = (t4 * block[8] - t6 * block[7] - t8 * block[8] + t10 * block[7] + t12 * block[5] - t14 * block[4]);
T t17 = T(T(1.0)) / det;
T t17 = T(1.0) / det;
T matrix01 = block[1];
T matrix00 = block[0];

View File

@ -120,7 +120,7 @@ namespace
}
}
template <int blocksize, class InputScalar, class OutputScalar>
template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix>
__global__ void cuLUFactorizationSplit(InputScalar* srcReorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
@ -134,7 +134,6 @@ namespace
int* reorderedToNatural,
int* naturalToReordered,
const int startIdx,
const bool copyResultToOtherMatrix,
int rowsInLevelSet)
{
auto reorderedIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
@ -442,7 +441,7 @@ LUFactorization(T* srcMatrix,
srcMatrix, srcRowIndices, srcColumnIndices, naturalToReordered, reorderedToNatual, rowsInLevelSet, startIdx);
}
template <int blocksize, class InputScalar, class OutputScalar>
template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix>
void
LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
int* lowerRowIndices,
@ -458,13 +457,12 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
int* naturalToReordered,
const int startIdx,
int rowsInLevelSet,
bool copyResultToOtherMatrix,
int thrBlockSize)
{
int threadBlockSize
= ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar>, thrBlockSize);
= ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar, copyResultToOtherMatrix>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar><<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar, copyResultToOtherMatrix><<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
lowerRowIndices,
lowerColIndices,
reorderedUpperMat,
@ -477,7 +475,6 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
reorderedToNatural,
naturalToReordered,
startIdx,
copyResultToOtherMatrix,
rowsInLevelSet);
}
@ -485,12 +482,14 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, T*, int); \
template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
template void LUFactorization<T, blocksize>(T*, int*, int*, int*, int*, size_t, int, int); \
template void LUFactorizationSplit<blocksize, T, float>( \
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, bool, int); \
template void LUFactorizationSplit<blocksize, T, double>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, bool, int);
// template void solveUpperLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
// template void solveLowerLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int);
template void LUFactorizationSplit<blocksize, T, float, true>( \
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, double, true>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, float, false>( \
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, double, false>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);
INSTANTIATE_KERNEL_WRAPPERS(float, 1);
INSTANTIATE_KERNEL_WRAPPERS(float, 2);

View File

@ -176,7 +176,7 @@ void LUFactorization(T* reorderedMat,
* function
* @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
*/
template <int blocksize, class InputScalar, class OutputScalar>
template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix>
void LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
@ -191,7 +191,6 @@ void LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
int* naturalToReordered,
int startIdx,
int rowsInLevelSet,
bool copyResultToOtherMatrix,
int threadBlockSize);
} // namespace Opm::gpuistl::detail::ILU0