mirror of
https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
implement PR feedback
This commit is contained in:
parent
54c2f2df30
commit
e82a9fa56c
@ -82,8 +82,8 @@ GpuSparseMatrix<T>::GpuSparseMatrix(const T* nonZeroElements,
|
||||
}
|
||||
|
||||
template <class T>
|
||||
GpuSparseMatrix<T>::GpuSparseMatrix(const GpuVector<int> rowIndices,
|
||||
const GpuVector<int> columnIndices,
|
||||
GpuSparseMatrix<T>::GpuSparseMatrix(const GpuVector<int>& rowIndices,
|
||||
const GpuVector<int>& columnIndices,
|
||||
size_t blockSize)
|
||||
: m_nonZeroElements(columnIndices.dim() * blockSize * blockSize)
|
||||
, m_columnIndices(columnIndices)
|
||||
|
@ -77,8 +77,8 @@ public:
|
||||
//!
|
||||
//! \note We assume numberOfNonzeroBlocks, blockSize and numberOfRows all are representable as int due to
|
||||
//! restrictions in the current version of cusparse. This might change in future versions.
|
||||
GpuSparseMatrix(const GpuVector<int> rowIndices,
|
||||
const GpuVector<int> columnIndices,
|
||||
GpuSparseMatrix(const GpuVector<int>& rowIndices,
|
||||
const GpuVector<int>& columnIndices,
|
||||
size_t blockSize);
|
||||
|
||||
/**
|
||||
|
@ -86,10 +86,11 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
|
||||
}
|
||||
|
||||
if (m_storeFactorizationAsFloat){
|
||||
assert(m_splitMatrix && "Mixed precision GpuILU0 is currently only supported when using split_matrix=true");
|
||||
OPM_ERROR_IF(!m_splitMatrix, "Mixed precision GpuILU0 is currently only supported when using split_matrix=true");
|
||||
|
||||
// initialize mixed precision datastructures
|
||||
m_gpuMatrixReorderedLowerFloat = std::unique_ptr<FloatMat>(new auto(FloatMat(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_)));
|
||||
m_gpuMatrixReorderedUpperFloat = std::unique_ptr<FloatMat>(new auto(FloatMat(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_)));
|
||||
m_gpuMatrixReorderedLowerFloat = std::make_unique<FloatMat>(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_);
|
||||
m_gpuMatrixReorderedUpperFloat = std::make_unique<FloatMat>(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_);
|
||||
m_gpuMatrixReorderedDiagFloat.emplace(GpuVector<float>(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()));
|
||||
}
|
||||
|
||||
@ -271,23 +272,42 @@ OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
|
||||
const int numOfRowsInLevel = m_levelSets[level].size();
|
||||
|
||||
if (m_splitMatrix) {
|
||||
detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float>(
|
||||
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
|
||||
m_gpuMatrixReorderedLower->getRowIndices().data(),
|
||||
m_gpuMatrixReorderedLower->getColumnIndices().data(),
|
||||
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
|
||||
m_gpuMatrixReorderedUpper->getRowIndices().data(),
|
||||
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
|
||||
m_gpuMatrixReorderedDiag.value().data(),
|
||||
m_storeFactorizationAsFloat ? m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data() : nullptr,
|
||||
m_storeFactorizationAsFloat ? m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data() : nullptr,
|
||||
m_storeFactorizationAsFloat ? m_gpuMatrixReorderedDiagFloat.value().data() : nullptr,
|
||||
m_gpuReorderToNatural.data(),
|
||||
m_gpuNaturalToReorder.data(),
|
||||
levelStartIdx,
|
||||
numOfRowsInLevel,
|
||||
m_storeFactorizationAsFloat,
|
||||
factorizationThreadBlockSize);
|
||||
if (m_storeFactorizationAsFloat){
|
||||
detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, true>(
|
||||
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
|
||||
m_gpuMatrixReorderedLower->getRowIndices().data(),
|
||||
m_gpuMatrixReorderedLower->getColumnIndices().data(),
|
||||
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
|
||||
m_gpuMatrixReorderedUpper->getRowIndices().data(),
|
||||
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
|
||||
m_gpuMatrixReorderedDiag.value().data(),
|
||||
m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
|
||||
m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
|
||||
m_gpuMatrixReorderedDiagFloat.value().data(),
|
||||
m_gpuReorderToNatural.data(),
|
||||
m_gpuNaturalToReorder.data(),
|
||||
levelStartIdx,
|
||||
numOfRowsInLevel,
|
||||
factorizationThreadBlockSize);
|
||||
}
|
||||
else{
|
||||
detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, false>(
|
||||
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
|
||||
m_gpuMatrixReorderedLower->getRowIndices().data(),
|
||||
m_gpuMatrixReorderedLower->getColumnIndices().data(),
|
||||
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
|
||||
m_gpuMatrixReorderedUpper->getRowIndices().data(),
|
||||
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
|
||||
m_gpuMatrixReorderedDiag.value().data(),
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
m_gpuReorderToNatural.data(),
|
||||
m_gpuNaturalToReorder.data(),
|
||||
levelStartIdx,
|
||||
numOfRowsInLevel,
|
||||
factorizationThreadBlockSize);
|
||||
}
|
||||
|
||||
} else {
|
||||
detail::ILU0::LUFactorization<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
|
||||
|
@ -93,7 +93,7 @@ invBlockInPlace(T* __restrict__ block)
|
||||
T t14 = block[2] * block[6];
|
||||
|
||||
T det = (t4 * block[8] - t6 * block[7] - t8 * block[8] + t10 * block[7] + t12 * block[5] - t14 * block[4]);
|
||||
T t17 = T(T(1.0)) / det;
|
||||
T t17 = T(1.0) / det;
|
||||
|
||||
T matrix01 = block[1];
|
||||
T matrix00 = block[0];
|
||||
|
@ -120,7 +120,7 @@ namespace
|
||||
}
|
||||
}
|
||||
|
||||
template <int blocksize, class InputScalar, class OutputScalar>
|
||||
template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix>
|
||||
__global__ void cuLUFactorizationSplit(InputScalar* srcReorderedLowerMat,
|
||||
int* lowerRowIndices,
|
||||
int* lowerColIndices,
|
||||
@ -134,7 +134,6 @@ namespace
|
||||
int* reorderedToNatural,
|
||||
int* naturalToReordered,
|
||||
const int startIdx,
|
||||
const bool copyResultToOtherMatrix,
|
||||
int rowsInLevelSet)
|
||||
{
|
||||
auto reorderedIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
|
||||
@ -442,7 +441,7 @@ LUFactorization(T* srcMatrix,
|
||||
srcMatrix, srcRowIndices, srcColumnIndices, naturalToReordered, reorderedToNatual, rowsInLevelSet, startIdx);
|
||||
}
|
||||
|
||||
template <int blocksize, class InputScalar, class OutputScalar>
|
||||
template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix>
|
||||
void
|
||||
LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
|
||||
int* lowerRowIndices,
|
||||
@ -458,13 +457,12 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
|
||||
int* naturalToReordered,
|
||||
const int startIdx,
|
||||
int rowsInLevelSet,
|
||||
bool copyResultToOtherMatrix,
|
||||
int thrBlockSize)
|
||||
{
|
||||
int threadBlockSize
|
||||
= ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar>, thrBlockSize);
|
||||
= ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar, copyResultToOtherMatrix>, thrBlockSize);
|
||||
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
|
||||
cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar><<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
|
||||
cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar, copyResultToOtherMatrix><<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
|
||||
lowerRowIndices,
|
||||
lowerColIndices,
|
||||
reorderedUpperMat,
|
||||
@ -477,7 +475,6 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
|
||||
reorderedToNatural,
|
||||
naturalToReordered,
|
||||
startIdx,
|
||||
copyResultToOtherMatrix,
|
||||
rowsInLevelSet);
|
||||
}
|
||||
|
||||
@ -485,12 +482,14 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
|
||||
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, T*, int); \
|
||||
template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
|
||||
template void LUFactorization<T, blocksize>(T*, int*, int*, int*, int*, size_t, int, int); \
|
||||
template void LUFactorizationSplit<blocksize, T, float>( \
|
||||
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, bool, int); \
|
||||
template void LUFactorizationSplit<blocksize, T, double>( \
|
||||
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, bool, int);
|
||||
// template void solveUpperLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
|
||||
// template void solveLowerLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int);
|
||||
template void LUFactorizationSplit<blocksize, T, float, true>( \
|
||||
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
|
||||
template void LUFactorizationSplit<blocksize, T, double, true>( \
|
||||
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
|
||||
template void LUFactorizationSplit<blocksize, T, float, false>( \
|
||||
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
|
||||
template void LUFactorizationSplit<blocksize, T, double, false>( \
|
||||
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);
|
||||
|
||||
INSTANTIATE_KERNEL_WRAPPERS(float, 1);
|
||||
INSTANTIATE_KERNEL_WRAPPERS(float, 2);
|
||||
|
@ -176,7 +176,7 @@ void LUFactorization(T* reorderedMat,
|
||||
* function
|
||||
* @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
|
||||
*/
|
||||
template <int blocksize, class InputScalar, class OutputScalar>
|
||||
template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix>
|
||||
void LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
|
||||
int* lowerRowIndices,
|
||||
int* lowerColIndices,
|
||||
@ -191,7 +191,6 @@ void LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
|
||||
int* naturalToReordered,
|
||||
int startIdx,
|
||||
int rowsInLevelSet,
|
||||
bool copyResultToOtherMatrix,
|
||||
int threadBlockSize);
|
||||
|
||||
} // namespace Opm::gpuistl::detail::ILU0
|
||||
|
Loading…
Reference in New Issue
Block a user