Implement PR feedback

This commit is contained in:
Tobias Meyer Andersen 2024-10-02 14:57:34 +02:00
parent 54c2f2df30
commit e82a9fa56c
6 changed files with 58 additions and 40 deletions

View File

@ -82,8 +82,8 @@ GpuSparseMatrix<T>::GpuSparseMatrix(const T* nonZeroElements,
}
template <class T>
GpuSparseMatrix<T>::GpuSparseMatrix(const GpuVector<int> rowIndices,
const GpuVector<int> columnIndices,
GpuSparseMatrix<T>::GpuSparseMatrix(const GpuVector<int>& rowIndices,
const GpuVector<int>& columnIndices,
size_t blockSize)
: m_nonZeroElements(columnIndices.dim() * blockSize * blockSize)
, m_columnIndices(columnIndices)

View File

@ -77,8 +77,8 @@ public:
//!
//! \note We assume numberOfNonzeroBlocks, blockSize and numberOfRows all are representable as int due to
//! restrictions in the current version of cusparse. This might change in future versions.
GpuSparseMatrix(const GpuVector<int> rowIndices,
const GpuVector<int> columnIndices,
GpuSparseMatrix(const GpuVector<int>& rowIndices,
const GpuVector<int>& columnIndices,
size_t blockSize);
/**

View File

@ -86,10 +86,11 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
}
if (m_storeFactorizationAsFloat){
assert(m_splitMatrix && "Mixed precision GpuILU0 is currently only supported when using split_matrix=true");
OPM_ERROR_IF(!m_splitMatrix, "Mixed precision GpuILU0 is currently only supported when using split_matrix=true");
// initialize mixed precision datastructures
m_gpuMatrixReorderedLowerFloat = std::unique_ptr<FloatMat>(new auto(FloatMat(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_)));
m_gpuMatrixReorderedUpperFloat = std::unique_ptr<FloatMat>(new auto(FloatMat(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_)));
m_gpuMatrixReorderedLowerFloat = std::make_unique<FloatMat>(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_);
m_gpuMatrixReorderedUpperFloat = std::make_unique<FloatMat>(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_);
m_gpuMatrixReorderedDiagFloat.emplace(GpuVector<float>(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()));
}
@ -271,23 +272,42 @@ OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
const int numOfRowsInLevel = m_levelSets[level].size();
if (m_splitMatrix) {
detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
m_gpuMatrixReorderedDiag.value().data(),
m_storeFactorizationAsFloat ? m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data() : nullptr,
m_storeFactorizationAsFloat ? m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data() : nullptr,
m_storeFactorizationAsFloat ? m_gpuMatrixReorderedDiagFloat.value().data() : nullptr,
m_gpuReorderToNatural.data(),
m_gpuNaturalToReorder.data(),
levelStartIdx,
numOfRowsInLevel,
m_storeFactorizationAsFloat,
factorizationThreadBlockSize);
if (m_storeFactorizationAsFloat){
detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, true>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
m_gpuMatrixReorderedDiag.value().data(),
m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
m_gpuMatrixReorderedDiagFloat.value().data(),
m_gpuReorderToNatural.data(),
m_gpuNaturalToReorder.data(),
levelStartIdx,
numOfRowsInLevel,
factorizationThreadBlockSize);
}
else{
detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, false>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
m_gpuMatrixReorderedDiag.value().data(),
nullptr,
nullptr,
nullptr,
m_gpuReorderToNatural.data(),
m_gpuNaturalToReorder.data(),
levelStartIdx,
numOfRowsInLevel,
factorizationThreadBlockSize);
}
} else {
detail::ILU0::LUFactorization<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),

View File

@ -93,7 +93,7 @@ invBlockInPlace(T* __restrict__ block)
T t14 = block[2] * block[6];
T det = (t4 * block[8] - t6 * block[7] - t8 * block[8] + t10 * block[7] + t12 * block[5] - t14 * block[4]);
T t17 = T(T(1.0)) / det;
T t17 = T(1.0) / det;
T matrix01 = block[1];
T matrix00 = block[0];

View File

@ -120,7 +120,7 @@ namespace
}
}
template <int blocksize, class InputScalar, class OutputScalar>
template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix>
__global__ void cuLUFactorizationSplit(InputScalar* srcReorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
@ -134,7 +134,6 @@ namespace
int* reorderedToNatural,
int* naturalToReordered,
const int startIdx,
const bool copyResultToOtherMatrix,
int rowsInLevelSet)
{
auto reorderedIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
@ -442,7 +441,7 @@ LUFactorization(T* srcMatrix,
srcMatrix, srcRowIndices, srcColumnIndices, naturalToReordered, reorderedToNatual, rowsInLevelSet, startIdx);
}
template <int blocksize, class InputScalar, class OutputScalar>
template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix>
void
LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
int* lowerRowIndices,
@ -458,13 +457,12 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
int* naturalToReordered,
const int startIdx,
int rowsInLevelSet,
bool copyResultToOtherMatrix,
int thrBlockSize)
{
int threadBlockSize
= ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar>, thrBlockSize);
= ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar, copyResultToOtherMatrix>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar><<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar, copyResultToOtherMatrix><<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
lowerRowIndices,
lowerColIndices,
reorderedUpperMat,
@ -477,7 +475,6 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
reorderedToNatural,
naturalToReordered,
startIdx,
copyResultToOtherMatrix,
rowsInLevelSet);
}
@ -485,12 +482,14 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, T*, int); \
template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
template void LUFactorization<T, blocksize>(T*, int*, int*, int*, int*, size_t, int, int); \
template void LUFactorizationSplit<blocksize, T, float>( \
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, bool, int); \
template void LUFactorizationSplit<blocksize, T, double>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, bool, int);
// template void solveUpperLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
// template void solveLowerLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int);
template void LUFactorizationSplit<blocksize, T, float, true>( \
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, double, true>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, float, false>( \
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, double, false>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);
INSTANTIATE_KERNEL_WRAPPERS(float, 1);
INSTANTIATE_KERNEL_WRAPPERS(float, 2);

View File

@ -176,7 +176,7 @@ void LUFactorization(T* reorderedMat,
* function
* @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
*/
template <int blocksize, class InputScalar, class OutputScalar>
template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix>
void LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
@ -191,7 +191,6 @@ void LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
int* naturalToReordered,
int startIdx,
int rowsInLevelSet,
bool copyResultToOtherMatrix,
int threadBlockSize);
} // namespace Opm::gpuistl::detail::ILU0