use getCudaRecomendedMaxPotentialBlocksize

2025-02-25 18:55:30 -06:00 · 2024-06-18 11:34:31 +02:00
parent 9b2f41ad96
commit 2b9c81fe09
2 changed files with 27 additions and 8 deletions
--- a/opm/simulators/linalg/cuistl/CuDILU.cpp
+++ b/opm/simulators/linalg/cuistl/CuDILU.cpp
@@ -30,10 +30,6 @@
 #include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
 #include <opm/simulators/linalg/matrixblock.hh>
 #include <vector>
-#include <chrono>
-#include <iostream>
-#include <string>
-#include <map>

 namespace
 {
--- a/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.cu
+++ b/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.cu
@@ -19,6 +19,7 @@
 #include <opm/common/ErrorMacros.hpp>
 #include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
 #include <stdexcept>
+
 namespace Opm::cuistl::detail
 {
 namespace
@@ -507,6 +508,20 @@ namespace
        const auto threads = getThreads(numberOfRows);
        return (numberOfRows + threads - 1) / threads;
    }
+
+    // Kernel here is the function object of the cuda kernel
+    template <class Kernel>
+    inline int getCudaRecomendedThreadBlockSize(Kernel k){
+        int blockSize;
+        int tmpGridSize;
+        cudaOccupancyMaxPotentialBlockSize( &tmpGridSize, &blockSize, k, 0, 0);
+        return blockSize;
+    }
+
+    inline int getNumberOfBlocks(int wantedThreads, int threadBlockSize){
+        return (wantedThreads + threadBlockSize - 1) / threadBlockSize;
+    }
+
 } // namespace

 template <class T, int blocksize>
@@ -551,7 +566,9 @@ computeLowerSolveLevelSetSplit(T* reorderedMat,
                          const T* d,
                          T* v)
 {
-    cuComputeLowerSolveLevelSetSplit<T, blocksize><<<getBlocks(rowsInLevelSet), getThreads(rowsInLevelSet)>>>(
+    int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit<T, blocksize>);
+    int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
+    cuComputeLowerSolveLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
 }
 // perform the upper solve for all rows in the same level set
@@ -581,7 +598,9 @@ computeUpperSolveLevelSetSplit(T* reorderedMat,
                          const T* dInv,
                          T* v)
 {
-    cuComputeUpperSolveLevelSetSplit<T, blocksize><<<getBlocks(rowsInLevelSet), getThreads(rowsInLevelSet)>>>(
+    int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit<T, blocksize>);
+    int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
+    cuComputeUpperSolveLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }

@@ -627,8 +646,10 @@ computeDiluDiagonalSplit(T* reorderedLowerMat,
                    T* dInv)
 {
    if (blocksize <= 3) {
+        int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit<T, blocksize>);
+        int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
        cuComputeDiluDiagonalSplit<T, blocksize>
-            <<<getBlocks(rowsInLevelSet), getThreads(rowsInLevelSet)>>>(reorderedLowerMat,
+            <<<nThreadBlocks, threadBlockSize>>>(reorderedLowerMat,
                    lowerRowIndices,
                    lowerColIndices,
                    reorderedUpperMat,
@@ -659,7 +680,9 @@ void
 copyMatDataToReorderedSplit(
    T* srcMatrix, int* srcRowIndices, int* srcColumnIndices, T* dstLowerMatrix, int* dstLowerRowIndices, T* dstUpperMatrix, int* dstUpperRowIndices, T* dstDiag, int* naturalToReordered, size_t numberOfRows)
 {
-    cuMoveDataToReorderedSplit<T, blocksize><<<getBlocks(numberOfRows), getThreads(numberOfRows)>>>(
+    int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit<T, blocksize>);
+    int nThreadBlocks = getNumberOfBlocks(numberOfRows, threadBlockSize);
+    cuMoveDataToReorderedSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        srcMatrix, srcRowIndices, srcColumnIndices, dstLowerMatrix, dstLowerRowIndices, dstUpperMatrix, dstUpperRowIndices, dstDiag, naturalToReordered, numberOfRows);
 }