Merge pull request #5451 from multitalentloes/generalize_thread_block_tuner

Generalize thread block tuner
Kjetil Olsen Lye 2024-08-22 12:56:14 +02:00 committed by GitHub
commit c4f686227b
6 changed files with 398 additions and 306 deletions
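In short: the hand-rolled timing loops in CuDILU and OpmCuILU0 are replaced by a shared helper, detail::tuneThreadBlockSize, and each tunable kernel parameter is tuned by passing the helper a lambda that varies only that block size. A hedged sketch of the resulting calling convention (the class and names below are hypothetical; only detail::tuneThreadBlockSize is introduced by this PR):

#include <opm/simulators/linalg/cuistl/detail/autotuner.hpp>

// Hypothetical preconditioner-like class illustrating the new calling convention.
class ExamplePrec
{
public:
    // In a real preconditioner this would launch GPU kernels with the given block sizes.
    void update(int moveThreadBlockSize, int factorizationThreadBlockSize) { /* kernel launches */ }

    void tuneThreadBlockSizes()
    {
        // Tune one parameter at a time, keeping the other at its current value.
        auto tuneMove = [this](int moveThreadBlockSize) {
            this->update(moveThreadBlockSize, m_factorizationThreadBlockSize);
        };
        m_moveThreadBlockSize = Opm::cuistl::detail::tuneThreadBlockSize(tuneMove, "Example: move kernel");

        auto tuneFactorization = [this](int factorizationThreadBlockSize) {
            this->update(m_moveThreadBlockSize, factorizationThreadBlockSize);
        };
        m_factorizationThreadBlockSize
            = Opm::cuistl::detail::tuneThreadBlockSize(tuneFactorization, "Example: factorization kernel");
    }

private:
    int m_moveThreadBlockSize = -1;
    int m_factorizationThreadBlockSize = -1;
};

The helper itself (added as autotuner.hpp at the end of this diff) times each candidate with CUDA events and returns the fastest block size among multiples of 64 up to 1024.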

View File

@@ -224,6 +224,7 @@ if (HAVE_CUDA)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg set_device.cpp)
# HEADERS
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/autotuner.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/coloringAndReorderingUtils.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cuda_safe_call.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cusparse_matrix_operations.hpp)

View File

@@ -25,6 +25,7 @@
#include <opm/common/ErrorMacros.hpp>
#include <opm/common/TimingMacros.hpp>
#include <opm/simulators/linalg/GraphColoring.hpp>
#include <opm/simulators/linalg/cuistl/detail/autotuner.hpp>
#include <opm/simulators/linalg/cuistl/CuDILU.hpp>
#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
@@ -33,6 +34,9 @@
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/DILUKernels.hpp>
#include <opm/simulators/linalg/matrixblock.hh>
#include <tuple>
#include <functional>
#include <utility>
#include <string>
namespace Opm::cuistl
{
@@ -71,18 +75,16 @@ CuDILU<M, X, Y, l>::CuDILU(const M& A, bool splitMatrix, bool tuneKernels)
std::tie(m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper)
= detail::extractLowerAndUpperMatrices<M, field_type, CuSparseMatrix<field_type>>(m_cpuMatrix,
m_reorderedToNatural);
}
else {
m_gpuMatrixReordered = detail::createReorderedMatrix<M, field_type, CuSparseMatrix<field_type>>(
m_cpuMatrix, m_reorderedToNatural);
}
computeDiagAndMoveReorderedData();
computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
// HIP does not currently support automatically picking thread block sizes as well as CUDA does,
// so we only do our own manual tuning when tuning is requested and HIP is in use
#ifdef USE_HIP
if (m_tuneThreadBlockSizes) {
tuneThreadBlockSizes();
}
#endif
}
template <class M, class X, class Y, int l>
@@ -97,69 +99,77 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
{
OPM_TIMEBLOCK(prec_apply);
{
int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
if (m_splitMatrix) {
detail::DILU::solveLowerLevelSetSplit<field_type, blocksize_>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuDInv.data(),
d.data(),
v.data(),
m_applyThreadBlockSize);
} else {
detail::DILU::solveLowerLevelSet<field_type, blocksize_>(
m_gpuMatrixReordered->getNonZeroValues().data(),
m_gpuMatrixReordered->getRowIndices().data(),
m_gpuMatrixReordered->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuDInv.data(),
d.data(),
v.data(),
m_applyThreadBlockSize);
}
levelStartIdx += numOfRowsInLevel;
}
apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
}
}
levelStartIdx = m_cpuMatrix.N();
// upper triangular solve: (D + U_A) v = Dy
for (int level = m_levelSets.size() - 1; level >= 0; --level) {
const int numOfRowsInLevel = m_levelSets[level].size();
levelStartIdx -= numOfRowsInLevel;
if (m_splitMatrix) {
detail::DILU::solveUpperLevelSetSplit<field_type, blocksize_>(
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuDInv.data(),
v.data(),
m_applyThreadBlockSize);
} else {
detail::DILU::solveUpperLevelSet<field_type, blocksize_>(
m_gpuMatrixReordered->getNonZeroValues().data(),
m_gpuMatrixReordered->getRowIndices().data(),
m_gpuMatrixReordered->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuDInv.data(),
v.data(),
m_applyThreadBlockSize);
}
template <class M, class X, class Y, int l>
void
CuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int upperSolveThreadBlockSize)
{
int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
if (m_splitMatrix) {
detail::DILU::solveLowerLevelSetSplit<field_type, blocksize_>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuDInv.data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
} else {
detail::DILU::solveLowerLevelSet<field_type, blocksize_>(
m_gpuMatrixReordered->getNonZeroValues().data(),
m_gpuMatrixReordered->getRowIndices().data(),
m_gpuMatrixReordered->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuDInv.data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
}
levelStartIdx += numOfRowsInLevel;
}
levelStartIdx = m_cpuMatrix.N();
// upper triangular solve: (D + U_A) v = Dy
for (int level = m_levelSets.size() - 1; level >= 0; --level) {
const int numOfRowsInLevel = m_levelSets[level].size();
levelStartIdx -= numOfRowsInLevel;
if (m_splitMatrix) {
detail::DILU::solveUpperLevelSetSplit<field_type, blocksize_>(
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuDInv.data(),
v.data(),
upperSolveThreadBlockSize);
} else {
detail::DILU::solveUpperLevelSet<field_type, blocksize_>(
m_gpuMatrixReordered->getNonZeroValues().data(),
m_gpuMatrixReordered->getRowIndices().data(),
m_gpuMatrixReordered->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuDInv.data(),
v.data(),
upperSolveThreadBlockSize);
}
}
}
template <class M, class X, class Y, int l>
void
CuDILU<M, X, Y, l>::post([[maybe_unused]] X& x)
@@ -179,72 +189,76 @@ CuDILU<M, X, Y, l>::update()
{
OPM_TIMEBLOCK(prec_update);
{
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
computeDiagAndMoveReorderedData();
update(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
}
}
template <class M, class X, class Y, int l>
void
CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData()
CuDILU<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationBlockSize)
{
OPM_TIMEBLOCK(prec_update);
{
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
computeDiagAndMoveReorderedData(moveThreadBlockSize, factorizationBlockSize);
}
template <class M, class X, class Y, int l>
void
CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationBlockSize)
{
if (m_splitMatrix) {
detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
m_gpuMatrix.getNonZeroValues().data(),
m_gpuMatrix.getRowIndices().data(),
m_gpuMatrix.getColumnIndices().data(),
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedDiag->data(),
m_gpuNaturalToReorder.data(),
m_gpuMatrixReorderedLower->N(),
moveThreadBlockSize);
} else {
detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(),
m_gpuMatrix.getRowIndices().data(),
m_gpuMatrixReordered->getNonZeroValues().data(),
m_gpuMatrixReordered->getRowIndices().data(),
m_gpuNaturalToReorder.data(),
m_gpuMatrixReordered->N(),
moveThreadBlockSize);
}
int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
if (m_splitMatrix) {
detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
m_gpuMatrix.getNonZeroValues().data(),
m_gpuMatrix.getRowIndices().data(),
m_gpuMatrix.getColumnIndices().data(),
detail::DILU::computeDiluDiagonalSplit<field_type, blocksize_>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
m_gpuMatrixReorderedDiag->data(),
m_gpuReorderToNatural.data(),
m_gpuNaturalToReorder.data(),
m_gpuMatrixReorderedLower->N(),
m_updateThreadBlockSize);
levelStartIdx,
numOfRowsInLevel,
m_gpuDInv.data(),
factorizationBlockSize);
} else {
detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(),
m_gpuMatrix.getRowIndices().data(),
m_gpuMatrixReordered->getNonZeroValues().data(),
m_gpuMatrixReordered->getRowIndices().data(),
m_gpuNaturalToReorder.data(),
m_gpuMatrixReordered->N(),
m_updateThreadBlockSize);
}
int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
if (m_splitMatrix) {
detail::DILU::computeDiluDiagonalSplit<field_type, blocksize_>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
m_gpuMatrixReorderedDiag->data(),
m_gpuReorderToNatural.data(),
m_gpuNaturalToReorder.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuDInv.data(),
m_updateThreadBlockSize);
} else {
detail::DILU::computeDiluDiagonal<field_type, blocksize_>(
m_gpuMatrixReordered->getNonZeroValues().data(),
m_gpuMatrixReordered->getRowIndices().data(),
m_gpuMatrixReordered->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
m_gpuNaturalToReorder.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuDInv.data(),
m_updateThreadBlockSize);
}
levelStartIdx += numOfRowsInLevel;
detail::DILU::computeDiluDiagonal<field_type, blocksize_>(
m_gpuMatrixReordered->getNonZeroValues().data(),
m_gpuMatrixReordered->getRowIndices().data(),
m_gpuMatrixReordered->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
m_gpuNaturalToReorder.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuDInv.data(),
factorizationBlockSize);
}
levelStartIdx += numOfRowsInLevel;
}
}
@@ -252,54 +266,31 @@ template <class M, class X, class Y, int l>
void
CuDILU<M, X, Y, l>::tuneThreadBlockSizes()
{
// TODO: generalize this code and put it somewhere outside of this class
long long bestApplyTime = std::numeric_limits<long long>::max();
long long bestUpdateTime = std::numeric_limits<long long>::max();
int bestApplyBlockSize = -1;
int bestUpdateBlockSize = -1;
int interval = 64;
// tune the thread-block size of the update function
auto tuneMoveThreadBlockSizeInUpdate = [this](int moveThreadBlockSize){
this->update(moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
};
m_moveThreadBlockSize = detail::tuneThreadBlockSize(tuneMoveThreadBlockSizeInUpdate, "Kernel moving data to reordered matrix");
// temporary buffers for the apply
auto tuneFactorizationThreadBlockSizeInUpdate = [this](int factorizationThreadBlockSize){
this->update(m_moveThreadBlockSize, factorizationThreadBlockSize);
};
m_DILUFactorizationThreadBlockSize = detail::tuneThreadBlockSize(tuneFactorizationThreadBlockSizeInUpdate, "Kernel computing DILU factorization");
// tune the thread-block size of the apply
CuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
CuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
tmpD = 1;
for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval) {
// sometimes the first kernel launch can be slower, so take the time twice
for (int i = 0; i < 2; ++i) {
auto tuneLowerSolveThreadBlockSizeInApply = [this, &tmpV, &tmpD](int lowerSolveThreadBlockSize){
this->apply(tmpV, tmpD, lowerSolveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
};
m_lowerSolveThreadBlockSize = detail::tuneThreadBlockSize(tuneLowerSolveThreadBlockSizeInApply, "Kernel computing a lower triangular solve for a level set");
auto beforeUpdate = std::chrono::high_resolution_clock::now();
m_updateThreadBlockSize = thrBlockSize;
update();
std::ignore = cudaDeviceSynchronize();
auto afterUpdate = std::chrono::high_resolution_clock::now();
if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid
long long durationInMicroSec
= std::chrono::duration_cast<std::chrono::microseconds>(afterUpdate - beforeUpdate).count();
if (durationInMicroSec < bestUpdateTime) {
bestUpdateTime = durationInMicroSec;
bestUpdateBlockSize = thrBlockSize;
}
}
auto beforeApply = std::chrono::high_resolution_clock::now();
m_applyThreadBlockSize = thrBlockSize;
apply(tmpV, tmpD);
std::ignore = cudaDeviceSynchronize();
auto afterApply = std::chrono::high_resolution_clock::now();
if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid
long long durationInMicroSec
= std::chrono::duration_cast<std::chrono::microseconds>(afterApply - beforeApply).count();
if (durationInMicroSec < bestApplyTime) {
bestApplyTime = durationInMicroSec;
bestApplyBlockSize = thrBlockSize;
}
}
}
}
m_applyThreadBlockSize = bestApplyBlockSize;
m_updateThreadBlockSize = bestUpdateBlockSize;
auto tuneUpperSolveThreadBlockSizeInApply = [this, &tmpV, &tmpD](int upperSolveThreadBlockSize){
this->apply(tmpV, tmpD, m_moveThreadBlockSize, upperSolveThreadBlockSize);
};
m_upperSolveThreadBlockSize = detail::tuneThreadBlockSize(tuneUpperSolveThreadBlockSizeInApply, "Kernel computing an upper triangular solve for a level set");
}
} // namespace Opm::cuistl

View File

@@ -80,7 +80,7 @@ public:
void update() final;
//! \brief Compute the diagonal of the DILU, and update the data of the reordered matrix
void computeDiagAndMoveReorderedData();
void computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationThreadBlockSize);
//! \brief Function that will experimentally tune the thread block sizes of the important CUDA kernels
void tuneThreadBlockSizes();
@@ -104,6 +104,10 @@ public:
private:
//! \brief Apply the preconditioner.
void apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int upperSolveThreadBlockSize);
//! \brief Updates the matrix data.
void update(int moveThreadBlockSize, int factorizationThreadBlockSize);
//! \brief Reference to the underlying matrix
const M& m_cpuMatrix;
//! \brief size_t describing the dimensions of the square block elements
@@ -135,8 +139,10 @@ private:
bool m_tuneThreadBlockSizes;
//! \brief Variables storing the thread block sizes to use when tuning is enabled and we are running on AMD cards.
//! The default value of -1 indicates that we have not yet calibrated and selected a value.
int m_applyThreadBlockSize = -1;
int m_updateThreadBlockSize = -1;
int m_upperSolveThreadBlockSize = -1;
int m_lowerSolveThreadBlockSize = -1;
int m_moveThreadBlockSize = -1;
int m_DILUFactorizationThreadBlockSize = -1;
};
} // end namespace Opm::cuistl

View File

@@ -21,6 +21,7 @@
#include <dune/common/fmatrix.hh>
#include <dune/istl/bcrsmatrix.hh>
#include <fmt/core.h>
#include <functional>
#include <limits>
#include <opm/common/ErrorMacros.hpp>
#include <opm/common/TimingMacros.hpp>
@@ -28,11 +29,14 @@
#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/OpmCuILU0.hpp>
#include <opm/simulators/linalg/cuistl/detail/autotuner.hpp>
#include <opm/simulators/linalg/cuistl/detail/coloringAndReorderingUtils.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/ILU0Kernels.hpp>
#include <opm/simulators/linalg/matrixblock.hh>
#include <string>
#include <tuple>
#include <utility>
namespace Opm::cuistl
{
@@ -76,13 +80,11 @@ OpmCuILU0<M, X, Y, l>::OpmCuILU0(const M& A, bool splitMatrix, bool tuneKernels)
m_gpuReorderedLU = detail::createReorderedMatrix<M, field_type, CuSparseMatrix<field_type>>(
m_cpuMatrix, m_reorderedToNatural);
}
LUFactorizeAndMoveData();
LUFactorizeAndMoveData(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
#ifdef USE_HIP
if (m_tuneThreadBlockSizes) {
tuneThreadBlockSizes();
}
#endif
}
template <class M, class X, class Y, int l>
@@ -97,59 +99,66 @@ OpmCuILU0<M, X, Y, l>::apply(X& v, const Y& d)
{
OPM_TIMEBLOCK(prec_apply);
{
int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
if (m_splitMatrix) {
detail::ILU0::solveLowerLevelSetSplit<field_type, blocksize_>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
d.data(),
v.data(),
m_applyThreadBlockSize);
} else {
detail::ILU0::solveLowerLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
m_gpuReorderedLU->getRowIndices().data(),
m_gpuReorderedLU->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
d.data(),
v.data(),
m_applyThreadBlockSize);
}
levelStartIdx += numOfRowsInLevel;
}
apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
}
}
levelStartIdx = m_cpuMatrix.N();
for (int level = m_levelSets.size() - 1; level >= 0; --level) {
const int numOfRowsInLevel = m_levelSets[level].size();
levelStartIdx -= numOfRowsInLevel;
if (m_splitMatrix) {
detail::ILU0::solveUpperLevelSetSplit<field_type, blocksize_>(
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuMatrixReorderedDiag.value().data(),
v.data(),
m_applyThreadBlockSize);
} else {
detail::ILU0::solveUpperLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
m_gpuReorderedLU->getRowIndices().data(),
m_gpuReorderedLU->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
v.data(),
m_applyThreadBlockSize);
}
template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int upperSolveThreadBlockSize)
{
int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
if (m_splitMatrix) {
detail::ILU0::solveLowerLevelSetSplit<field_type, blocksize_>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
d.data(),
v.data(),
lowerSolveThreadBlockSize);
} else {
detail::ILU0::solveLowerLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
m_gpuReorderedLU->getRowIndices().data(),
m_gpuReorderedLU->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
d.data(),
v.data(),
lowerSolveThreadBlockSize);
}
levelStartIdx += numOfRowsInLevel;
}
levelStartIdx = m_cpuMatrix.N();
for (int level = m_levelSets.size() - 1; level >= 0; --level) {
const int numOfRowsInLevel = m_levelSets[level].size();
levelStartIdx -= numOfRowsInLevel;
if (m_splitMatrix) {
detail::ILU0::solveUpperLevelSetSplit<field_type, blocksize_>(
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuMatrixReorderedDiag.value().data(),
v.data(),
upperSolveThreadBlockSize);
} else {
detail::ILU0::solveUpperLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
m_gpuReorderedLU->getRowIndices().data(),
m_gpuReorderedLU->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
v.data(),
upperSolveThreadBlockSize);
}
}
}
@@ -173,70 +182,76 @@ OpmCuILU0<M, X, Y, l>::update()
{
OPM_TIMEBLOCK(prec_update);
{
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
LUFactorizeAndMoveData();
update(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
}
}
template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::LUFactorizeAndMoveData()
OpmCuILU0<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationThreadBlockSize)
{
OPM_TIMEBLOCK(prec_update);
{
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
LUFactorizeAndMoveData(moveThreadBlockSize, factorizationThreadBlockSize);
}
}
template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize)
{
if (m_splitMatrix) {
detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
m_gpuMatrix.getNonZeroValues().data(),
m_gpuMatrix.getRowIndices().data(),
m_gpuMatrix.getColumnIndices().data(),
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedDiag.value().data(),
m_gpuNaturalToReorder.data(),
m_gpuMatrixReorderedLower->N(),
moveThreadBlockSize);
} else {
detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(),
m_gpuMatrix.getRowIndices().data(),
m_gpuReorderedLU->getNonZeroValues().data(),
m_gpuReorderedLU->getRowIndices().data(),
m_gpuNaturalToReorder.data(),
m_gpuReorderedLU->N(),
moveThreadBlockSize);
}
int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
if (m_splitMatrix) {
detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
m_gpuMatrix.getNonZeroValues().data(),
m_gpuMatrix.getRowIndices().data(),
m_gpuMatrix.getColumnIndices().data(),
detail::ILU0::LUFactorizationSplit<field_type, blocksize_>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
m_gpuMatrixReorderedDiag.value().data(),
m_gpuReorderToNatural.data(),
m_gpuNaturalToReorder.data(),
m_gpuMatrixReorderedLower->N(),
m_updateThreadBlockSize);
levelStartIdx,
numOfRowsInLevel,
factorizationThreadBlockSize);
} else {
detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(),
m_gpuMatrix.getRowIndices().data(),
m_gpuReorderedLU->getNonZeroValues().data(),
m_gpuReorderedLU->getRowIndices().data(),
m_gpuNaturalToReorder.data(),
m_gpuReorderedLU->N(),
m_updateThreadBlockSize);
}
int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
if (m_splitMatrix) {
detail::ILU0::LUFactorizationSplit<field_type, blocksize_>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
m_gpuMatrixReorderedDiag.value().data(),
m_gpuReorderToNatural.data(),
m_gpuNaturalToReorder.data(),
levelStartIdx,
numOfRowsInLevel,
m_updateThreadBlockSize);
} else {
detail::ILU0::LUFactorization<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
m_gpuReorderedLU->getRowIndices().data(),
m_gpuReorderedLU->getColumnIndices().data(),
m_gpuNaturalToReorder.data(),
m_gpuReorderToNatural.data(),
numOfRowsInLevel,
levelStartIdx,
m_updateThreadBlockSize);
}
levelStartIdx += numOfRowsInLevel;
detail::ILU0::LUFactorization<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
m_gpuReorderedLU->getRowIndices().data(),
m_gpuReorderedLU->getColumnIndices().data(),
m_gpuNaturalToReorder.data(),
m_gpuReorderToNatural.data(),
numOfRowsInLevel,
levelStartIdx,
factorizationThreadBlockSize);
}
levelStartIdx += numOfRowsInLevel;
}
}
@@ -244,54 +259,34 @@ template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::tuneThreadBlockSizes()
{
// TODO generalize this tuning process in a function separate of the class
long long bestApplyTime = std::numeric_limits<long long>::max();
long long bestUpdateTime = std::numeric_limits<long long>::max();
int bestApplyBlockSize = -1;
int bestUpdateBlockSize = -1;
int interval = 64;
// tune the thread-block size of the update function
auto tuneMoveThreadBlockSizeInUpdate
= [this](int moveThreadBlockSize) { this->update(moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize); };
m_moveThreadBlockSize
= detail::tuneThreadBlockSize(tuneMoveThreadBlockSizeInUpdate, "Kernel moving data to reordered matrix");
// temporary buffers for the apply
auto tuneFactorizationThreadBlockSizeInUpdate = [this](int factorizationThreadBlockSize) {
this->update(m_moveThreadBlockSize, factorizationThreadBlockSize);
};
m_ILU0FactorizationThreadBlockSize
= detail::tuneThreadBlockSize(tuneFactorizationThreadBlockSizeInUpdate, "Kernel computing ILU0 factorization");
// tune the thread-block size of the apply
CuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
CuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
tmpD = 1;
for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval) {
// sometimes the first kernel launch can be slower, so take the time twice
for (int i = 0; i < 2; ++i) {
auto tuneLowerSolveThreadBlockSizeInApply = [this, &tmpV, &tmpD](int lowerSolveThreadBlockSize) {
this->apply(tmpV, tmpD, lowerSolveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
};
m_lowerSolveThreadBlockSize = detail::tuneThreadBlockSize(
tuneLowerSolveThreadBlockSizeInApply, "Kernel computing a lower triangular solve for a level set");
auto beforeUpdate = std::chrono::high_resolution_clock::now();
m_updateThreadBlockSize = thrBlockSize;
update();
std::ignore = cudaDeviceSynchronize();
auto afterUpdate = std::chrono::high_resolution_clock::now();
if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid
long long durationInMicroSec
= std::chrono::duration_cast<std::chrono::microseconds>(afterUpdate - beforeUpdate).count();
if (durationInMicroSec < bestUpdateTime) {
bestUpdateTime = durationInMicroSec;
bestUpdateBlockSize = thrBlockSize;
}
}
auto beforeApply = std::chrono::high_resolution_clock::now();
m_applyThreadBlockSize = thrBlockSize;
apply(tmpV, tmpD);
std::ignore = cudaDeviceSynchronize();
auto afterApply = std::chrono::high_resolution_clock::now();
if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid
long long durationInMicroSec
= std::chrono::duration_cast<std::chrono::microseconds>(afterApply - beforeApply).count();
if (durationInMicroSec < bestApplyTime) {
bestApplyTime = durationInMicroSec;
bestApplyBlockSize = thrBlockSize;
}
}
}
}
m_applyThreadBlockSize = bestApplyBlockSize;
m_updateThreadBlockSize = bestUpdateBlockSize;
auto tuneUpperSolveThreadBlockSizeInApply = [this, &tmpV, &tmpD](int upperSolveThreadBlockSize) {
this->apply(tmpV, tmpD, m_moveThreadBlockSize, upperSolveThreadBlockSize);
};
m_upperSolveThreadBlockSize = detail::tuneThreadBlockSize(
tuneUpperSolveThreadBlockSizeInApply, "Kernel computing an upper triangular solve for a level set");
}
} // namespace Opm::cuistl

View File

@@ -82,7 +82,7 @@ public:
void update() final;
//! \brief Compute LU factorization, and update the data of the reordered matrix
void LUFactorizeAndMoveData();
void LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize);
//! \brief Function that will experimentally tune the thread block sizes of the important CUDA kernels
void tuneThreadBlockSizes();
@@ -105,6 +105,10 @@ public:
private:
//! \brief Apply the preconditioner.
void apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int upperSolveThreadBlockSize);
//! \brief Updates the matrix data.
void update(int moveThreadBlockSize, int factorizationThreadBlockSize);
//! \brief Reference to the underlying matrix
const M& m_cpuMatrix;
//! \brief size_t describing the dimensions of the square block elements
@@ -135,8 +139,10 @@ private:
bool m_tuneThreadBlockSizes;
//! \brief Variables storing the thread block sizes to use when tuning is enabled and we are running on AMD cards.
//! The default value of -1 indicates that we have not yet calibrated and selected a value.
int m_applyThreadBlockSize = -1;
int m_updateThreadBlockSize = -1;
int m_upperSolveThreadBlockSize = -1;
int m_lowerSolveThreadBlockSize = -1;
int m_moveThreadBlockSize = -1;
int m_ILU0FactorizationThreadBlockSize = -1;
};
} // end namespace Opm::cuistl

View File

@@ -0,0 +1,93 @@
/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_AUTOTUNER_HPP
#define OPM_AUTOTUNER_HPP
#include <cuda.h>
#include <cuda_runtime.h>
#include <fmt/core.h>
#include <functional>
#include <limits>
#include <opm/common/ErrorMacros.hpp>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
#include <string>
#include <utility>
namespace Opm::cuistl::detail
{
/// @brief Finds the best thread block size for the provided function, assuming its runtime depends on the thread block size
/// @tparam func The type of the function to tune
/// @param f the function to tune, which takes the thread block size as its input
/// @param descriptionOfFunction a human-readable description used when logging the tuning result
template <typename func>
int
tuneThreadBlockSize(func& f, std::string descriptionOfFunction)
{
// This thread block tuner is very simple: it tests all valid block sizes divisible by 64,
// since 64 is a multiple of the AMD wavefront size.
// The maximum size of a thread block is 1024, so an exhaustive search here will not be expensive.
// We time the kernel with each possible thread block size, and return the one
// that gave the fastest individual run.
// TODO: figure out a more rigorous way of deciding how many runs will suffice
constexpr const int runs = 2;
cudaEvent_t events[runs + 1];
// create the events
for (int i = 0; i < runs + 1; ++i) {
OPM_CUDA_SAFE_CALL(cudaEventCreate(&events[i]));
}
// Initialize helper variables
float bestTime = std::numeric_limits<float>::max();
int bestBlockSize = -1;
int interval = 64;
// try each possible blocksize
for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval) {
// record a first event, and then an event after each kernel
OPM_CUDA_SAFE_CALL(cudaEventRecord(events[0]));
for (int i = 0; i < runs; ++i) {
f(thrBlockSize); // run the provided function with this candidate thread block size
OPM_CUDA_SAFE_CALL(cudaEventRecord(events[i + 1]));
}
// make sure the runs are over
OPM_CUDA_SAFE_CALL(cudaEventSynchronize(events[runs]));
// kernel launch was valid
if (cudaSuccess == cudaGetLastError()) {
// check if we beat the record for the fastest kernel
for (int i = 0; i < runs; ++i) {
float candidateBlockSizeTime;
OPM_CUDA_SAFE_CALL(cudaEventElapsedTime(&candidateBlockSizeTime, events[i], events[i + 1]));
if (candidateBlockSizeTime < bestTime) { // checks if this configuration beat the current best
bestTime = candidateBlockSizeTime;
bestBlockSize = thrBlockSize;
}
}
}
}
OpmLog::info(
fmt::format("{}: Tuned block size: {} (fastest runtime: {} ms).", descriptionOfFunction, bestBlockSize, bestTime));
// clean up the events before returning
for (int i = 0; i < runs + 1; ++i) {
OPM_CUDA_SAFE_CALL(cudaEventDestroy(events[i]));
}
return bestBlockSize;
}
} // end namespace Opm::cuistl::detail
#endif
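For reference, the new helper can also be driven directly by a raw kernel launch; a minimal, hypothetical sketch (the kernel and names below are made up for illustration and are not part of this PR):

#include <opm/simulators/linalg/cuistl/detail/autotuner.hpp>

// Hypothetical kernel: scale n doubles by a.
__global__ void scaleKernel(double* x, double a, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] *= a;
    }
}

// Returns the fastest block size (a multiple of 64, at most 1024) for scaleKernel.
int tuneScaleKernel(double* deviceX, int n)
{
    // The tuner only requires a callable taking the candidate block size.
    auto launch = [&](int blockSize) {
        const int numBlocks = (n + blockSize - 1) / blockSize;
        scaleKernel<<<numBlocks, blockSize>>>(deviceX, 2.0, n);
    };
    return Opm::cuistl::detail::tuneThreadBlockSize(launch, "scaleKernel");
}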