mirror of https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
make autotuner use lambda that only depends on blocksize
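The idea, in a minimal self-contained sketch (all names here are illustrative stand-ins, not the simulator's API): before this change the tuner took a zero-argument callable built with std::bind plus a reference to a block-size member that it mutated between timed runs; after it, the tuner times a callable whose only parameter is the candidate block size and returns the fastest one. The sketch times on the host with std::chrono rather than CUDA events so it stays runnable anywhere.

```cpp
#include <chrono>
#include <cstdio>
#include <functional>
#include <limits>

// Illustrative stand-in for the tuner: times f(blockSize) for each candidate
// size and returns the fastest, mirroring the lambda-based interface this
// commit introduces. Names and the 32..1024 sweep are assumptions.
int pickBestBlockSize(const std::function<void(int)>& f)
{
    int bestSize = 32;
    double bestTime = std::numeric_limits<double>::max();
    for (int size = 32; size <= 1024; size += 32) {
        const auto start = std::chrono::steady_clock::now();
        f(size); // the candidate block size is the callable's only dependency
        const std::chrono::duration<double> elapsed
            = std::chrono::steady_clock::now() - start;
        if (elapsed.count() < bestTime) {
            bestTime = elapsed.count();
            bestSize = size;
        }
    }
    return bestSize;
}

int main()
{
    // A kernel launch would go inside the lambda; printf keeps this runnable.
    const int tuned = pickBestBlockSize([](int blockSize) {
        std::printf("timing candidate block size %d\n", blockSize);
    });
    std::printf("tuned block size: %d\n", tuned);
}
```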
@@ -74,10 +74,12 @@ CuDILU<M, X, Y, l>::CuDILU(const M& A, bool splitMatrix, bool tuneKernels)
         std::tie(m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper)
             = detail::extractLowerAndUpperMatrices<M, field_type, CuSparseMatrix<field_type>>(m_cpuMatrix,
                                                                                               m_reorderedToNatural);
     }
     else {
         m_gpuMatrixReordered = detail::createReorderedMatrix<M, field_type, CuSparseMatrix<field_type>>(
             m_cpuMatrix, m_reorderedToNatural);
     }
-    computeDiagAndMoveReorderedData();
+    computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
 
     if (m_tuneThreadBlockSizes) {
         tuneThreadBlockSizes();
@@ -96,69 +98,77 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
 {
     OPM_TIMEBLOCK(prec_apply);
     {
-        int levelStartIdx = 0;
-        for (int level = 0; level < m_levelSets.size(); ++level) {
-            const int numOfRowsInLevel = m_levelSets[level].size();
-            if (m_splitMatrix) {
-                detail::DILU::solveLowerLevelSetSplit<field_type, blocksize_>(
-                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
-                    m_gpuMatrixReorderedLower->getRowIndices().data(),
-                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
-                    m_gpuReorderToNatural.data(),
-                    levelStartIdx,
-                    numOfRowsInLevel,
-                    m_gpuDInv.data(),
-                    d.data(),
-                    v.data(),
-                    m_lowerSolveThreadBlockSize);
-            } else {
-                detail::DILU::solveLowerLevelSet<field_type, blocksize_>(
-                    m_gpuMatrixReordered->getNonZeroValues().data(),
-                    m_gpuMatrixReordered->getRowIndices().data(),
-                    m_gpuMatrixReordered->getColumnIndices().data(),
-                    m_gpuReorderToNatural.data(),
-                    levelStartIdx,
-                    numOfRowsInLevel,
-                    m_gpuDInv.data(),
-                    d.data(),
-                    v.data(),
-                    m_lowerSolveThreadBlockSize);
-            }
-            levelStartIdx += numOfRowsInLevel;
-        }
-
-        levelStartIdx = m_cpuMatrix.N();
-        // upper triangular solve: (D + U_A) v = Dy
-        for (int level = m_levelSets.size() - 1; level >= 0; --level) {
-            const int numOfRowsInLevel = m_levelSets[level].size();
-            levelStartIdx -= numOfRowsInLevel;
-            if (m_splitMatrix) {
-                detail::DILU::solveUpperLevelSetSplit<field_type, blocksize_>(
-                    m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
-                    m_gpuMatrixReorderedUpper->getRowIndices().data(),
-                    m_gpuMatrixReorderedUpper->getColumnIndices().data(),
-                    m_gpuReorderToNatural.data(),
-                    levelStartIdx,
-                    numOfRowsInLevel,
-                    m_gpuDInv.data(),
-                    v.data(),
-                    m_upperSolveThreadBlockSize);
-            } else {
-                detail::DILU::solveUpperLevelSet<field_type, blocksize_>(
-                    m_gpuMatrixReordered->getNonZeroValues().data(),
-                    m_gpuMatrixReordered->getRowIndices().data(),
-                    m_gpuMatrixReordered->getColumnIndices().data(),
-                    m_gpuReorderToNatural.data(),
-                    levelStartIdx,
-                    numOfRowsInLevel,
-                    m_gpuDInv.data(),
-                    v.data(),
-                    m_upperSolveThreadBlockSize);
-            }
-        }
+        apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
     }
 }
 
+template <class M, class X, class Y, int l>
+void
+CuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int upperSolveThreadBlockSize)
+{
+    int levelStartIdx = 0;
+    for (int level = 0; level < m_levelSets.size(); ++level) {
+        const int numOfRowsInLevel = m_levelSets[level].size();
+        if (m_splitMatrix) {
+            detail::DILU::solveLowerLevelSetSplit<field_type, blocksize_>(
+                m_gpuMatrixReorderedLower->getNonZeroValues().data(),
+                m_gpuMatrixReorderedLower->getRowIndices().data(),
+                m_gpuMatrixReorderedLower->getColumnIndices().data(),
+                m_gpuReorderToNatural.data(),
+                levelStartIdx,
+                numOfRowsInLevel,
+                m_gpuDInv.data(),
+                d.data(),
+                v.data(),
+                lowerSolveThreadBlockSize);
+        } else {
+            detail::DILU::solveLowerLevelSet<field_type, blocksize_>(
+                m_gpuMatrixReordered->getNonZeroValues().data(),
+                m_gpuMatrixReordered->getRowIndices().data(),
+                m_gpuMatrixReordered->getColumnIndices().data(),
+                m_gpuReorderToNatural.data(),
+                levelStartIdx,
+                numOfRowsInLevel,
+                m_gpuDInv.data(),
+                d.data(),
+                v.data(),
+                lowerSolveThreadBlockSize);
+        }
+        levelStartIdx += numOfRowsInLevel;
+    }
+
+    levelStartIdx = m_cpuMatrix.N();
+    // upper triangular solve: (D + U_A) v = Dy
+    for (int level = m_levelSets.size() - 1; level >= 0; --level) {
+        const int numOfRowsInLevel = m_levelSets[level].size();
+        levelStartIdx -= numOfRowsInLevel;
+        if (m_splitMatrix) {
+            detail::DILU::solveUpperLevelSetSplit<field_type, blocksize_>(
+                m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
+                m_gpuMatrixReorderedUpper->getRowIndices().data(),
+                m_gpuMatrixReorderedUpper->getColumnIndices().data(),
+                m_gpuReorderToNatural.data(),
+                levelStartIdx,
+                numOfRowsInLevel,
+                m_gpuDInv.data(),
+                v.data(),
+                upperSolveThreadBlockSize);
+        } else {
+            detail::DILU::solveUpperLevelSet<field_type, blocksize_>(
+                m_gpuMatrixReordered->getNonZeroValues().data(),
+                m_gpuMatrixReordered->getRowIndices().data(),
+                m_gpuMatrixReordered->getColumnIndices().data(),
+                m_gpuReorderToNatural.data(),
+                levelStartIdx,
+                numOfRowsInLevel,
+                m_gpuDInv.data(),
+                v.data(),
+                upperSolveThreadBlockSize);
+        }
+    }
+}
 
 
 template <class M, class X, class Y, int l>
 void
 CuDILU<M, X, Y, l>::post([[maybe_unused]] X& x)
@@ -178,72 +188,76 @@ CuDILU<M, X, Y, l>::update()
 {
     OPM_TIMEBLOCK(prec_update);
     {
-        m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
-        computeDiagAndMoveReorderedData();
+        update(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
     }
 }
 
 template <class M, class X, class Y, int l>
 void
-CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData()
+CuDILU<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationBlockSize)
+{
+    OPM_TIMEBLOCK(prec_update);
+    {
+        m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
+        computeDiagAndMoveReorderedData(moveThreadBlockSize, factorizationBlockSize);
+    }
+}
+
+template <class M, class X, class Y, int l>
+void
+CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationBlockSize)
 {
     if (m_splitMatrix) {
         detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
             m_gpuMatrix.getNonZeroValues().data(),
             m_gpuMatrix.getRowIndices().data(),
             m_gpuMatrix.getColumnIndices().data(),
             m_gpuMatrixReorderedLower->getNonZeroValues().data(),
             m_gpuMatrixReorderedLower->getRowIndices().data(),
             m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
             m_gpuMatrixReorderedUpper->getRowIndices().data(),
             m_gpuMatrixReorderedDiag->data(),
             m_gpuNaturalToReorder.data(),
             m_gpuMatrixReorderedLower->N(),
-            m_moveThreadBlockSize);
+            moveThreadBlockSize);
     } else {
         detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(),
                                                                m_gpuMatrix.getRowIndices().data(),
                                                                m_gpuMatrixReordered->getNonZeroValues().data(),
                                                                m_gpuMatrixReordered->getRowIndices().data(),
                                                                m_gpuNaturalToReorder.data(),
                                                                m_gpuMatrixReordered->N(),
-                                                               m_moveThreadBlockSize);
+                                                               moveThreadBlockSize);
     }
 
     int levelStartIdx = 0;
     for (int level = 0; level < m_levelSets.size(); ++level) {
         const int numOfRowsInLevel = m_levelSets[level].size();
         if (m_splitMatrix) {
             detail::DILU::computeDiluDiagonalSplit<field_type, blocksize_>(
                 m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                 m_gpuMatrixReorderedLower->getRowIndices().data(),
                 m_gpuMatrixReorderedLower->getColumnIndices().data(),
                 m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
                 m_gpuMatrixReorderedUpper->getRowIndices().data(),
                 m_gpuMatrixReorderedUpper->getColumnIndices().data(),
                 m_gpuMatrixReorderedDiag->data(),
                 m_gpuReorderToNatural.data(),
                 m_gpuNaturalToReorder.data(),
                 levelStartIdx,
                 numOfRowsInLevel,
                 m_gpuDInv.data(),
-                m_DILUFactorizationThreadBlockSize);
+                factorizationBlockSize);
         } else {
             detail::DILU::computeDiluDiagonal<field_type, blocksize_>(
                 m_gpuMatrixReordered->getNonZeroValues().data(),
                 m_gpuMatrixReordered->getRowIndices().data(),
                 m_gpuMatrixReordered->getColumnIndices().data(),
                 m_gpuReorderToNatural.data(),
                 m_gpuNaturalToReorder.data(),
                 levelStartIdx,
                 numOfRowsInLevel,
                 m_gpuDInv.data(),
-                m_DILUFactorizationThreadBlockSize);
+                factorizationBlockSize);
         }
         levelStartIdx += numOfRowsInLevel;
     }
 }
@@ -251,21 +265,31 @@ template <class M, class X, class Y, int l>
 void
 CuDILU<M, X, Y, l>::tuneThreadBlockSizes()
 {
-    using CuDILUType = std::remove_reference_t<decltype(*this)>;
-
     // tune the thread-block size of the update function
-    auto updateFunc = std::bind(&CuDILUType::update, this);
-    detail::tuneThreadBlockSize(updateFunc, m_moveThreadBlockSize);
-    detail::tuneThreadBlockSize(updateFunc, m_DILUFactorizationThreadBlockSize);
+    auto tuneMoveThreadBlockSizeInUpdate = [this](int moveThreadBlockSize){
+        this->update(moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
+    };
+    m_moveThreadBlockSize = detail::tuneThreadBlockSize(tuneMoveThreadBlockSizeInUpdate);
+
+    auto tuneFactorizationThreadBlockSizeInUpdate = [this](int factorizationThreadBlockSize){
+        this->update(m_moveThreadBlockSize, factorizationThreadBlockSize);
+    };
+    m_DILUFactorizationThreadBlockSize = detail::tuneThreadBlockSize(tuneFactorizationThreadBlockSizeInUpdate);
 
     // tune the thread-block size of the apply
-    auto applyFunc = std::bind(&CuDILUType::apply, this, std::placeholders::_1, std::placeholders::_1);
     CuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
     CuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
     tmpD = 1;
 
-    detail::tuneThreadBlockSize(applyFunc, m_lowerSolveThreadBlockSize, tmpV, tmpD);
-    detail::tuneThreadBlockSize(applyFunc, m_upperSolveThreadBlockSize, tmpV, tmpD);
+    auto tuneLowerSolveThreadBlockSizeInApply = [this, &tmpV, &tmpD](int lowerSolveThreadBlockSize){
+        this->apply(tmpV, tmpD, lowerSolveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
+    };
+    m_lowerSolveThreadBlockSize = detail::tuneThreadBlockSize(tuneLowerSolveThreadBlockSizeInApply);
+
+    auto tuneUpperSolveThreadBlockSizeInApply = [this, &tmpV, &tmpD](int upperSolveThreadBlockSize){
+        this->apply(tmpV, tmpD, m_moveThreadBlockSize, upperSolveThreadBlockSize);
+    };
+    m_upperSolveThreadBlockSize = detail::tuneThreadBlockSize(tuneUpperSolveThreadBlockSizeInApply);
 }
 
 } // namespace Opm::cuistl
@@ -80,7 +80,7 @@ public:
     void update() final;
 
     //! \brief Compute the diagonal of the DILU, and update the data of the reordered matrix
-    void computeDiagAndMoveReorderedData();
+    void computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationThreadBlockSize);
 
     //! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
     void tuneThreadBlockSizes();
@@ -100,6 +100,10 @@ public:
 
 
 private:
+    //! \brief Apply the preconditioner.
+    void apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int upperSolveThreadBlockSize);
+    //! \brief Updates the matrix data.
+    void update(int moveThreadBlockSize, int factorizationThreadBlockSize);
     //! \brief Reference to the underlying matrix
     const M& m_cpuMatrix;
     //! \brief size_t describing the dimensions of the square block elements
@@ -79,7 +79,7 @@ OpmCuILU0<M, X, Y, l>::OpmCuILU0(const M& A, bool splitMatrix, bool tuneKernels)
         m_gpuReorderedLU = detail::createReorderedMatrix<M, field_type, CuSparseMatrix<field_type>>(
             m_cpuMatrix, m_reorderedToNatural);
     }
-    LUFactorizeAndMoveData();
+    LUFactorizeAndMoveData(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
 
     if (m_tuneThreadBlockSizes) {
         tuneThreadBlockSizes();
@@ -98,59 +98,66 @@ OpmCuILU0<M, X, Y, l>::apply(X& v, const Y& d)
 {
     OPM_TIMEBLOCK(prec_apply);
     {
-        int levelStartIdx = 0;
-        for (int level = 0; level < m_levelSets.size(); ++level) {
-            const int numOfRowsInLevel = m_levelSets[level].size();
-            if (m_splitMatrix) {
-                detail::ILU0::solveLowerLevelSetSplit<field_type, blocksize_>(
-                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
-                    m_gpuMatrixReorderedLower->getRowIndices().data(),
-                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
-                    m_gpuReorderToNatural.data(),
-                    levelStartIdx,
-                    numOfRowsInLevel,
-                    d.data(),
-                    v.data(),
-                    m_lowerSolveThreadBlockSize);
-            } else {
-                detail::ILU0::solveLowerLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
-                                                                         m_gpuReorderedLU->getRowIndices().data(),
-                                                                         m_gpuReorderedLU->getColumnIndices().data(),
-                                                                         m_gpuReorderToNatural.data(),
-                                                                         levelStartIdx,
-                                                                         numOfRowsInLevel,
-                                                                         d.data(),
-                                                                         v.data(),
-                                                                         m_lowerSolveThreadBlockSize);
-            }
-            levelStartIdx += numOfRowsInLevel;
-        }
-
-        levelStartIdx = m_cpuMatrix.N();
-        for (int level = m_levelSets.size() - 1; level >= 0; --level) {
-            const int numOfRowsInLevel = m_levelSets[level].size();
-            levelStartIdx -= numOfRowsInLevel;
-            if (m_splitMatrix) {
-                detail::ILU0::solveUpperLevelSetSplit<field_type, blocksize_>(
-                    m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
-                    m_gpuMatrixReorderedUpper->getRowIndices().data(),
-                    m_gpuMatrixReorderedUpper->getColumnIndices().data(),
-                    m_gpuReorderToNatural.data(),
-                    levelStartIdx,
-                    numOfRowsInLevel,
-                    m_gpuMatrixReorderedDiag.value().data(),
-                    v.data(),
-                    m_upperSolveThreadBlockSize);
-            } else {
-                detail::ILU0::solveUpperLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
-                                                                         m_gpuReorderedLU->getRowIndices().data(),
-                                                                         m_gpuReorderedLU->getColumnIndices().data(),
-                                                                         m_gpuReorderToNatural.data(),
-                                                                         levelStartIdx,
-                                                                         numOfRowsInLevel,
-                                                                         v.data(),
-                                                                         m_upperSolveThreadBlockSize);
-            }
-        }
+        apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
     }
 }
 
+template <class M, class X, class Y, int l>
+void
+OpmCuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int upperSolveThreadBlockSize)
+{
+    int levelStartIdx = 0;
+    for (int level = 0; level < m_levelSets.size(); ++level) {
+        const int numOfRowsInLevel = m_levelSets[level].size();
+        if (m_splitMatrix) {
+            detail::ILU0::solveLowerLevelSetSplit<field_type, blocksize_>(
+                m_gpuMatrixReorderedLower->getNonZeroValues().data(),
+                m_gpuMatrixReorderedLower->getRowIndices().data(),
+                m_gpuMatrixReorderedLower->getColumnIndices().data(),
+                m_gpuReorderToNatural.data(),
+                levelStartIdx,
+                numOfRowsInLevel,
+                d.data(),
+                v.data(),
+                lowerSolveThreadBlockSize);
+        } else {
+            detail::ILU0::solveLowerLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
+                                                                     m_gpuReorderedLU->getRowIndices().data(),
+                                                                     m_gpuReorderedLU->getColumnIndices().data(),
+                                                                     m_gpuReorderToNatural.data(),
+                                                                     levelStartIdx,
+                                                                     numOfRowsInLevel,
+                                                                     d.data(),
+                                                                     v.data(),
+                                                                     lowerSolveThreadBlockSize);
+        }
+        levelStartIdx += numOfRowsInLevel;
+    }
+
+    levelStartIdx = m_cpuMatrix.N();
+    for (int level = m_levelSets.size() - 1; level >= 0; --level) {
+        const int numOfRowsInLevel = m_levelSets[level].size();
+        levelStartIdx -= numOfRowsInLevel;
+        if (m_splitMatrix) {
+            detail::ILU0::solveUpperLevelSetSplit<field_type, blocksize_>(
+                m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
+                m_gpuMatrixReorderedUpper->getRowIndices().data(),
+                m_gpuMatrixReorderedUpper->getColumnIndices().data(),
+                m_gpuReorderToNatural.data(),
+                levelStartIdx,
+                numOfRowsInLevel,
+                m_gpuMatrixReorderedDiag.value().data(),
+                v.data(),
+                upperSolveThreadBlockSize);
+        } else {
+            detail::ILU0::solveUpperLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
+                                                                     m_gpuReorderedLU->getRowIndices().data(),
+                                                                     m_gpuReorderedLU->getColumnIndices().data(),
+                                                                     m_gpuReorderToNatural.data(),
+                                                                     levelStartIdx,
+                                                                     numOfRowsInLevel,
+                                                                     v.data(),
+                                                                     upperSolveThreadBlockSize);
+        }
+    }
+}
@@ -174,70 +181,76 @@ OpmCuILU0<M, X, Y, l>::update()
 {
     OPM_TIMEBLOCK(prec_update);
     {
-        m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
-        LUFactorizeAndMoveData();
+        update(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
     }
 }
 
 template <class M, class X, class Y, int l>
 void
-OpmCuILU0<M, X, Y, l>::LUFactorizeAndMoveData()
+OpmCuILU0<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationThreadBlockSize)
+{
+    OPM_TIMEBLOCK(prec_update);
+    {
+        m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
+        LUFactorizeAndMoveData(moveThreadBlockSize, factorizationThreadBlockSize);
+    }
+}
+
+template <class M, class X, class Y, int l>
+void
+OpmCuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize)
 {
     if (m_splitMatrix) {
         detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
             m_gpuMatrix.getNonZeroValues().data(),
             m_gpuMatrix.getRowIndices().data(),
             m_gpuMatrix.getColumnIndices().data(),
             m_gpuMatrixReorderedLower->getNonZeroValues().data(),
             m_gpuMatrixReorderedLower->getRowIndices().data(),
             m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
             m_gpuMatrixReorderedUpper->getRowIndices().data(),
             m_gpuMatrixReorderedDiag.value().data(),
             m_gpuNaturalToReorder.data(),
             m_gpuMatrixReorderedLower->N(),
-            m_moveThreadBlockSize);
+            moveThreadBlockSize);
     } else {
         detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(),
                                                                m_gpuMatrix.getRowIndices().data(),
                                                                m_gpuReorderedLU->getNonZeroValues().data(),
                                                                m_gpuReorderedLU->getRowIndices().data(),
                                                                m_gpuNaturalToReorder.data(),
                                                                m_gpuReorderedLU->N(),
-                                                               m_moveThreadBlockSize);
+                                                               moveThreadBlockSize);
     }
     int levelStartIdx = 0;
     for (int level = 0; level < m_levelSets.size(); ++level) {
         const int numOfRowsInLevel = m_levelSets[level].size();
 
         if (m_splitMatrix) {
             detail::ILU0::LUFactorizationSplit<field_type, blocksize_>(
                 m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                 m_gpuMatrixReorderedLower->getRowIndices().data(),
                 m_gpuMatrixReorderedLower->getColumnIndices().data(),
                 m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
                 m_gpuMatrixReorderedUpper->getRowIndices().data(),
                 m_gpuMatrixReorderedUpper->getColumnIndices().data(),
                 m_gpuMatrixReorderedDiag.value().data(),
                 m_gpuReorderToNatural.data(),
                 m_gpuNaturalToReorder.data(),
                 levelStartIdx,
                 numOfRowsInLevel,
-                m_ILU0FactorizationThreadBlockSize);
+                factorizationThreadBlockSize);
 
         } else {
             detail::ILU0::LUFactorization<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
                                                                   m_gpuReorderedLU->getRowIndices().data(),
                                                                   m_gpuReorderedLU->getColumnIndices().data(),
                                                                   m_gpuNaturalToReorder.data(),
                                                                   m_gpuReorderToNatural.data(),
                                                                   numOfRowsInLevel,
                                                                   levelStartIdx,
-                                                                  m_ILU0FactorizationThreadBlockSize);
+                                                                  factorizationThreadBlockSize);
         }
         levelStartIdx += numOfRowsInLevel;
     }
 }
@@ -245,21 +258,31 @@ template <class M, class X, class Y, int l>
 void
 OpmCuILU0<M, X, Y, l>::tuneThreadBlockSizes()
 {
-    using CuDILUType = std::remove_reference_t<decltype(*this)>;
-
     // tune the thread-block size of the update function
-    auto updateFunc = std::bind(&CuDILUType::update, this);
-    detail::tuneThreadBlockSize(updateFunc, m_moveThreadBlockSize);
-    detail::tuneThreadBlockSize(updateFunc, m_ILU0FactorizationThreadBlockSize);
+    auto tuneMoveThreadBlockSizeInUpdate = [this](int moveThreadBlockSize){
+        this->update(moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
+    };
+    m_moveThreadBlockSize = detail::tuneThreadBlockSize(tuneMoveThreadBlockSizeInUpdate);
+
+    auto tuneFactorizationThreadBlockSizeInUpdate = [this](int factorizationThreadBlockSize){
+        this->update(m_moveThreadBlockSize, factorizationThreadBlockSize);
+    };
+    m_ILU0FactorizationThreadBlockSize = detail::tuneThreadBlockSize(tuneFactorizationThreadBlockSizeInUpdate);
 
     // tune the thread-block size of the apply
-    auto applyFunc = std::bind(&CuDILUType::apply, this, std::placeholders::_1, std::placeholders::_1);
     CuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
     CuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
     tmpD = 1;
 
-    detail::tuneThreadBlockSize(applyFunc, m_lowerSolveThreadBlockSize, tmpV, tmpD);
-    detail::tuneThreadBlockSize(applyFunc, m_upperSolveThreadBlockSize, tmpV, tmpD);
+    auto tuneLowerSolveThreadBlockSizeInApply = [this, &tmpV, &tmpD](int lowerSolveThreadBlockSize){
+        this->apply(tmpV, tmpD, lowerSolveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
+    };
+    m_lowerSolveThreadBlockSize = detail::tuneThreadBlockSize(tuneLowerSolveThreadBlockSizeInApply);
+
+    auto tuneUpperSolveThreadBlockSizeInApply = [this, &tmpV, &tmpD](int upperSolveThreadBlockSize){
+        this->apply(tmpV, tmpD, m_moveThreadBlockSize, upperSolveThreadBlockSize);
+    };
+    m_upperSolveThreadBlockSize = detail::tuneThreadBlockSize(tuneUpperSolveThreadBlockSizeInApply);
 }
 
 } // namespace Opm::cuistl
@@ -82,7 +82,7 @@ public:
     void update() final;
 
     //! \brief Compute LU factorization, and update the data of the reordered matrix
-    void LUFactorizeAndMoveData();
+    void LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize);
 
     //! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
     void tuneThreadBlockSizes();
@@ -101,6 +101,10 @@ public:
 
 
 private:
+    //! \brief Apply the preconditioner.
+    void apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int upperSolveThreadBlockSize);
+    //! \brief Updates the matrix data.
+    void update(int moveThreadBlockSize, int factorizationThreadBlockSize);
     //! \brief Reference to the underlying matrix
     const M& m_cpuMatrix;
     //! \brief size_t describing the dimensions of the square block elements
@@ -29,11 +29,9 @@ namespace Opm::cuistl::detail
 /// @brief Function that tests the best thread block size, assumes updating the reference will affect runtimes
 /// @tparam func function to tune
-/// @tparam ...Args types of the arguments needed to call the function
-/// @param f the function to tune
-/// @param threadBlockSize reference to the thread block size that will affect kernel executions
-/// @param ...args arguments needed by the function
+/// @param f the function to tune, which takes the thread block size as the input
 template <typename func, typename... Args>
-void tuneThreadBlockSize(func f, int& threadBlockSize, Args&&... args) {
+int tuneThreadBlockSize(func& f) {
 
     // TODO: figure out a more rigorous way of deciding how many runs will suffice?
     constexpr const int runs = 2;
@@ -51,13 +49,11 @@ namespace Opm::cuistl::detail
 
     // try each possible blocksize
     for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval){
-        // update the blocksize
-        threadBlockSize = thrBlockSize;
-
         // record a first event, and then an event after each kernel
        OPM_CUDA_SAFE_CALL(cudaEventRecord(events[0]));
         for (int i = 0; i < runs; ++i){
-            f(std::forward<Args>(args)...); // runs an arbitrary function with the provided arguments
+            f(thrBlockSize); // runs an arbitrary function with the provided arguments
             OPM_CUDA_SAFE_CALL(cudaEventRecord(events[i+1]));
         }
 
@@ -80,7 +76,7 @@ namespace Opm::cuistl::detail
     // TODO: have this only be printed if linear solver verbosity >0?
     printf("best size: %d, best time %f\n", bestBlockSize, bestTime);
 
-    threadBlockSize = bestBlockSize;
+    return bestBlockSize;
 }
 
 } // end namespace Opm::cuistl::detail
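To make the new contract concrete, a hedged usage sketch of the reworked tuner: the caller wraps a kernel launch in a lambda whose single parameter is the candidate block size, then stores the returned winner. `launchSomeKernel` is a hypothetical placeholder, not part of this diff; only the `tuneThreadBlockSize(f)` shape comes from the change above, and the sketch assumes the autotuner header and a CUDA-capable build.

```cpp
// Usage sketch under stated assumptions: requires the autotuner header from
// this diff; launchSomeKernel is a made-up stand-in for a tuned kernel wrapper.
void launchSomeKernel(int threadBlockSize); // hypothetical kernel launcher

void tuneSomeKernel()
{
    auto candidate = [](int threadBlockSize) {
        launchSomeKernel(threadBlockSize); // depends only on the block size
    };
    const int best = Opm::cuistl::detail::tuneThreadBlockSize(candidate);
    launchSomeKernel(best); // later launches reuse the tuned size
}
```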