Generalize block size tuner

2024-07-07 04:53:03 -05:00 · 2024-06-18 15:57:43 +02:00 · 2024-06-18 15:57:43 +02:00 · 11cd20beac
commit 11cd20beac
parent a41dfc5a6e
8 changed files with 195 additions and 107 deletions
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@ -217,6 +217,7 @@ if (HAVE_CUDA)
  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg set_device.cpp)

  # HEADERS
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/autotuner.hpp)
  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/coloringAndReorderingUtils.hpp)
  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cuda_safe_call.hpp)
  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cusparse_matrix_operations.hpp)
--- a/opm/simulators/linalg/cuistl/CuDILU.cpp
+++ b/opm/simulators/linalg/cuistl/CuDILU.cpp
@ -22,6 +22,7 @@
 #include <dune/istl/bcrsmatrix.hh>
 #include <fmt/core.h>
 #include <opm/common/ErrorMacros.hpp>
+#include <opm/simulators/linalg/cuistl/detail/autotuner.hpp>
 #include <opm/simulators/linalg/cuistl/detail/coloringAndReorderingUtils.hpp>
 #include <opm/simulators/linalg/cuistl/CuDILU.hpp>
 #include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
@ -35,6 +36,8 @@
 #include <config.h>
 #include <chrono>
 #include <tuple>
+#include <functional>
+#include <utility>
 namespace Opm::cuistl
 {

@ -80,6 +83,9 @@ CuDILU<M, X, Y, l>::CuDILU(const M& A, bool splitMatrix, bool tuneKernels)

    // HIP does currently not support automtically picking thread block sizes as well as CUDA
    // So only when tuning and using hip should we do our own manual tuning
+    if (m_tuneThreadBlockSizes){
+        tuneThreadBlockSizes();
+    }
 #ifdef USE_HIP
    if (m_tuneThreadBlockSizes){
        tuneThreadBlockSizes();
@ -99,6 +105,9 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
 {
    OPM_TIMEBLOCK(prec_apply);
    {
+        // cudaDeviceSynchronize();
+        // auto start = std::chrono::high_resolution_clock::now();
+
        int levelStartIdx = 0;
        for (int level = 0; level < m_levelSets.size(); ++level) {
            const int numOfRowsInLevel = m_levelSets[level].size();
@ -113,7 +122,7 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
                    m_gpuDInv.data(),
                    d.data(),
                    v.data(),
-                    m_applyThreadBlockSize);
+                    m_lowerSolveThreadBlockSize);
            } else {
                detail::DILU::solveLowerLevelSet<field_type, blocksize_>(
                    m_gpuMatrixReordered->getNonZeroValues().data(),
@ -125,7 +134,7 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
                    m_gpuDInv.data(),
                    d.data(),
                    v.data(),
-                    m_applyThreadBlockSize);
+                    m_lowerSolveThreadBlockSize);
            }
            levelStartIdx += numOfRowsInLevel;
        }
@ -145,7 +154,7 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
                    numOfRowsInLevel,
                    m_gpuDInv.data(),
                    v.data(),
-                    m_applyThreadBlockSize);
+                    m_upperSolveThreadBlockSize);
            } else {
                detail::DILU::solveUpperLevelSet<field_type, blocksize_>(
                    m_gpuMatrixReordered->getNonZeroValues().data(),
@ -156,9 +165,13 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
                    numOfRowsInLevel,
                    m_gpuDInv.data(),
                    v.data(),
-                    m_applyThreadBlockSize);
+                    m_upperSolveThreadBlockSize);
            }
        }
+        // cudaDeviceSynchronize();
+        // auto end = std::chrono::high_resolution_clock::now();
+        // auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
+        // printf("Apply duration %ldus\n", duration);
    }

 }
@ -182,8 +195,15 @@ CuDILU<M, X, Y, l>::update()
 {
    OPM_TIMEBLOCK(prec_update);
    {
+        // cudaDeviceSynchronize();
+        // auto start = std::chrono::high_resolution_clock::now();
        m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
        computeDiagAndMoveReorderedData();
+
+        // cudaDeviceSynchronize();
+        // auto end = std::chrono::high_resolution_clock::now();
+        // auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
+        // printf("Update duration %ldus\n", duration);
    }
 }

@ -205,7 +225,7 @@ CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData()
                m_gpuMatrixReorderedDiag->data(),
                m_gpuNaturalToReorder.data(),
                m_gpuMatrixReorderedLower->N(),
-                m_updateThreadBlockSize);
+                m_moveThreadBlockSize);
        } else {
            detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(),
                                                                   m_gpuMatrix.getRowIndices().data(),
@ -213,7 +233,7 @@ CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData()
                                                                   m_gpuMatrixReordered->getRowIndices().data(),
                                                                   m_gpuNaturalToReorder.data(),
                                                                   m_gpuMatrixReordered->N(),
-                                                                   m_updateThreadBlockSize);
+                                                                   m_moveThreadBlockSize);
        }

        int levelStartIdx = 0;
@ -233,7 +253,7 @@ CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData()
                    levelStartIdx,
                    numOfRowsInLevel,
                    m_gpuDInv.data(),
-                    m_updateThreadBlockSize);
+                    m_DILUFactorizationThreadBlockSize);
            } else {
                detail::DILU::computeDiluDiagonal<field_type, blocksize_>(m_gpuMatrixReordered->getNonZeroValues().data(),
                                                                    m_gpuMatrixReordered->getRowIndices().data(),
@ -243,7 +263,7 @@ CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData()
                                                                    levelStartIdx,
                                                                    numOfRowsInLevel,
                                                                    m_gpuDInv.data(),
-                                                                    m_updateThreadBlockSize);
+                                                                    m_DILUFactorizationThreadBlockSize);
            }
            levelStartIdx += numOfRowsInLevel;
        }
@ -254,52 +274,19 @@ template <class M, class X, class Y, int l>
 void
 CuDILU<M, X, Y, l>::tuneThreadBlockSizes()
 {
-    // TODO: generalize this code and put it somewhere outside of this class
-    long long bestApplyTime = __LONG_LONG_MAX__;
-    long long bestUpdateTime = __LONG_LONG_MAX__;
-    int bestApplyBlockSize = -1;
-    int bestUpdateBlockSize = -1;
-    int interval = 64;

-    //temporary buffers for the apply
+    using CuDILUType = std::remove_reference_t<decltype(*this)>;
+    auto updateFunc = std::bind(&CuDILUType::update, this);
+    auto applyFunc = std::bind(&CuDILUType::apply, this, std::placeholders::_1, std::placeholders::_1);
+
+    detail::tuneThreadBlockSize(updateFunc, m_moveThreadBlockSize);
+    detail::tuneThreadBlockSize(updateFunc, m_DILUFactorizationThreadBlockSize);
+
    CuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
    CuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
    tmpD = 1;
-
-    for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval){
-        // sometimes the first kernel launch kan be slower, so take the time twice
-        for (int i = 0; i < 2; ++i){
-
-            auto beforeUpdate = std::chrono::high_resolution_clock::now();
-            m_updateThreadBlockSize = thrBlockSize;
-            update();
-            std::ignore = cudaDeviceSynchronize();
-            auto afterUpdate = std::chrono::high_resolution_clock::now();
-            if (cudaSuccess == cudaGetLastError()){ // kernel launch was valid
-                long long durationInMicroSec = std::chrono::duration_cast<std::chrono::microseconds>(afterUpdate - beforeUpdate).count();
-                if (durationInMicroSec < bestUpdateTime){
-                    bestUpdateTime = durationInMicroSec;
-                    bestUpdateBlockSize = thrBlockSize;
-                }
-            }
-
-            auto beforeApply = std::chrono::high_resolution_clock::now();
-            m_applyThreadBlockSize = thrBlockSize;
-            apply(tmpV, tmpD);
-            std::ignore = cudaDeviceSynchronize();
-            auto afterApply = std::chrono::high_resolution_clock::now();
-            if (cudaSuccess == cudaGetLastError()){ // kernel launch was valid
-                long long durationInMicroSec = std::chrono::duration_cast<std::chrono::microseconds>(afterApply - beforeApply).count();
-                if (durationInMicroSec < bestApplyTime){
-                    bestApplyTime = durationInMicroSec;
-                    bestApplyBlockSize = thrBlockSize;
-                }
-            }
-        }
-    }
-
-    m_applyThreadBlockSize = bestApplyBlockSize;
-    m_updateThreadBlockSize = bestUpdateBlockSize;
+    detail::tuneThreadBlockSize(applyFunc, m_lowerSolveThreadBlockSize, tmpV, tmpD);
+    detail::tuneThreadBlockSize(applyFunc, m_upperSolveThreadBlockSize, tmpV, tmpD);
 }

 } // namespace Opm::cuistl
--- a/opm/simulators/linalg/cuistl/CuDILU.hpp
+++ b/opm/simulators/linalg/cuistl/CuDILU.hpp
@ -137,8 +137,10 @@ private:
    bool m_tuneThreadBlockSizes;
    //! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards
    //! The default value of -1 indicates that we have not calibrated and selected a value yet
-    int m_applyThreadBlockSize = -1;
-    int m_updateThreadBlockSize = -1;
+    int m_upperSolveThreadBlockSize = -1;
+    int m_lowerSolveThreadBlockSize = -1;
+    int m_moveThreadBlockSize = -1;
+    int m_DILUFactorizationThreadBlockSize = -1;
 };
 } // end namespace Opm::cuistl

--- a/opm/simulators/linalg/cuistl/CuILU0_OPM_Impl.cpp
+++ b/opm/simulators/linalg/cuistl/CuILU0_OPM_Impl.cpp
@ -22,6 +22,7 @@
 #include <dune/istl/bcrsmatrix.hh>
 #include <fmt/core.h>
 #include <opm/common/ErrorMacros.hpp>
+#include <opm/simulators/linalg/cuistl/detail/autotuner.hpp>
 #include <opm/simulators/linalg/cuistl/detail/coloringAndReorderingUtils.hpp>
 #include <opm/simulators/linalg/cuistl/CuILU0_OPM_Impl.hpp>
 #include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
@ -81,6 +82,9 @@ CuILU0_OPM_Impl<M, X, Y, l>::CuILU0_OPM_Impl(const M& A, bool splitMatrix, bool
    }
    computeDiagAndMoveReorderedData();

+    if (m_tuneThreadBlockSizes){
+        tuneThreadBlockSizes();
+    }
 #ifdef USE_HIP
    if (m_tuneThreadBlockSizes){
        tuneThreadBlockSizes();
@ -98,8 +102,12 @@ template <class M, class X, class Y, int l>
 void
 CuILU0_OPM_Impl<M, X, Y, l>::apply(X& v, const Y& d)
 {
+    // ScopeTimer timer("Apply");
    OPM_TIMEBLOCK(prec_apply);
    {
+        // cudaDeviceSynchronize();
+        // auto start = std::chrono::high_resolution_clock::now();
+
        int levelStartIdx = 0;
        for (int level = 0; level < m_levelSets.size(); ++level) {
            const int numOfRowsInLevel = m_levelSets[level].size();
@ -114,7 +122,7 @@ CuILU0_OPM_Impl<M, X, Y, l>::apply(X& v, const Y& d)
                    m_gpuMatrixReorderedDiag.value().data(),
                    d.data(),
                    v.data(),
-                    m_applyThreadBlockSize);
+                    m_lowerSolveThreadBlockSize);
            } else {
                detail::ILU0::solveLowerLevelSet<field_type, blocksize_>(
                    m_gpuReorderedLU->getNonZeroValues().data(),
@ -125,7 +133,7 @@ CuILU0_OPM_Impl<M, X, Y, l>::apply(X& v, const Y& d)
                    numOfRowsInLevel,
                    d.data(),
                    v.data(),
-                    m_applyThreadBlockSize);
+                    m_lowerSolveThreadBlockSize);
            }
            levelStartIdx += numOfRowsInLevel;
        }
@ -145,7 +153,7 @@ CuILU0_OPM_Impl<M, X, Y, l>::apply(X& v, const Y& d)
                    numOfRowsInLevel,
                    m_gpuMatrixReorderedDiag.value().data(),
                    v.data(),
-                    m_applyThreadBlockSize);
+                    m_upperSolveThreadBlockSize);
            } else {
                detail::ILU0::solveUpperLevelSet<field_type, blocksize_>(
                    m_gpuReorderedLU->getNonZeroValues().data(),
@ -155,9 +163,13 @@ CuILU0_OPM_Impl<M, X, Y, l>::apply(X& v, const Y& d)
                    levelStartIdx,
                    numOfRowsInLevel,
                    v.data(),
-                    m_applyThreadBlockSize);
+                    m_upperSolveThreadBlockSize);
            }
        }
+        // cudaDeviceSynchronize();
+        // auto end = std::chrono::high_resolution_clock::now();
+        // auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
+        // printf("Apply duration %ldus\n", duration);
    }
 }

@ -180,8 +192,16 @@ CuILU0_OPM_Impl<M, X, Y, l>::update()
 {
    OPM_TIMEBLOCK(prec_update);
    {
+        // cudaDeviceSynchronize();
+        // auto start = std::chrono::high_resolution_clock::now();
+
        m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
        computeDiagAndMoveReorderedData();
+
+        // cudaDeviceSynchronize();
+        // auto end = std::chrono::high_resolution_clock::now();
+        // auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
+        // printf("Update duration %ldus\n", duration);
    }
 }

@ -203,7 +223,7 @@ CuILU0_OPM_Impl<M, X, Y, l>::computeDiagAndMoveReorderedData()
                m_gpuMatrixReorderedDiag.value().data(),
                m_gpuNaturalToReorder.data(),
                m_gpuMatrixReorderedLower->N(),
-                m_updateThreadBlockSize);
+                m_moveThreadBlockSize);
        } else {
            detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(),
                                                                    m_gpuMatrix.getRowIndices().data(),
@ -211,7 +231,7 @@ CuILU0_OPM_Impl<M, X, Y, l>::computeDiagAndMoveReorderedData()
                                                                    m_gpuReorderedLU->getRowIndices().data(),
                                                                    m_gpuNaturalToReorder.data(),
                                                                    m_gpuReorderedLU->N(),
-                                                                    m_updateThreadBlockSize);
+                                                                    m_moveThreadBlockSize);
        }
        int levelStartIdx = 0;
        for (int level = 0; level < m_levelSets.size(); ++level) {
@ -230,7 +250,7 @@ CuILU0_OPM_Impl<M, X, Y, l>::computeDiagAndMoveReorderedData()
                    m_gpuNaturalToReorder.data(),
                    levelStartIdx,
                    numOfRowsInLevel,
-                    m_updateThreadBlockSize);
+                    m_LUThreadBlockSize);

            } else {
                detail::ILU0::LUFactorization<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
@ -240,7 +260,7 @@ CuILU0_OPM_Impl<M, X, Y, l>::computeDiagAndMoveReorderedData()
                                                                        m_gpuReorderToNatural.data(),
                                                                        numOfRowsInLevel,
                                                                        levelStartIdx,
-                                                                        m_updateThreadBlockSize);
+                                                                        m_LUThreadBlockSize);
            }
            levelStartIdx += numOfRowsInLevel;
        }
@ -251,52 +271,20 @@ template <class M, class X, class Y, int l>
 void
 CuILU0_OPM_Impl<M, X, Y, l>::tuneThreadBlockSizes()
 {
-    //TODO generalize this tuning process in a function separate of the class
-    long long bestApplyTime = __LONG_LONG_MAX__;
-    long long bestUpdateTime = __LONG_LONG_MAX__;
-    int bestApplyBlockSize = -1;
-    int bestUpdateBlockSize = -1;
-    int interval = 64;

-    //temporary buffers for the apply
+    using CuDILUType = std::remove_reference_t<decltype(*this)>;
+    auto updateFunc = std::bind(&CuDILUType::update, this);
+    auto applyFunc = std::bind(&CuDILUType::apply, this, std::placeholders::_1, std::placeholders::_1);
+
+    detail::tuneThreadBlockSize(updateFunc, m_moveThreadBlockSize);
+    detail::tuneThreadBlockSize(updateFunc, m_LUThreadBlockSize);
+
    CuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
    CuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
    tmpD = 1;

-    for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval){
-        // sometimes the first kernel launch kan be slower, so take the time twice
-        for (int i = 0; i < 2; ++i){
-
-            auto beforeUpdate = std::chrono::high_resolution_clock::now();
-            m_updateThreadBlockSize = thrBlockSize;
-            update();
-            std::ignore = cudaDeviceSynchronize();
-            auto afterUpdate = std::chrono::high_resolution_clock::now();
-            if (cudaSuccess == cudaGetLastError()){ // kernel launch was valid
-                long long durationInMicroSec = std::chrono::duration_cast<std::chrono::microseconds>(afterUpdate - beforeUpdate).count();
-                if (durationInMicroSec < bestUpdateTime){
-                    bestUpdateTime = durationInMicroSec;
-                    bestUpdateBlockSize = thrBlockSize;
-                }
-            }
-
-            auto beforeApply = std::chrono::high_resolution_clock::now();
-            m_applyThreadBlockSize = thrBlockSize;
-            apply(tmpV, tmpD);
-            std::ignore = cudaDeviceSynchronize();
-            auto afterApply = std::chrono::high_resolution_clock::now();
-            if (cudaSuccess == cudaGetLastError()){ // kernel launch was valid
-                long long durationInMicroSec = std::chrono::duration_cast<std::chrono::microseconds>(afterApply - beforeApply).count();
-                if (durationInMicroSec < bestApplyTime){
-                    bestApplyTime = durationInMicroSec;
-                    bestApplyBlockSize = thrBlockSize;
-                }
-            }
-        }
-    }
-
-    m_applyThreadBlockSize = bestApplyBlockSize;
-    m_updateThreadBlockSize = bestUpdateBlockSize;
+    detail::tuneThreadBlockSize(applyFunc, m_lowerSolveThreadBlockSize, tmpV, tmpD);
+    detail::tuneThreadBlockSize(applyFunc, m_upperSolveThreadBlockSize, tmpV, tmpD);
 }

 } // namespace Opm::cuistl
--- a/opm/simulators/linalg/cuistl/CuILU0_OPM_Impl.hpp
+++ b/opm/simulators/linalg/cuistl/CuILU0_OPM_Impl.hpp
@ -135,8 +135,10 @@ private:
    bool m_tuneThreadBlockSizes;
    //! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards
    //! The default value of -1 indicates that we have not calibrated and selected a value yet
-    int m_applyThreadBlockSize = -1;
-    int m_updateThreadBlockSize = -1;
+    int m_upperSolveThreadBlockSize = -1;
+    int m_lowerSolveThreadBlockSize = -1;
+    int m_moveThreadBlockSize = -1;
+    int m_LUThreadBlockSize = -1;
 };
 } // end namespace Opm::cuistl

--- a/opm/simulators/linalg/cuistl/CuSeqILU0.cpp
+++ b/opm/simulators/linalg/cuistl/CuSeqILU0.cpp
@ -33,6 +33,8 @@
 #include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
 #include <opm/simulators/linalg/matrixblock.hh>

+#include <chrono>
+
 // This file is based on the guide at https://docs.nvidia.com/cuda/cusparse/index.html#csrilu02_solve ,
 // it highly recommended to read that before proceeding.

@ -78,6 +80,9 @@ template <class M, class X, class Y, int l>
 void
 CuSeqILU0<M, X, Y, l>::apply(X& v, const Y& d)
 {
+    cudaDeviceSynchronize();
+        auto start = std::chrono::high_resolution_clock::now();
+
    // We need to pass the solve routine a scalar to multiply.
    // In our case this scalar is 1.0
    const field_type one = 1.0;
@ -128,6 +133,10 @@ CuSeqILU0<M, X, Y, l>::apply(X& v, const Y& d)


    v *= m_w;
+    cudaDeviceSynchronize();
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
+    printf("Apply duration %ldus\n", duration);
 }

 template <class M, class X, class Y, int l>
@ -147,8 +156,16 @@ template <class M, class X, class Y, int l>
 void
 CuSeqILU0<M, X, Y, l>::update()
 {
+    cudaDeviceSynchronize();
+        auto start = std::chrono::high_resolution_clock::now();
+
    m_LU.updateNonzeroValues(detail::makeMatrixWithNonzeroDiagonal(m_underlyingMatrix));
    createILU();
+
+    cudaDeviceSynchronize();
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
+    printf("Update duration %ldus\n", duration);
 }

 template <class M, class X, class Y, int l>
--- a/opm/simulators/linalg/cuistl/detail/autotuner.hpp
+++ b/opm/simulators/linalg/cuistl/detail/autotuner.hpp
@ -0,0 +1,91 @@
+/*
+  Copyright 2024 SINTEF AS
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef OPM_AUTOTUNER_HPP
+#define OPM_AUTOTUNER_HPP
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <opm/common/ErrorMacros.hpp>
+#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
+#include <functional>
+#include <utility>
+#include <limits>
+
+namespace Opm::cuistl::detail
+{
+
+    /// @brief Function that tests the best thread block size, assumes updating the reference will affect runtimes
+    /// @tparam func function to tune
+    /// @tparam ...Args types of the arguments needed to call the function
+    /// @param f the function to tune
+    /// @param threadBlockSize reference to the thread block size that will affect kernel executions
+    /// @param ...args arguments needed by the function
+    template <typename func, typename... Args>
+    void tuneThreadBlockSize(func f, int& threadBlockSize, Args&&... args) {
+
+        // decide on a number of calibration runs and allocate space for the events
+        constexpr const int runs = 2;
+        cudaEvent_t events[runs+1];
+
+        // create the events
+        for (int i = 0; i < runs + 1; ++i){
+            OPM_CUDA_SAFE_CALL(cudaEventCreate(&events[i]));
+        }
+
+        // Initialize helper variables
+        float bestTime = std::numeric_limits<float>::max();
+        int bestBlockSize = -1;
+        int interval = 64;
+
+        // try each possible blocksize
+        for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval){
+            // update the blocksize
+            threadBlockSize = thrBlockSize;
+
+            // record a first event, and then an event after each kernel
+            OPM_CUDA_SAFE_CALL(cudaEventRecord(events[0]));
+            for (int i = 0; i < runs; ++i){
+                f(std::forward<Args>(args)...); // runs an arbitrary function with the provided arguments
+                OPM_CUDA_SAFE_CALL(cudaEventRecord(events[i+1]));
+            }
+
+            // make suret he runs are over
+            OPM_CUDA_SAFE_CALL(cudaEventSynchronize(events[runs]));
+
+            // kernel launch was valid
+            if (cudaSuccess == cudaGetLastError()){
+                // check if we beat the record for the fastest kernel
+                for (int i = 0; i < runs; ++i){
+                    float candidateBlockSizeTime;
+                    OPM_CUDA_SAFE_CALL(cudaEventElapsedTime(&candidateBlockSizeTime, events[i], events[i+1]));
+                    if (candidateBlockSizeTime < bestTime){
+                        bestTime = candidateBlockSizeTime;
+                        bestBlockSize = thrBlockSize;
+                    }
+                }
+            }
+        }
+        printf("best size: %d, best time %f\n", bestBlockSize, bestTime);
+
+        threadBlockSize = bestBlockSize;
+    }
+
+} // end namespace Opm::cuistl::detail
+
+#endif
--- a/opm/simulators/linalg/cuistl/detail/coloringAndReorderingUtils.hpp
+++ b/opm/simulators/linalg/cuistl/detail/coloringAndReorderingUtils.hpp
@ -41,7 +41,7 @@ extracting sparsity structures from dune matrices and creating cusparsematrix in
 namespace Opm::cuistl::detail
 {
    inline std::vector<int>
-    createReorderedToNatural(Opm::SparseTable<size_t> levelSets)
+    createReorderedToNatural(Opm::SparseTable<size_t>& levelSets)
    {
        auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
        int globCnt = 0;
@ -56,7 +56,7 @@ namespace Opm::cuistl::detail
    }

    inline std::vector<int>
-    createNaturalToReordered(Opm::SparseTable<size_t> levelSets)
+    createNaturalToReordered(Opm::SparseTable<size_t>& levelSets)
    {
        auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
        int globCnt = 0;
@ -73,7 +73,7 @@ namespace Opm::cuistl::detail
    template <class M, class field_type, class GPUM>
    inline void
    createReorderedMatrix(const M& naturalMatrix,
-                        std::vector<int> reorderedToNatural,
+                        std::vector<int>& reorderedToNatural,
                        std::unique_ptr<GPUM>& reorderedGpuMat)
    {
        M reorderedMatrix(naturalMatrix.N(), naturalMatrix.N(), naturalMatrix.nonzeroes(), M::row_wise);
@ -91,7 +91,7 @@ namespace Opm::cuistl::detail
    template <class M, class field_type, class GPUM>
    inline void
    extractLowerAndUpperMatrices(const M& naturalMatrix,
-                                std::vector<int> reorderedToNatural,
+                                std::vector<int>& reorderedToNatural,
                                std::unique_ptr<GPUM>& lower,
                                std::unique_ptr<GPUM>& upper)
    {