clang format autotuner

2025-02-25 18:55:30 -06:00 · 2024-08-22 10:23:36 +02:00 · 2024-08-22 10:23:36 +02:00 · 45f6116d5e
commit 45f6116d5e
parent dcdce71d4b
1 changed files with 54 additions and 51 deletions
--- a/opm/simulators/linalg/cuistl/detail/autotuner.hpp
+++ b/opm/simulators/linalg/cuistl/detail/autotuner.hpp
@@ -17,74 +17,77 @@

 #include <cuda.h>
 #include <cuda_runtime.h>
-#include <opm/common/ErrorMacros.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
-#include <opm/common/OpmLog/OpmLog.hpp>
 #include <functional>
-#include <utility>
 #include <limits>
+#include <opm/common/ErrorMacros.hpp>
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
 #include <string>
+#include <utility>

 namespace Opm::cuistl::detail
 {

-    /// @brief Function that tests the best thread block size, assumes the provided function depends on threadblock-size
-    /// @tparam The type of the function to tune
-    /// @param f the function to tune, which takes the thread block size as the input
-    template <typename func>
-    int tuneThreadBlockSize(func& f, std::string descriptionOfFunction) {
-        // This threadblock-tuner is very simple, it tests all valid block sizes divisble by 64
-        // 64 is chosen so it is a multiple of the AMD wavefront size.
-        // The maximum size of a threadblock is 1024, so an exhaustive search here will not be expensive
-        // We time the kernel with each possible threadblock-size, and return the one
-        // that gave the fastest invidivual run.
+/// @brief Function that tests the best thread block size, assumes the provided function depends on threadblock-size
+/// @tparam func The type of the function to tune
+/// @param f the function to tune, which takes the thread block size as the input
+template <typename func>
+int
+tuneThreadBlockSize(func& f, std::string descriptionOfFunction)
+{
+    // This threadblock-tuner is very simple, it tests all valid block sizes divisible by 64
+    // 64 is chosen so it is a multiple of the AMD wavefront size.
+    // The maximum size of a threadblock is 1024, so an exhaustive search here will not be expensive
+    // We time the kernel with each possible threadblock-size, and return the one
+    // that gave the fastest individual run.

-        // TODO: figure out a more rigorous way of deciding how many runs will suffice?
-        constexpr const int runs = 2;
-        cudaEvent_t events[runs+1];
+    // TODO: figure out a more rigorous way of deciding how many runs will suffice?
+    constexpr const int runs = 2;
+    cudaEvent_t events[runs + 1];

-        // create the events
-        for (int i = 0; i < runs + 1; ++i){
-            OPM_CUDA_SAFE_CALL(cudaEventCreate(&events[i]));
+    // create the events
+    for (int i = 0; i < runs + 1; ++i) {
+        OPM_CUDA_SAFE_CALL(cudaEventCreate(&events[i]));
+    }
+
+    // Initialize helper variables
+    float bestTime = std::numeric_limits<float>::max();
+    int bestBlockSize = -1;
+    int interval = 64;
+
+    // try each possible blocksize
+    for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval) {
+
+        // record a first event, and then an event after each kernel
+        OPM_CUDA_SAFE_CALL(cudaEventRecord(events[0]));
+        for (int i = 0; i < runs; ++i) {
+            f(thrBlockSize); // runs an arbitrary function with the provided arguments
+            OPM_CUDA_SAFE_CALL(cudaEventRecord(events[i + 1]));
        }

-        // Initialize helper variables
-        float bestTime = std::numeric_limits<float>::max();
-        int bestBlockSize = -1;
-        int interval = 64;
+        // make sure the runs are over
+        OPM_CUDA_SAFE_CALL(cudaEventSynchronize(events[runs]));

-        // try each possible blocksize
-        for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval){
-
-            // record a first event, and then an event after each kernel
-            OPM_CUDA_SAFE_CALL(cudaEventRecord(events[0]));
-            for (int i = 0; i < runs; ++i){
-                f(thrBlockSize); // runs an arbitrary function with the provided arguments
-                OPM_CUDA_SAFE_CALL(cudaEventRecord(events[i+1]));
-            }
-
-            // make suret he runs are over
-            OPM_CUDA_SAFE_CALL(cudaEventSynchronize(events[runs]));
-
-            // kernel launch was valid
-            if (cudaSuccess == cudaGetLastError()){
-                // check if we beat the record for the fastest kernel
-                for (int i = 0; i < runs; ++i){
-                    float candidateBlockSizeTime;
-                    OPM_CUDA_SAFE_CALL(cudaEventElapsedTime(&candidateBlockSizeTime, events[i], events[i+1]));
-                    if (candidateBlockSizeTime < bestTime){ // checks if this configuration beat the current best
-                        bestTime = candidateBlockSizeTime;
-                        bestBlockSize = thrBlockSize;
-                    }
+        // kernel launch was valid
+        if (cudaSuccess == cudaGetLastError()) {
+            // check if we beat the record for the fastest kernel
+            for (int i = 0; i < runs; ++i) {
+                float candidateBlockSizeTime;
+                OPM_CUDA_SAFE_CALL(cudaEventElapsedTime(&candidateBlockSizeTime, events[i], events[i + 1]));
+                if (candidateBlockSizeTime < bestTime) { // checks if this configuration beat the current best
+                    bestTime = candidateBlockSizeTime;
+                    bestBlockSize = thrBlockSize;
                }
            }
        }
-
-        OpmLog::info(fmt::format("{}: Tuned Blocksize: {} (fastest runtime: {}).", descriptionOfFunction, bestBlockSize, bestTime));
-
-        return bestBlockSize;
    }

+    OpmLog::info(
+        fmt::format("{}: Tuned Blocksize: {} (fastest runtime: {}).", descriptionOfFunction, bestBlockSize, bestTime));
+
+    return bestBlockSize;
+}
+
 } // end namespace Opm::cuistl::detail

 #endif