Limit the scope of hybrid_aware throughput setting (#14054)

* Fix the custom nstreams setting not taking effect after the logic change in ApplyPerformanceHints
* Add the same limitation for latency mode
* Change the core-binding method to bind full physical cores first, then logical cores
* Update the custom settings for streams and threads
* Fix the corner case where threads_per_stream is 1
* Fix issues raised in review comments
* Add a condition for the big-core stream count being zero when binding cores

Co-authored-by: Shen, Wanglei <wanglei.shen@intel.com>
Co-authored-by: Chen Peter <peter.chen@intel.com>
2022-11-19 07:26:43 +08:00 · 2022-11-19 07:26:43 +08:00 · 9d8a03f90c
commit 9d8a03f90c
parent 96ad308380
3 changed files with 68 additions and 18 deletions
--- a/src/inference/src/threading/ie_cpu_streams_executor.cpp
+++ b/src/inference/src/threading/ie_cpu_streams_executor.cpp
@ -100,7 +100,10 @@ struct CPUStreamsExecutor::Impl {
                    const auto total_streams = _impl->total_streams_on_core_types.back().second;
                    const auto big_core_streams = _impl->total_streams_on_core_types.front().second;
                    const auto hybrid_core = _impl->total_streams_on_core_types.size() > 1;
-                    const auto phy_core_streams = big_core_streams / 2;
+                    const auto phy_core_streams =
                        _impl->_config._big_core_streams == 0
                            ? 0
                            : _impl->num_big_core_phys / _impl->_config._threads_per_stream_big;
                    const auto streamId_wrapped = _streamId % total_streams;
                    const auto& selected_core_type =
                        std::find_if(
@ -246,7 +249,7 @@ struct CPUStreamsExecutor::Impl {
        if (ThreadBindingType::HYBRID_AWARE == config._threadBindingType) {
            const auto core_types = custom::info::core_types();
            const auto num_core_phys = getNumberOfCPUCores();
-            const auto num_big_core_phys = getNumberOfCPUCores(true);
+            num_big_core_phys = getNumberOfCPUCores(true);
            const auto num_small_core_phys = num_core_phys - num_big_core_phys;
            int sum = 0;
            // reversed order, so BIG cores are first
@ -350,6 +353,7 @@ struct CPUStreamsExecutor::Impl {
    // (so mapping is actually just an upper_bound: core type is deduced from the entry for which the id < #streams)
    using StreamIdToCoreTypes = std::vector<std::pair<custom::core_type_id, int>>;
    StreamIdToCoreTypes total_streams_on_core_types;
    int num_big_core_phys;
 #endif
    ExecutorManager::Ptr _exectorMgr;
 };
--- a/src/inference/src/threading/ie_istreams_executor.cpp
+++ b/src/inference/src/threading/ie_istreams_executor.cpp
@ -344,22 +344,57 @@ void IStreamsExecutor::Config::UpdateHybridCustomThreads(Config& config) {
    const auto streams = config._streams > 0 ? config._streams : 1;
    config._small_core_offset = num_big_cores;
-    const int threads_per_stream = std::max(1, threads / streams);
+    int threads_per_stream = std::max(1, threads / streams);
-    const int threads_per_stream_big = std::min(num_big_cores_phys, threads_per_stream);
+
-    const int threads_per_stream_small = std::min(num_small_cores_phys, threads_per_stream);
+    if ((num_big_cores_phys / threads_per_stream >= streams) && (1 < threads_per_stream)) {
-    const int base_big_streams = num_cores > num_cores_phys
+        config._big_core_streams = streams;
-                                     ? (num_big_cores_phys + threads_per_stream_big - 1) / threads_per_stream_big * 2
+        config._threads_per_stream_big = threads_per_stream;
-                                     : (num_big_cores_phys + threads_per_stream_big - 1) / threads_per_stream_big;
+        config._small_core_streams = 0;
-    const int base_small_streams = num_small_cores_phys > 0 ? num_small_cores_phys / threads_per_stream_small : 0;
+        config._threads_per_stream_small = 0;
-    const int base_streams = base_big_streams + base_small_streams;
+    } else if ((num_small_cores_phys / threads_per_stream >= streams) && (num_big_cores_phys < threads_per_stream)) {
-    // big_streams = all_streams * base_big_streams / base_streams
+        config._big_core_streams = 0;
-    config._big_core_streams = (streams * base_big_streams + base_streams - 1) / base_streams;
+        config._threads_per_stream_big = 0;
-    config._small_core_streams = config._streams - config._big_core_streams;
+        config._small_core_streams = streams;
-    // _big_core_streams > 2, num_big_cores_phys must be divisible by threads_per_stream_big
+        config._threads_per_stream_small = threads_per_stream;
-    config._threads_per_stream_big = (config._big_core_streams > 2 && num_big_cores_phys % threads_per_stream_big != 0)
+    } else {
-                                         ? std::min(num_big_cores_phys, num_big_cores / base_big_streams)
+        const int threads_per_stream_big = std::min(num_big_cores_phys, threads_per_stream);
-                                         : threads_per_stream_big;
+        const int threads_per_stream_small = std::min(num_small_cores_phys, threads_per_stream);
-    config._threads_per_stream_small = config._small_core_streams > 0 ? threads_per_stream_small : 0;
+
        threads_per_stream = std::min(threads_per_stream_big, threads_per_stream_small);
        while (threads_per_stream > 1) {
            const int base_big_streams = num_big_cores_phys / threads_per_stream;
            const int base_small_streams = num_small_cores_phys > 0 ? num_small_cores_phys / threads_per_stream : 0;
            if (base_big_streams + base_small_streams >= streams) {
                config._big_core_streams = base_big_streams;
                config._small_core_streams = streams - base_big_streams;
                break;
            } else if (base_big_streams * 2 + base_small_streams >= streams) {
                config._big_core_streams = streams - base_small_streams;
                config._small_core_streams = base_small_streams;
                break;
            } else {
                threads_per_stream = threads_per_stream > 1 ? threads_per_stream - 1 : 1;
            }
        }
        if (threads_per_stream == 1) {
            const int stream_loops = streams / num_cores;
            const int remain_streams = streams - stream_loops * num_cores;
            if (num_big_cores_phys >= remain_streams) {
                config._big_core_streams = remain_streams + num_big_cores * stream_loops;
                config._small_core_streams = num_small_cores_phys * stream_loops;
            } else if (num_big_cores_phys + num_small_cores_phys >= remain_streams) {
                config._big_core_streams = num_big_cores_phys + num_big_cores * stream_loops;
                config._small_core_streams = remain_streams - num_big_cores_phys + num_small_cores_phys * stream_loops;
            } else {
                config._big_core_streams = remain_streams - num_small_cores_phys + num_big_cores * stream_loops;
                config._small_core_streams = num_small_cores_phys * (stream_loops + 1);
            }
        }
        config._threads_per_stream_big = threads_per_stream;
        config._threads_per_stream_small = threads_per_stream;
    }
 }
 IStreamsExecutor::Config IStreamsExecutor::Config::MakeDefaultMultiThreaded(const IStreamsExecutor::Config& initial,
--- a/src/plugins/intel_cpu/src/plugin.cpp
+++ b/src/plugins/intel_cpu/src/plugin.cpp
@ -803,13 +803,24 @@ void Engine::ApplyPerformanceHints(std::map<std::string, std::string> &config, c
    hints_props.insert({tput_name, tput_hints.second});
    ngraphFunc->set_rt_info(hints_props, "intel_cpu_hints_config");
    auto resetHybridParam = [&]() {
        config[CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS)] = std::to_string(0);
        config[CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS)] = std::to_string(0);
        config[CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG)] = std::to_string(0);
        config[CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL)] = std::to_string(0);
        config[CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET)] = std::to_string(0);
    };
    const auto perf_hint_name = getPerfHintName();
    if (perf_hint_name == CONFIG_VALUE(LATENCY)) {
        config[CONFIG_KEY(CPU_THROUGHPUT_STREAMS)] = latency_hints.first;
        config[ov::num_streams.name()] = latency_hints.second;
        resetHybridParam();
    } else if (perf_hint_name == CONFIG_VALUE(THROUGHPUT)) {
        config[CONFIG_KEY(CPU_THROUGHPUT_STREAMS)] = tput_hints.first;
        config[ov::num_streams.name()] = tput_hints.first;
    } else {
        resetHybridParam();
    }
 }