diff --git a/src/inference/src/threading/ie_cpu_streams_executor.cpp b/src/inference/src/threading/ie_cpu_streams_executor.cpp index 4c302f66895..23fcb15fa76 100644 --- a/src/inference/src/threading/ie_cpu_streams_executor.cpp +++ b/src/inference/src/threading/ie_cpu_streams_executor.cpp @@ -100,7 +100,10 @@ struct CPUStreamsExecutor::Impl { const auto total_streams = _impl->total_streams_on_core_types.back().second; const auto big_core_streams = _impl->total_streams_on_core_types.front().second; const auto hybrid_core = _impl->total_streams_on_core_types.size() > 1; - const auto phy_core_streams = big_core_streams / 2; + const auto phy_core_streams = + _impl->_config._big_core_streams == 0 + ? 0 + : _impl->num_big_core_phys / _impl->_config._threads_per_stream_big; const auto streamId_wrapped = _streamId % total_streams; const auto& selected_core_type = std::find_if( @@ -246,7 +249,7 @@ struct CPUStreamsExecutor::Impl { if (ThreadBindingType::HYBRID_AWARE == config._threadBindingType) { const auto core_types = custom::info::core_types(); const auto num_core_phys = getNumberOfCPUCores(); - const auto num_big_core_phys = getNumberOfCPUCores(true); + num_big_core_phys = getNumberOfCPUCores(true); const auto num_small_core_phys = num_core_phys - num_big_core_phys; int sum = 0; // reversed order, so BIG cores are first @@ -350,6 +353,7 @@ struct CPUStreamsExecutor::Impl { // (so mapping is actually just an upper_bound: core type is deduced from the entry for which the id < #streams) using StreamIdToCoreTypes = std::vector>; StreamIdToCoreTypes total_streams_on_core_types; + int num_big_core_phys; #endif ExecutorManager::Ptr _exectorMgr; }; diff --git a/src/inference/src/threading/ie_istreams_executor.cpp b/src/inference/src/threading/ie_istreams_executor.cpp index 1e65c2b58ba..8af9b7d5a09 100644 --- a/src/inference/src/threading/ie_istreams_executor.cpp +++ b/src/inference/src/threading/ie_istreams_executor.cpp @@ -344,22 +344,57 @@ void IStreamsExecutor::Config::UpdateHybridCustomThreads(Config& config) { const auto streams = config._streams > 0 ? config._streams : 1; config._small_core_offset = num_big_cores; - const int threads_per_stream = std::max(1, threads / streams); - const int threads_per_stream_big = std::min(num_big_cores_phys, threads_per_stream); - const int threads_per_stream_small = std::min(num_small_cores_phys, threads_per_stream); - const int base_big_streams = num_cores > num_cores_phys - ? (num_big_cores_phys + threads_per_stream_big - 1) / threads_per_stream_big * 2 - : (num_big_cores_phys + threads_per_stream_big - 1) / threads_per_stream_big; - const int base_small_streams = num_small_cores_phys > 0 ? num_small_cores_phys / threads_per_stream_small : 0; - const int base_streams = base_big_streams + base_small_streams; - // big_streams = all_streams * base_big_streams / base_streams - config._big_core_streams = (streams * base_big_streams + base_streams - 1) / base_streams; - config._small_core_streams = config._streams - config._big_core_streams; - // _big_core_streams > 2, num_big_cores_phys must be divisible by threads_per_stream_big - config._threads_per_stream_big = (config._big_core_streams > 2 && num_big_cores_phys % threads_per_stream_big != 0) - ? std::min(num_big_cores_phys, num_big_cores / base_big_streams) - : threads_per_stream_big; - config._threads_per_stream_small = config._small_core_streams > 0 ? threads_per_stream_small : 0; + int threads_per_stream = std::max(1, threads / streams); + + if ((num_big_cores_phys / threads_per_stream >= streams) && (1 < threads_per_stream)) { + config._big_core_streams = streams; + config._threads_per_stream_big = threads_per_stream; + config._small_core_streams = 0; + config._threads_per_stream_small = 0; + } else if ((num_small_cores_phys / threads_per_stream >= streams) && (num_big_cores_phys < threads_per_stream)) { + config._big_core_streams = 0; + config._threads_per_stream_big = 0; + config._small_core_streams = streams; + config._threads_per_stream_small = threads_per_stream; + } else { + const int threads_per_stream_big = std::min(num_big_cores_phys, threads_per_stream); + const int threads_per_stream_small = std::min(num_small_cores_phys, threads_per_stream); + + threads_per_stream = std::min(threads_per_stream_big, threads_per_stream_small); + while (threads_per_stream > 1) { + const int base_big_streams = num_big_cores_phys / threads_per_stream; + const int base_small_streams = num_small_cores_phys > 0 ? num_small_cores_phys / threads_per_stream : 0; + if (base_big_streams + base_small_streams >= streams) { + config._big_core_streams = base_big_streams; + config._small_core_streams = streams - base_big_streams; + break; + } else if (base_big_streams * 2 + base_small_streams >= streams) { + config._big_core_streams = streams - base_small_streams; + config._small_core_streams = base_small_streams; + break; + } else { + threads_per_stream = threads_per_stream > 1 ? threads_per_stream - 1 : 1; + } + } + + if (threads_per_stream == 1) { + const int stream_loops = streams / num_cores; + const int remain_streams = streams - stream_loops * num_cores; + if (num_big_cores_phys >= remain_streams) { + config._big_core_streams = remain_streams + num_big_cores * stream_loops; + config._small_core_streams = num_small_cores_phys * stream_loops; + } else if (num_big_cores_phys + num_small_cores_phys >= remain_streams) { + config._big_core_streams = num_big_cores_phys + num_big_cores * stream_loops; + config._small_core_streams = remain_streams - num_big_cores_phys + num_small_cores_phys * stream_loops; + } else { + config._big_core_streams = remain_streams - num_small_cores_phys + num_big_cores * stream_loops; + config._small_core_streams = num_small_cores_phys * (stream_loops + 1); + } + } + + config._threads_per_stream_big = threads_per_stream; + config._threads_per_stream_small = threads_per_stream; + } } IStreamsExecutor::Config IStreamsExecutor::Config::MakeDefaultMultiThreaded(const IStreamsExecutor::Config& initial, diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index 49e34e0c839..117a4e566de 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -803,13 +803,24 @@ void Engine::ApplyPerformanceHints(std::map &config, c hints_props.insert({tput_name, tput_hints.second}); ngraphFunc->set_rt_info(hints_props, "intel_cpu_hints_config"); + auto resetHybridParam = [&]() { + config[CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS)] = std::to_string(0); + config[CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS)] = std::to_string(0); + config[CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG)] = std::to_string(0); + config[CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL)] = std::to_string(0); + config[CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET)] = std::to_string(0); + }; + const auto perf_hint_name = getPerfHintName(); if (perf_hint_name == CONFIG_VALUE(LATENCY)) { config[CONFIG_KEY(CPU_THROUGHPUT_STREAMS)] = latency_hints.first; config[ov::num_streams.name()] = latency_hints.second; + resetHybridParam(); } else if (perf_hint_name == CONFIG_VALUE(THROUGHPUT)) { config[CONFIG_KEY(CPU_THROUGHPUT_STREAMS)] = tput_hints.first; config[ov::num_streams.name()] = tput_hints.first; + } else { + resetHybridParam(); } }