Limit the scope of hybrid_aware throughput setting (#14054)
* fix custom nstreams setting not taking effect due to the logic change in ApplyPerformanceHints * add the same limitation for latency mode * change the core binding method to bind full physical cores first, then logical cores * update custom setting of streams and threads * fix corner case when threads_per_stream is 1 * fix with comments * add a condition for the case where the number of big core streams is zero when binding cores Co-authored-by: Shen, Wanglei <wanglei.shen@intel.com> Co-authored-by: Chen Peter <peter.chen@intel.com>
This commit is contained in:
parent
96ad308380
commit
9d8a03f90c
@ -100,7 +100,10 @@ struct CPUStreamsExecutor::Impl {
|
|||||||
const auto total_streams = _impl->total_streams_on_core_types.back().second;
|
const auto total_streams = _impl->total_streams_on_core_types.back().second;
|
||||||
const auto big_core_streams = _impl->total_streams_on_core_types.front().second;
|
const auto big_core_streams = _impl->total_streams_on_core_types.front().second;
|
||||||
const auto hybrid_core = _impl->total_streams_on_core_types.size() > 1;
|
const auto hybrid_core = _impl->total_streams_on_core_types.size() > 1;
|
||||||
const auto phy_core_streams = big_core_streams / 2;
|
const auto phy_core_streams =
|
||||||
|
_impl->_config._big_core_streams == 0
|
||||||
|
? 0
|
||||||
|
: _impl->num_big_core_phys / _impl->_config._threads_per_stream_big;
|
||||||
const auto streamId_wrapped = _streamId % total_streams;
|
const auto streamId_wrapped = _streamId % total_streams;
|
||||||
const auto& selected_core_type =
|
const auto& selected_core_type =
|
||||||
std::find_if(
|
std::find_if(
|
||||||
@ -246,7 +249,7 @@ struct CPUStreamsExecutor::Impl {
|
|||||||
if (ThreadBindingType::HYBRID_AWARE == config._threadBindingType) {
|
if (ThreadBindingType::HYBRID_AWARE == config._threadBindingType) {
|
||||||
const auto core_types = custom::info::core_types();
|
const auto core_types = custom::info::core_types();
|
||||||
const auto num_core_phys = getNumberOfCPUCores();
|
const auto num_core_phys = getNumberOfCPUCores();
|
||||||
const auto num_big_core_phys = getNumberOfCPUCores(true);
|
num_big_core_phys = getNumberOfCPUCores(true);
|
||||||
const auto num_small_core_phys = num_core_phys - num_big_core_phys;
|
const auto num_small_core_phys = num_core_phys - num_big_core_phys;
|
||||||
int sum = 0;
|
int sum = 0;
|
||||||
// reversed order, so BIG cores are first
|
// reversed order, so BIG cores are first
|
||||||
@ -350,6 +353,7 @@ struct CPUStreamsExecutor::Impl {
|
|||||||
// (so mapping is actually just an upper_bound: core type is deduced from the entry for which the id < #streams)
|
// (so mapping is actually just an upper_bound: core type is deduced from the entry for which the id < #streams)
|
||||||
using StreamIdToCoreTypes = std::vector<std::pair<custom::core_type_id, int>>;
|
using StreamIdToCoreTypes = std::vector<std::pair<custom::core_type_id, int>>;
|
||||||
StreamIdToCoreTypes total_streams_on_core_types;
|
StreamIdToCoreTypes total_streams_on_core_types;
|
||||||
|
int num_big_core_phys;
|
||||||
#endif
|
#endif
|
||||||
ExecutorManager::Ptr _exectorMgr;
|
ExecutorManager::Ptr _exectorMgr;
|
||||||
};
|
};
|
||||||
|
@ -344,22 +344,57 @@ void IStreamsExecutor::Config::UpdateHybridCustomThreads(Config& config) {
|
|||||||
const auto streams = config._streams > 0 ? config._streams : 1;
|
const auto streams = config._streams > 0 ? config._streams : 1;
|
||||||
|
|
||||||
config._small_core_offset = num_big_cores;
|
config._small_core_offset = num_big_cores;
|
||||||
const int threads_per_stream = std::max(1, threads / streams);
|
int threads_per_stream = std::max(1, threads / streams);
|
||||||
const int threads_per_stream_big = std::min(num_big_cores_phys, threads_per_stream);
|
|
||||||
const int threads_per_stream_small = std::min(num_small_cores_phys, threads_per_stream);
|
if ((num_big_cores_phys / threads_per_stream >= streams) && (1 < threads_per_stream)) {
|
||||||
const int base_big_streams = num_cores > num_cores_phys
|
config._big_core_streams = streams;
|
||||||
? (num_big_cores_phys + threads_per_stream_big - 1) / threads_per_stream_big * 2
|
config._threads_per_stream_big = threads_per_stream;
|
||||||
: (num_big_cores_phys + threads_per_stream_big - 1) / threads_per_stream_big;
|
config._small_core_streams = 0;
|
||||||
const int base_small_streams = num_small_cores_phys > 0 ? num_small_cores_phys / threads_per_stream_small : 0;
|
config._threads_per_stream_small = 0;
|
||||||
const int base_streams = base_big_streams + base_small_streams;
|
} else if ((num_small_cores_phys / threads_per_stream >= streams) && (num_big_cores_phys < threads_per_stream)) {
|
||||||
// big_streams = all_streams * base_big_streams / base_streams
|
config._big_core_streams = 0;
|
||||||
config._big_core_streams = (streams * base_big_streams + base_streams - 1) / base_streams;
|
config._threads_per_stream_big = 0;
|
||||||
config._small_core_streams = config._streams - config._big_core_streams;
|
config._small_core_streams = streams;
|
||||||
// _big_core_streams > 2, num_big_cores_phys must be divisible by threads_per_stream_big
|
config._threads_per_stream_small = threads_per_stream;
|
||||||
config._threads_per_stream_big = (config._big_core_streams > 2 && num_big_cores_phys % threads_per_stream_big != 0)
|
} else {
|
||||||
? std::min(num_big_cores_phys, num_big_cores / base_big_streams)
|
const int threads_per_stream_big = std::min(num_big_cores_phys, threads_per_stream);
|
||||||
: threads_per_stream_big;
|
const int threads_per_stream_small = std::min(num_small_cores_phys, threads_per_stream);
|
||||||
config._threads_per_stream_small = config._small_core_streams > 0 ? threads_per_stream_small : 0;
|
|
||||||
|
threads_per_stream = std::min(threads_per_stream_big, threads_per_stream_small);
|
||||||
|
while (threads_per_stream > 1) {
|
||||||
|
const int base_big_streams = num_big_cores_phys / threads_per_stream;
|
||||||
|
const int base_small_streams = num_small_cores_phys > 0 ? num_small_cores_phys / threads_per_stream : 0;
|
||||||
|
if (base_big_streams + base_small_streams >= streams) {
|
||||||
|
config._big_core_streams = base_big_streams;
|
||||||
|
config._small_core_streams = streams - base_big_streams;
|
||||||
|
break;
|
||||||
|
} else if (base_big_streams * 2 + base_small_streams >= streams) {
|
||||||
|
config._big_core_streams = streams - base_small_streams;
|
||||||
|
config._small_core_streams = base_small_streams;
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
threads_per_stream = threads_per_stream > 1 ? threads_per_stream - 1 : 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (threads_per_stream == 1) {
|
||||||
|
const int stream_loops = streams / num_cores;
|
||||||
|
const int remain_streams = streams - stream_loops * num_cores;
|
||||||
|
if (num_big_cores_phys >= remain_streams) {
|
||||||
|
config._big_core_streams = remain_streams + num_big_cores * stream_loops;
|
||||||
|
config._small_core_streams = num_small_cores_phys * stream_loops;
|
||||||
|
} else if (num_big_cores_phys + num_small_cores_phys >= remain_streams) {
|
||||||
|
config._big_core_streams = num_big_cores_phys + num_big_cores * stream_loops;
|
||||||
|
config._small_core_streams = remain_streams - num_big_cores_phys + num_small_cores_phys * stream_loops;
|
||||||
|
} else {
|
||||||
|
config._big_core_streams = remain_streams - num_small_cores_phys + num_big_cores * stream_loops;
|
||||||
|
config._small_core_streams = num_small_cores_phys * (stream_loops + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
config._threads_per_stream_big = threads_per_stream;
|
||||||
|
config._threads_per_stream_small = threads_per_stream;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
IStreamsExecutor::Config IStreamsExecutor::Config::MakeDefaultMultiThreaded(const IStreamsExecutor::Config& initial,
|
IStreamsExecutor::Config IStreamsExecutor::Config::MakeDefaultMultiThreaded(const IStreamsExecutor::Config& initial,
|
||||||
|
@ -803,13 +803,24 @@ void Engine::ApplyPerformanceHints(std::map<std::string, std::string> &config, c
|
|||||||
hints_props.insert({tput_name, tput_hints.second});
|
hints_props.insert({tput_name, tput_hints.second});
|
||||||
ngraphFunc->set_rt_info(hints_props, "intel_cpu_hints_config");
|
ngraphFunc->set_rt_info(hints_props, "intel_cpu_hints_config");
|
||||||
|
|
||||||
|
auto resetHybridParam = [&]() {
|
||||||
|
config[CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS)] = std::to_string(0);
|
||||||
|
config[CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS)] = std::to_string(0);
|
||||||
|
config[CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG)] = std::to_string(0);
|
||||||
|
config[CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL)] = std::to_string(0);
|
||||||
|
config[CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET)] = std::to_string(0);
|
||||||
|
};
|
||||||
|
|
||||||
const auto perf_hint_name = getPerfHintName();
|
const auto perf_hint_name = getPerfHintName();
|
||||||
if (perf_hint_name == CONFIG_VALUE(LATENCY)) {
|
if (perf_hint_name == CONFIG_VALUE(LATENCY)) {
|
||||||
config[CONFIG_KEY(CPU_THROUGHPUT_STREAMS)] = latency_hints.first;
|
config[CONFIG_KEY(CPU_THROUGHPUT_STREAMS)] = latency_hints.first;
|
||||||
config[ov::num_streams.name()] = latency_hints.second;
|
config[ov::num_streams.name()] = latency_hints.second;
|
||||||
|
resetHybridParam();
|
||||||
} else if (perf_hint_name == CONFIG_VALUE(THROUGHPUT)) {
|
} else if (perf_hint_name == CONFIG_VALUE(THROUGHPUT)) {
|
||||||
config[CONFIG_KEY(CPU_THROUGHPUT_STREAMS)] = tput_hints.first;
|
config[CONFIG_KEY(CPU_THROUGHPUT_STREAMS)] = tput_hints.first;
|
||||||
config[ov::num_streams.name()] = tput_hints.first;
|
config[ov::num_streams.name()] = tput_hints.first;
|
||||||
|
} else {
|
||||||
|
resetHybridParam();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user