Limit the scope of hybrid_aware throughput setting (#14054)

* fix the custom nstreams setting not working due to the logic change in ApplyPerformanceHints

* add the same limitation for latency mode

* change the core-binding method to bind full physical cores first, then logical cores

* update custom setting streams and threads

* fix corner case when threads_per_stream is 1

* fix with comments

* add a condition for the case where the number of big-core streams is zero when binding cores

Co-authored-by: Shen, Wanglei <wanglei.shen@intel.com>
Co-authored-by: Chen Peter <peter.chen@intel.com>
This commit is contained in:
Sun Xiaoxia 2022-11-19 07:26:43 +08:00 committed by GitHub
parent 96ad308380
commit 9d8a03f90c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 68 additions and 18 deletions

View File

@ -100,7 +100,10 @@ struct CPUStreamsExecutor::Impl {
const auto total_streams = _impl->total_streams_on_core_types.back().second;
const auto big_core_streams = _impl->total_streams_on_core_types.front().second;
const auto hybrid_core = _impl->total_streams_on_core_types.size() > 1;
const auto phy_core_streams = big_core_streams / 2;
const auto phy_core_streams =
_impl->_config._big_core_streams == 0
? 0
: _impl->num_big_core_phys / _impl->_config._threads_per_stream_big;
const auto streamId_wrapped = _streamId % total_streams;
const auto& selected_core_type =
std::find_if(
@ -246,7 +249,7 @@ struct CPUStreamsExecutor::Impl {
if (ThreadBindingType::HYBRID_AWARE == config._threadBindingType) {
const auto core_types = custom::info::core_types();
const auto num_core_phys = getNumberOfCPUCores();
const auto num_big_core_phys = getNumberOfCPUCores(true);
num_big_core_phys = getNumberOfCPUCores(true);
const auto num_small_core_phys = num_core_phys - num_big_core_phys;
int sum = 0;
// reversed order, so BIG cores are first
@ -350,6 +353,7 @@ struct CPUStreamsExecutor::Impl {
// (so mapping is actually just an upper_bound: core type is deduced from the entry for which the id < #streams)
using StreamIdToCoreTypes = std::vector<std::pair<custom::core_type_id, int>>;
StreamIdToCoreTypes total_streams_on_core_types;
int num_big_core_phys;
#endif
ExecutorManager::Ptr _exectorMgr;
};

View File

@ -344,22 +344,57 @@ void IStreamsExecutor::Config::UpdateHybridCustomThreads(Config& config) {
const auto streams = config._streams > 0 ? config._streams : 1;
config._small_core_offset = num_big_cores;
const int threads_per_stream = std::max(1, threads / streams);
const int threads_per_stream_big = std::min(num_big_cores_phys, threads_per_stream);
const int threads_per_stream_small = std::min(num_small_cores_phys, threads_per_stream);
const int base_big_streams = num_cores > num_cores_phys
? (num_big_cores_phys + threads_per_stream_big - 1) / threads_per_stream_big * 2
: (num_big_cores_phys + threads_per_stream_big - 1) / threads_per_stream_big;
const int base_small_streams = num_small_cores_phys > 0 ? num_small_cores_phys / threads_per_stream_small : 0;
const int base_streams = base_big_streams + base_small_streams;
// big_streams = all_streams * base_big_streams / base_streams
config._big_core_streams = (streams * base_big_streams + base_streams - 1) / base_streams;
config._small_core_streams = config._streams - config._big_core_streams;
// _big_core_streams > 2, num_big_cores_phys must be divisible by threads_per_stream_big
config._threads_per_stream_big = (config._big_core_streams > 2 && num_big_cores_phys % threads_per_stream_big != 0)
? std::min(num_big_cores_phys, num_big_cores / base_big_streams)
: threads_per_stream_big;
config._threads_per_stream_small = config._small_core_streams > 0 ? threads_per_stream_small : 0;
int threads_per_stream = std::max(1, threads / streams);
if ((num_big_cores_phys / threads_per_stream >= streams) && (1 < threads_per_stream)) {
config._big_core_streams = streams;
config._threads_per_stream_big = threads_per_stream;
config._small_core_streams = 0;
config._threads_per_stream_small = 0;
} else if ((num_small_cores_phys / threads_per_stream >= streams) && (num_big_cores_phys < threads_per_stream)) {
config._big_core_streams = 0;
config._threads_per_stream_big = 0;
config._small_core_streams = streams;
config._threads_per_stream_small = threads_per_stream;
} else {
const int threads_per_stream_big = std::min(num_big_cores_phys, threads_per_stream);
const int threads_per_stream_small = std::min(num_small_cores_phys, threads_per_stream);
threads_per_stream = std::min(threads_per_stream_big, threads_per_stream_small);
while (threads_per_stream > 1) {
const int base_big_streams = num_big_cores_phys / threads_per_stream;
const int base_small_streams = num_small_cores_phys > 0 ? num_small_cores_phys / threads_per_stream : 0;
if (base_big_streams + base_small_streams >= streams) {
config._big_core_streams = base_big_streams;
config._small_core_streams = streams - base_big_streams;
break;
} else if (base_big_streams * 2 + base_small_streams >= streams) {
config._big_core_streams = streams - base_small_streams;
config._small_core_streams = base_small_streams;
break;
} else {
threads_per_stream = threads_per_stream > 1 ? threads_per_stream - 1 : 1;
}
}
if (threads_per_stream == 1) {
const int stream_loops = streams / num_cores;
const int remain_streams = streams - stream_loops * num_cores;
if (num_big_cores_phys >= remain_streams) {
config._big_core_streams = remain_streams + num_big_cores * stream_loops;
config._small_core_streams = num_small_cores_phys * stream_loops;
} else if (num_big_cores_phys + num_small_cores_phys >= remain_streams) {
config._big_core_streams = num_big_cores_phys + num_big_cores * stream_loops;
config._small_core_streams = remain_streams - num_big_cores_phys + num_small_cores_phys * stream_loops;
} else {
config._big_core_streams = remain_streams - num_small_cores_phys + num_big_cores * stream_loops;
config._small_core_streams = num_small_cores_phys * (stream_loops + 1);
}
}
config._threads_per_stream_big = threads_per_stream;
config._threads_per_stream_small = threads_per_stream;
}
}
IStreamsExecutor::Config IStreamsExecutor::Config::MakeDefaultMultiThreaded(const IStreamsExecutor::Config& initial,

View File

@ -803,13 +803,24 @@ void Engine::ApplyPerformanceHints(std::map<std::string, std::string> &config, c
hints_props.insert({tput_name, tput_hints.second});
ngraphFunc->set_rt_info(hints_props, "intel_cpu_hints_config");
auto resetHybridParam = [&]() {
config[CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS)] = std::to_string(0);
config[CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS)] = std::to_string(0);
config[CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG)] = std::to_string(0);
config[CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL)] = std::to_string(0);
config[CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET)] = std::to_string(0);
};
const auto perf_hint_name = getPerfHintName();
if (perf_hint_name == CONFIG_VALUE(LATENCY)) {
config[CONFIG_KEY(CPU_THROUGHPUT_STREAMS)] = latency_hints.first;
config[ov::num_streams.name()] = latency_hints.second;
resetHybridParam();
} else if (perf_hint_name == CONFIG_VALUE(THROUGHPUT)) {
config[CONFIG_KEY(CPU_THROUGHPUT_STREAMS)] = tput_hints.first;
config[ov::num_streams.name()] = tput_hints.first;
} else {
resetHybridParam();
}
}