Xiaoxia/Custom setting nstreams with hyper thread enabled on Hybrid-aware P/E-core (#13836)

* add HybridAware stream setting and core binding

* fix clang format issue

* unified code style, add parameter check

* correct input affinity skip NUMA, modify function name

* remove unnecessary floor

* fix ci compile issue on Mac/windows platform

* modify smoke_SetConfigAffinity test

* modify ov_core_set_and_get_property_enum test, affinity HYBRID_AWARE is changed to NUMA

* remove affinity correcting on this pr

* revert ov_core_test.cpp

* merge function by comments

* fix code style issue

* add custom nstreams setting, remove mutable qualifier

* fix code style issue

* fix some issues according to comments

* modify UpdateHybridCustomThreads

* fix code style issue

* modify comments

* use logic core in phase 2

* modify streams threads setting method

* add custom setting nstreams nthreads

* modify comments

* fix code style issue

* fix threads_per_stream_big being wrong on some ADL machines which have 6 P-cores

* add one condition that custom set nthreads but not set nstreams

* remove the limit to nstreams

* remove the feature in branch p_e_core_phase2_logic

* add comments

* rename num_small_cores to num_small_cores_phys

* fix code style issue

* fix thread number is wrong in corner case on 6Pcore and 10Pcore condition

* fix some condition which nstreams<2

* add check for config._threads_per_stream_big

Co-authored-by: Shen, Wanglei <wanglei.shen@intel.com>
This commit is contained in:
Sun Xiaoxia 2022-11-13 01:55:54 +08:00 committed by GitHub
parent bb00a9e664
commit 966548d061
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 67 additions and 31 deletions

View File

@ -84,6 +84,7 @@ public:
static Config MakeDefaultMultiThreaded(const Config& initial, const bool fp_intesive = true);
static int GetDefaultNumStreams(); // no network specifics considered (only CPU's caps);
static int GetHybridNumStreams(std::map<std::string, std::string>& config, const int stream_mode);
// Recomputes the hybrid (big/small core) fields of `config` -- _small_core_offset, _big_core_streams,
// _small_core_streams, _threads_per_stream_big and _threads_per_stream_small -- from its _streams and
// _threads values and the core counts reported by the system.
static void UpdateHybridCustomThreads(Config& config);
std::string _name; //!< Used by `ITT` to name executor threads
int _streams = 1; //!< Number of streams.

View File

@ -96,20 +96,12 @@ struct CPUStreamsExecutor::Impl {
} else {
// assigning the stream to the core type in the round-robin fashion
// wrapping around total_streams (i.e. how many streams all different core types can handle
// together)
// together). Binding priority: Big core, Logical big core, Small core
const auto total_streams = _impl->total_streams_on_core_types.back().second;
const auto big_core_streams = _impl->total_streams_on_core_types.front().second;
const auto hybrid_core = _impl->total_streams_on_core_types.size() > 1;
const auto phy_core_streams = big_core_streams / 2;
const auto streamId_wrapped = _streamId % total_streams;
// This is default setting for specific CPU which Pcore is in front and Ecore is in the back
const auto num_big_cores_phy = _impl->_config._small_core_offset / 2;
const auto use_logic_core =
_impl->_config._threads_per_stream_big * _impl->_config._big_core_streams > num_big_cores_phy;
const auto phy_core_streams = _impl->_config._big_core_streams / 2;
// current stream is placed on logical core
const auto cur_logic_core =
use_logic_core ? (streamId_wrapped >= phy_core_streams ? true : false) : false;
const auto small_core_skip =
_impl->_config._threads_per_stream_small == 3 && _impl->_config._small_core_streams;
const auto& selected_core_type =
std::find_if(
_impl->total_streams_on_core_types.cbegin(),
@ -118,31 +110,30 @@ struct CPUStreamsExecutor::Impl {
return p.second > streamId_wrapped;
})
->first;
const auto small_core = hybrid_core && selected_core_type == 0;
const auto logic_core = !small_core && streamId_wrapped >= phy_core_streams;
const auto small_core_skip = small_core && _impl->_config._threads_per_stream_small == 3 &&
_impl->_config._small_core_streams > 1;
const auto max_concurrency =
hybrid_core ? (selected_core_type == 0 ? _impl->_config._threads_per_stream_small
: _impl->_config._threads_per_stream_big)
: _impl->_config._threads_per_stream_big;
small_core ? _impl->_config._threads_per_stream_small : _impl->_config._threads_per_stream_big;
// Special handling of _threads_per_stream_small == 3
const auto small_core_id = (selected_core_type == 0 && small_core_skip)
? 0
: streamId_wrapped - _impl->_config._big_core_streams;
const auto small_core_id = small_core_skip ? 0 : streamId_wrapped - big_core_streams;
const auto stream_id =
hybrid_core ? (selected_core_type == 0
? small_core_id
: (cur_logic_core ? streamId_wrapped - phy_core_streams : streamId_wrapped))
: streamId_wrapped;
const auto thread_binding_step =
hybrid_core ? (selected_core_type == 0 ? _impl->_config._threadBindingStep : 2)
: _impl->_config._threadBindingStep;
// Special handling of _threads_per_stream_small == 3, need to skip 4
const auto small_core_offset = (selected_core_type == 0 && small_core_skip)
? _impl->_config._small_core_offset +
(streamId_wrapped - _impl->_config._big_core_streams) * 4
: _impl->_config._small_core_offset;
hybrid_core
? (small_core ? small_core_id
: (logic_core ? streamId_wrapped - phy_core_streams : streamId_wrapped))
: streamId_wrapped;
const auto thread_binding_step = hybrid_core ? (small_core ? _impl->_config._threadBindingStep : 2)
: _impl->_config._threadBindingStep;
// Special handling of _threads_per_stream_small == 3, need to skip 4 (Four cores share one L2 cache
// on the small core), stream_id = 0, cpu_idx_offset cumulative plus 4
const auto small_core_offset =
small_core_skip ? _impl->_config._small_core_offset + (streamId_wrapped - big_core_streams) * 4
: _impl->_config._small_core_offset;
const auto cpu_idx_offset =
hybrid_core
// Prevent conflicts with system scheduling, so default cpu id on big core starts from 1
? (selected_core_type == 0 ? small_core_offset : (cur_logic_core ? 0 : 1))
? (small_core ? small_core_offset : (logic_core ? 0 : 1))
: 0;
_taskArena.reset(new custom::task_arena{custom::task_arena::constraints{}
@ -254,13 +245,25 @@ struct CPUStreamsExecutor::Impl {
#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
if (ThreadBindingType::HYBRID_AWARE == config._threadBindingType) {
const auto core_types = custom::info::core_types();
const auto num_core_phys = getNumberOfCPUCores();
const auto num_big_core_phys = getNumberOfCPUCores(true);
const auto num_small_core_phys = num_core_phys - num_big_core_phys;
int sum = 0;
// reversed order, so BIG cores are first
for (auto iter = core_types.rbegin(); iter < core_types.rend(); iter++) {
const auto& type = *iter;
// calculating the #streams per core type
const int num_streams_for_core_type =
type == 0 ? std::max(1, config._small_core_streams) : std::max(1, config._big_core_streams);
type == 0 ? std::max(1,
std::min(config._small_core_streams,
config._threads_per_stream_small == 0
? 0
: num_small_core_phys / config._threads_per_stream_small))
: std::max(1,
std::min(config._big_core_streams,
config._threads_per_stream_big == 0
? 0
: num_big_core_phys / config._threads_per_stream_big * 2));
sum += num_streams_for_core_type;
// prefix sum, so the core type for a given stream id will be deduced just as an upper_bound
// (notice that the map keeps the elements in the descending order, so the big cores are populated

View File

@ -334,6 +334,34 @@ Parameter IStreamsExecutor::Config::GetConfig(const std::string& key) const {
return {};
}
// Derives the hybrid (big/small core) stream layout for a custom nstreams / nthreads request.
//
// Inputs read from `config`:
//   _threads - requested thread count; 0 means "use parallel_get_max_threads()".
//   _streams - requested stream count; a non-positive value is treated as 1.
// Fields written to `config`:
//   _small_core_offset, _big_core_streams, _small_core_streams,
//   _threads_per_stream_big, _threads_per_stream_small.
void IStreamsExecutor::Config::UpdateHybridCustomThreads(Config& config) {
    const auto num_cores = parallel_get_max_threads();
    const auto num_cores_phys = getNumberOfCPUCores();
    const auto num_big_cores_phys = getNumberOfCPUCores(true);
    // More logical than physical cores: every big core is counted twice
    // (presumably hyper-threading on the big cores only -- TODO confirm).
    const bool has_logical_cores = num_cores > num_cores_phys;
    const auto num_big_cores = has_logical_cores ? num_big_cores_phys * 2 : num_big_cores_phys;
    const auto num_small_cores_phys = num_cores_phys - num_big_cores_phys;
    const auto threads = config._threads ? config._threads : num_cores;
    const auto streams = config._streams > 0 ? config._streams : 1;

    // The small-core offset is the number of (logical) big cores.
    config._small_core_offset = num_big_cores;

    const int threads_per_stream = std::max(1, threads / streams);
    // Clamped to at least 1 so that the divisions below stay defined even when no big core is reported.
    const int threads_per_stream_big = std::max(1, std::min(num_big_cores_phys, threads_per_stream));
    const int threads_per_stream_small = std::min(num_small_cores_phys, threads_per_stream);

    // "Base" stream counts: how many streams of this width each core type can host.
    // Big cores: ceil(physical big cores / threads per stream), doubled when logical cores are present.
    const int big_streams_phys = (num_big_cores_phys + threads_per_stream_big - 1) / threads_per_stream_big;
    const int base_big_streams = has_logical_cores ? big_streams_phys * 2 : big_streams_phys;
    const int base_small_streams = num_small_cores_phys > 0 ? num_small_cores_phys / threads_per_stream_small : 0;
    // Never let the denominator below reach zero (only possible if no cores are reported at all).
    const int base_streams = std::max(1, base_big_streams + base_small_streams);

    // big_streams = ceil(all_streams * base_big_streams / base_streams)
    config._big_core_streams = (streams * base_big_streams + base_streams - 1) / base_streams;
    // Use the normalized stream count here as well: with the raw config._streams a non-positive
    // request would produce a negative number of small-core streams.
    config._small_core_streams = streams - config._big_core_streams;

    // _big_core_streams > 2, num_big_cores_phys must be divisible by threads_per_stream_big
    const bool uneven_big_split = config._big_core_streams > 2 && num_big_cores_phys % threads_per_stream_big != 0;
    config._threads_per_stream_big =
        uneven_big_split ? std::min(num_big_cores_phys, num_big_cores / base_big_streams) : threads_per_stream_big;
    config._threads_per_stream_small = config._small_core_streams > 0 ? threads_per_stream_small : 0;
}
IStreamsExecutor::Config IStreamsExecutor::Config::MakeDefaultMultiThreaded(const IStreamsExecutor::Config& initial,
const bool fp_intesive) {
const auto envThreads = parallel_get_env_threads();
@ -369,6 +397,10 @@ IStreamsExecutor::Config IStreamsExecutor::Config::MakeDefaultMultiThreaded(cons
custom::info::default_concurrency(custom::task_arena::constraints{}.set_core_type(core_types.back()));
num_cores_default = (num_big_cores_phys <= hyper_threading_threshold) ? num_big_cores : num_big_cores_phys;
}
// if nstreams or nthreads are set, need to calculate the Hybrid aware parameters here
if (streamExecutorConfig._big_core_streams == 0 || streamExecutorConfig._threads) {
UpdateHybridCustomThreads(streamExecutorConfig);
}
OPENVINO_DEBUG << "[ p_e_core_info ] streams (threads): " << streamExecutorConfig._streams << "("
<< streamExecutorConfig._threads_per_stream_big * streamExecutorConfig._big_core_streams +
streamExecutorConfig._threads_per_stream_small * streamExecutorConfig._small_core_streams