Xiaoxia/Custom setting nstreams with hyper thread enabled on Hybrid-aware P/E-core (#13836)
* add HybridAware stream setting and core binding * fix clang format issue * unified code style, add parameter check * correct input affinity skip NUMA, modify function name * remove unnecessary floor * fix ci compile issue on Mac/Windows platform * modify smoke_SetConfigAffinity test * modify ov_core_set_and_get_property_enum test, affinity HYBRID_AWARE is changed to NUMA * remove affinity correcting on this pr * revert ov_core_test.cpp * merge function by comments * fix code style issue * add custom nstreams setting, remove mutable qualifier * fix code style issue * fix some issues according to comments * modify UpdateHybridCustomThreads * fix code style issue * modify comments * use logic core in phase 2 * modify streams threads setting method * add custom setting nstreams nthreads * modify comments * fix code style issue * fix threads_per_stream_big being wrong on some ADL machines which have 6 P-cores * add one condition that custom set nthreads but not set nstreams * remove the limit to nstreams * remove the feature in branch p_e_core_phase2_logic * add comments * rename num_small_cores to num_small_cores_phys * fix code style issue * fix thread number being wrong in corner case on 6Pcore and 10Pcore condition * fix some condition which nstreams<2 * add check for config._threads_per_stream_big Co-authored-by: Shen, Wanglei <wanglei.shen@intel.com>
This commit is contained in:
parent
bb00a9e664
commit
966548d061
@ -84,6 +84,7 @@ public:
|
||||
static Config MakeDefaultMultiThreaded(const Config& initial, const bool fp_intesive = true);
|
||||
static int GetDefaultNumStreams(); // no network specifics considered (only CPU's caps);
|
||||
static int GetHybridNumStreams(std::map<std::string, std::string>& config, const int stream_mode);
|
||||
static void UpdateHybridCustomThreads(Config& config);
|
||||
|
||||
std::string _name; //!< Used by `ITT` to name executor threads
|
||||
int _streams = 1; //!< Number of streams.
|
||||
|
@ -96,20 +96,12 @@ struct CPUStreamsExecutor::Impl {
|
||||
} else {
|
||||
// assigning the stream to the core type in the round-robin fashion
|
||||
// wrapping around total_streams (i.e. how many streams all different core types can handle
|
||||
// together)
|
||||
// together). Binding priority: Big core, Logical big core, Small core
|
||||
const auto total_streams = _impl->total_streams_on_core_types.back().second;
|
||||
const auto big_core_streams = _impl->total_streams_on_core_types.front().second;
|
||||
const auto hybrid_core = _impl->total_streams_on_core_types.size() > 1;
|
||||
const auto phy_core_streams = big_core_streams / 2;
|
||||
const auto streamId_wrapped = _streamId % total_streams;
|
||||
// This is default setting for specific CPU which Pcore is in front and Ecore is in the back
|
||||
const auto num_big_cores_phy = _impl->_config._small_core_offset / 2;
|
||||
const auto use_logic_core =
|
||||
_impl->_config._threads_per_stream_big * _impl->_config._big_core_streams > num_big_cores_phy;
|
||||
const auto phy_core_streams = _impl->_config._big_core_streams / 2;
|
||||
// current stream is placed on logical core
|
||||
const auto cur_logic_core =
|
||||
use_logic_core ? (streamId_wrapped >= phy_core_streams ? true : false) : false;
|
||||
const auto small_core_skip =
|
||||
_impl->_config._threads_per_stream_small == 3 && _impl->_config._small_core_streams;
|
||||
const auto& selected_core_type =
|
||||
std::find_if(
|
||||
_impl->total_streams_on_core_types.cbegin(),
|
||||
@ -118,31 +110,30 @@ struct CPUStreamsExecutor::Impl {
|
||||
return p.second > streamId_wrapped;
|
||||
})
|
||||
->first;
|
||||
const auto small_core = hybrid_core && selected_core_type == 0;
|
||||
const auto logic_core = !small_core && streamId_wrapped >= phy_core_streams;
|
||||
const auto small_core_skip = small_core && _impl->_config._threads_per_stream_small == 3 &&
|
||||
_impl->_config._small_core_streams > 1;
|
||||
const auto max_concurrency =
|
||||
hybrid_core ? (selected_core_type == 0 ? _impl->_config._threads_per_stream_small
|
||||
: _impl->_config._threads_per_stream_big)
|
||||
: _impl->_config._threads_per_stream_big;
|
||||
small_core ? _impl->_config._threads_per_stream_small : _impl->_config._threads_per_stream_big;
|
||||
// Special handling of _threads_per_stream_small == 3
|
||||
const auto small_core_id = (selected_core_type == 0 && small_core_skip)
|
||||
? 0
|
||||
: streamId_wrapped - _impl->_config._big_core_streams;
|
||||
const auto small_core_id = small_core_skip ? 0 : streamId_wrapped - big_core_streams;
|
||||
const auto stream_id =
|
||||
hybrid_core ? (selected_core_type == 0
|
||||
? small_core_id
|
||||
: (cur_logic_core ? streamId_wrapped - phy_core_streams : streamId_wrapped))
|
||||
: streamId_wrapped;
|
||||
const auto thread_binding_step =
|
||||
hybrid_core ? (selected_core_type == 0 ? _impl->_config._threadBindingStep : 2)
|
||||
: _impl->_config._threadBindingStep;
|
||||
// Special handling of _threads_per_stream_small == 3, need to skip 4
|
||||
const auto small_core_offset = (selected_core_type == 0 && small_core_skip)
|
||||
? _impl->_config._small_core_offset +
|
||||
(streamId_wrapped - _impl->_config._big_core_streams) * 4
|
||||
: _impl->_config._small_core_offset;
|
||||
hybrid_core
|
||||
? (small_core ? small_core_id
|
||||
: (logic_core ? streamId_wrapped - phy_core_streams : streamId_wrapped))
|
||||
: streamId_wrapped;
|
||||
const auto thread_binding_step = hybrid_core ? (small_core ? _impl->_config._threadBindingStep : 2)
|
||||
: _impl->_config._threadBindingStep;
|
||||
// Special handling of _threads_per_stream_small == 3, need to skip 4 (Four cores share one L2 cache
|
||||
// on the small core), stream_id = 0, cpu_idx_offset cumulative plus 4
|
||||
const auto small_core_offset =
|
||||
small_core_skip ? _impl->_config._small_core_offset + (streamId_wrapped - big_core_streams) * 4
|
||||
: _impl->_config._small_core_offset;
|
||||
const auto cpu_idx_offset =
|
||||
hybrid_core
|
||||
// Prevent conflicts with system scheduling, so default cpu id on big core starts from 1
|
||||
? (selected_core_type == 0 ? small_core_offset : (cur_logic_core ? 0 : 1))
|
||||
? (small_core ? small_core_offset : (logic_core ? 0 : 1))
|
||||
: 0;
|
||||
|
||||
_taskArena.reset(new custom::task_arena{custom::task_arena::constraints{}
|
||||
@ -254,13 +245,25 @@ struct CPUStreamsExecutor::Impl {
|
||||
#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
|
||||
if (ThreadBindingType::HYBRID_AWARE == config._threadBindingType) {
|
||||
const auto core_types = custom::info::core_types();
|
||||
const auto num_core_phys = getNumberOfCPUCores();
|
||||
const auto num_big_core_phys = getNumberOfCPUCores(true);
|
||||
const auto num_small_core_phys = num_core_phys - num_big_core_phys;
|
||||
int sum = 0;
|
||||
// reversed order, so BIG cores are first
|
||||
for (auto iter = core_types.rbegin(); iter < core_types.rend(); iter++) {
|
||||
const auto& type = *iter;
|
||||
// calculating the #streams per core type
|
||||
const int num_streams_for_core_type =
|
||||
type == 0 ? std::max(1, config._small_core_streams) : std::max(1, config._big_core_streams);
|
||||
type == 0 ? std::max(1,
|
||||
std::min(config._small_core_streams,
|
||||
config._threads_per_stream_small == 0
|
||||
? 0
|
||||
: num_small_core_phys / config._threads_per_stream_small))
|
||||
: std::max(1,
|
||||
std::min(config._big_core_streams,
|
||||
config._threads_per_stream_big == 0
|
||||
? 0
|
||||
: num_big_core_phys / config._threads_per_stream_big * 2));
|
||||
sum += num_streams_for_core_type;
|
||||
// prefix sum, so the core type for a given stream id will be deduced just as a upper_bound
|
||||
// (notice that the map keeps the elements in the descending order, so the big cores are populated
|
||||
|
@ -334,6 +334,34 @@ Parameter IStreamsExecutor::Config::GetConfig(const std::string& key) const {
|
||||
return {};
|
||||
}
|
||||
|
||||
/**
 * @brief Recomputes the hybrid-aware (big/small core) stream layout of @p config from its
 *        `_streams` and `_threads` values.
 *
 * Reads `config._threads` and `config._streams`; writes `config._small_core_offset`,
 * `config._big_core_streams`, `config._small_core_streams`, `config._threads_per_stream_big`
 * and `config._threads_per_stream_small`.
 *
 * A non-positive `_streams` is treated as one stream, and a zero `_threads` falls back to
 * `parallel_get_max_threads()`.
 *
 * NOTE(review): the divisions below assume `getNumberOfCPUCores(true)` returns at least 1
 * (i.e. at least one big physical core is reported) — confirm against callers.
 *
 * @param config Configuration that is updated in place.
 */
void IStreamsExecutor::Config::UpdateHybridCustomThreads(Config& config) {
    const auto num_cores = parallel_get_max_threads();
    const auto num_cores_phys = getNumberOfCPUCores();
    const auto num_big_cores_phys = getNumberOfCPUCores(true);
    // More available threads than physical cores: every big physical core is counted twice
    // (presumably hyper-threading is enabled on the big cores).
    const auto num_big_cores = num_cores > num_cores_phys ? num_big_cores_phys * 2 : num_big_cores_phys;
    const auto num_small_cores_phys = num_cores_phys - num_big_cores_phys;
    const auto threads = config._threads ? config._threads : num_cores;
    // Clamped stream count; every stream computation below must use this value and not the raw
    // config._streams, otherwise a non-positive _streams produces a negative stream count.
    const auto streams = config._streams > 0 ? config._streams : 1;

    // Small cores are addressed right after all (logical) big cores.
    config._small_core_offset = num_big_cores;

    const int threads_per_stream = std::max(1, threads / streams);
    const int threads_per_stream_big = std::min(num_big_cores_phys, threads_per_stream);
    const int threads_per_stream_small = std::min(num_small_cores_phys, threads_per_stream);

    // Number of streams each core type can host with the per-stream thread counts above
    // (rounded up for big cores and doubled when big cores are counted twice).
    const int base_big_streams = num_cores > num_cores_phys
                                     ? (num_big_cores_phys + threads_per_stream_big - 1) / threads_per_stream_big * 2
                                     : (num_big_cores_phys + threads_per_stream_big - 1) / threads_per_stream_big;
    const int base_small_streams = num_small_cores_phys > 0 ? num_small_cores_phys / threads_per_stream_small : 0;
    const int base_streams = base_big_streams + base_small_streams;

    // big_streams = all_streams * base_big_streams / base_streams (rounded up)
    config._big_core_streams = (streams * base_big_streams + base_streams - 1) / base_streams;
    // Remaining streams go to the small cores; never negative because
    // _big_core_streams <= streams (base_big_streams <= base_streams).
    config._small_core_streams = streams - config._big_core_streams;

    // _big_core_streams > 2, num_big_cores_phys must be divisible by threads_per_stream_big
    config._threads_per_stream_big = (config._big_core_streams > 2 && num_big_cores_phys % threads_per_stream_big != 0)
                                         ? std::min(num_big_cores_phys, num_big_cores / base_big_streams)
                                         : threads_per_stream_big;
    config._threads_per_stream_small = config._small_core_streams > 0 ? threads_per_stream_small : 0;
}
|
||||
|
||||
IStreamsExecutor::Config IStreamsExecutor::Config::MakeDefaultMultiThreaded(const IStreamsExecutor::Config& initial,
|
||||
const bool fp_intesive) {
|
||||
const auto envThreads = parallel_get_env_threads();
|
||||
@ -369,6 +397,10 @@ IStreamsExecutor::Config IStreamsExecutor::Config::MakeDefaultMultiThreaded(cons
|
||||
custom::info::default_concurrency(custom::task_arena::constraints{}.set_core_type(core_types.back()));
|
||||
num_cores_default = (num_big_cores_phys <= hyper_threading_threshold) ? num_big_cores : num_big_cores_phys;
|
||||
}
|
||||
// if nstreams or nthreads are set, need to calculate the Hybrid aware parameters here
|
||||
if (streamExecutorConfig._big_core_streams == 0 || streamExecutorConfig._threads) {
|
||||
UpdateHybridCustomThreads(streamExecutorConfig);
|
||||
}
|
||||
OPENVINO_DEBUG << "[ p_e_core_info ] streams (threads): " << streamExecutorConfig._streams << "("
|
||||
<< streamExecutorConfig._threads_per_stream_big * streamExecutorConfig._big_core_streams +
|
||||
streamExecutorConfig._threads_per_stream_small * streamExecutorConfig._small_core_streams
|
||||
|
Loading…
Reference in New Issue
Block a user