Xiaoxia/Custom setting nstreams with hyper thread enabled on Hybrid-aware P/E-core (#13836)

* add HybridAware stream setting and core binding * fix clang format issue * unified code style, add parameter check * correct input affinity skip NUMA, modify function name * remove unnecessary floor * fix CI compile issue on Mac/Windows platform * modify smoke_SetConfigAffinity test * modify ov_core_set_and_get_property_enum test, affinity HYBRID_AWARE is changed to NUMA * remove affinity correcting on this pr * revert ov_core_test.cpp * merge function by comments * fix code style issue * add custom nstreams setting, remove mutable qualifier * fix code style issue * fix some issues according to comments * modify UpdateHybridCustomThreads * fix code style issue * modify comments * use logic core in phase 2 * modify streams threads setting method * add custom setting nstreams nthreads * modify comments * fix code style issue * fix threads_per_stream_big being wrong on some ADL machines which have 6 P-cores * add one condition that custom set nthreads but not set nstreams * remove the limit to nstreams * remove the feature in branch p_e_core_phase2_logic * add comments * rename num_small_cores to num_small_cores_phys * fix code style issue * fix wrong thread number in corner cases with 6 P-cores and 10 P-cores * fix some conditions where nstreams<2 * add check for config._threads_per_stream_big Co-authored-by: Shen, Wanglei <wanglei.shen@intel.com>
2022-11-13 01:55:54 +08:00 · 2022-11-13 01:55:54 +08:00 · 966548d061
commit 966548d061
parent bb00a9e664
3 changed files with 67 additions and 31 deletions
--- a/src/inference/dev_api/threading/ie_istreams_executor.hpp
+++ b/src/inference/dev_api/threading/ie_istreams_executor.hpp
@ -84,6 +84,7 @@ public:
        static Config MakeDefaultMultiThreaded(const Config& initial, const bool fp_intesive = true);
        static int GetDefaultNumStreams();  // no network specifics considered (only CPU's caps);
        static int GetHybridNumStreams(std::map<std::string, std::string>& config, const int stream_mode);
+        static void UpdateHybridCustomThreads(Config& config);

        std::string _name;          //!< Used by `ITT` to name executor threads
        int _streams = 1;           //!< Number of streams.
--- a/src/inference/src/threading/ie_cpu_streams_executor.cpp
+++ b/src/inference/src/threading/ie_cpu_streams_executor.cpp
@ -96,20 +96,12 @@ struct CPUStreamsExecutor::Impl {
                } else {
                    // assigning the stream to the core type in the round-robin fashion
                    // wrapping around total_streams (i.e. how many streams all different core types can handle
-                    // together)
+                    // together). Binding priority: Big core, Logical big core, Small core
                    const auto total_streams = _impl->total_streams_on_core_types.back().second;
+                    const auto big_core_streams = _impl->total_streams_on_core_types.front().second;
                    const auto hybrid_core = _impl->total_streams_on_core_types.size() > 1;
+                    const auto phy_core_streams = big_core_streams / 2;
                    const auto streamId_wrapped = _streamId % total_streams;
-                    // This is default setting for specific CPU which Pcore is in front and Ecore is in the back
-                    const auto num_big_cores_phy = _impl->_config._small_core_offset / 2;
-                    const auto use_logic_core =
-                        _impl->_config._threads_per_stream_big * _impl->_config._big_core_streams > num_big_cores_phy;
-                    const auto phy_core_streams = _impl->_config._big_core_streams / 2;
-                    // current stream is placed on logical core
-                    const auto cur_logic_core =
-                        use_logic_core ? (streamId_wrapped >= phy_core_streams ? true : false) : false;
-                    const auto small_core_skip =
-                        _impl->_config._threads_per_stream_small == 3 && _impl->_config._small_core_streams;
                    const auto& selected_core_type =
                        std::find_if(
                            _impl->total_streams_on_core_types.cbegin(),
@ -118,31 +110,30 @@ struct CPUStreamsExecutor::Impl {
                                return p.second > streamId_wrapped;
                            })
                            ->first;
+                    const auto small_core = hybrid_core && selected_core_type == 0;
+                    const auto logic_core = !small_core && streamId_wrapped >= phy_core_streams;
+                    const auto small_core_skip = small_core && _impl->_config._threads_per_stream_small == 3 &&
+                                                 _impl->_config._small_core_streams > 1;
                    const auto max_concurrency =
-                        hybrid_core ? (selected_core_type == 0 ? _impl->_config._threads_per_stream_small
-                                                               : _impl->_config._threads_per_stream_big)
-                                    : _impl->_config._threads_per_stream_big;
+                        small_core ? _impl->_config._threads_per_stream_small : _impl->_config._threads_per_stream_big;
                    // Special handling of _threads_per_stream_small == 3
-                    const auto small_core_id = (selected_core_type == 0 && small_core_skip)
-                                                   ? 0
-                                                   : streamId_wrapped - _impl->_config._big_core_streams;
+                    const auto small_core_id = small_core_skip ? 0 : streamId_wrapped - big_core_streams;
                    const auto stream_id =
-                        hybrid_core ? (selected_core_type == 0
-                                           ? small_core_id
-                                           : (cur_logic_core ? streamId_wrapped - phy_core_streams : streamId_wrapped))
-                                    : streamId_wrapped;
-                    const auto thread_binding_step =
-                        hybrid_core ? (selected_core_type == 0 ? _impl->_config._threadBindingStep : 2)
-                                    : _impl->_config._threadBindingStep;
-                    // Special handling of _threads_per_stream_small == 3, need to skip 4
-                    const auto small_core_offset = (selected_core_type == 0 && small_core_skip)
-                                                       ? _impl->_config._small_core_offset +
-                                                             (streamId_wrapped - _impl->_config._big_core_streams) * 4
-                                                       : _impl->_config._small_core_offset;
+                        hybrid_core
+                            ? (small_core ? small_core_id
+                                          : (logic_core ? streamId_wrapped - phy_core_streams : streamId_wrapped))
+                            : streamId_wrapped;
+                    const auto thread_binding_step = hybrid_core ? (small_core ? _impl->_config._threadBindingStep : 2)
+                                                                 : _impl->_config._threadBindingStep;
+                    // Special handling of _threads_per_stream_small == 3, need to skip 4 (Four cores share one L2 cache
+                    // on the small core), stream_id = 0, cpu_idx_offset cumulative plus 4
+                    const auto small_core_offset =
+                        small_core_skip ? _impl->_config._small_core_offset + (streamId_wrapped - big_core_streams) * 4
+                                        : _impl->_config._small_core_offset;
                    const auto cpu_idx_offset =
                        hybrid_core
                            // Prevent conflicts with system scheduling, so default cpu id on big core starts from 1
-                            ? (selected_core_type == 0 ? small_core_offset : (cur_logic_core ? 0 : 1))
+                            ? (small_core ? small_core_offset : (logic_core ? 0 : 1))
                            : 0;

                    _taskArena.reset(new custom::task_arena{custom::task_arena::constraints{}
@ -254,13 +245,25 @@ struct CPUStreamsExecutor::Impl {
 #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
        if (ThreadBindingType::HYBRID_AWARE == config._threadBindingType) {
            const auto core_types = custom::info::core_types();
+            const auto num_core_phys = getNumberOfCPUCores();
+            const auto num_big_core_phys = getNumberOfCPUCores(true);
+            const auto num_small_core_phys = num_core_phys - num_big_core_phys;
            int sum = 0;
            // reversed order, so BIG cores are first
            for (auto iter = core_types.rbegin(); iter < core_types.rend(); iter++) {
                const auto& type = *iter;
                // calculating the #streams per core type
                const int num_streams_for_core_type =
-                    type == 0 ? std::max(1, config._small_core_streams) : std::max(1, config._big_core_streams);
+                    type == 0 ? std::max(1,
+                                         std::min(config._small_core_streams,
+                                                  config._threads_per_stream_small == 0
+                                                      ? 0
+                                                      : num_small_core_phys / config._threads_per_stream_small))
+                              : std::max(1,
+                                         std::min(config._big_core_streams,
+                                                  config._threads_per_stream_big == 0
+                                                      ? 0
+                                                      : num_big_core_phys / config._threads_per_stream_big * 2));
                sum += num_streams_for_core_type;
                // prefix sum, so the core type for a given stream id will be deduced just as a upper_bound
                // (notice that the map keeps the elements in the descending order, so the big cores are populated
--- a/src/inference/src/threading/ie_istreams_executor.cpp
+++ b/src/inference/src/threading/ie_istreams_executor.cpp
@ -334,6 +334,34 @@ Parameter IStreamsExecutor::Config::GetConfig(const std::string& key) const {
    return {};
 }

+void IStreamsExecutor::Config::UpdateHybridCustomThreads(Config& config) {  // Derives the big/small-core stream and thread split from config._streams / config._threads; updates config in place.
+    const auto num_cores = parallel_get_max_threads();  // compared with the physical core count below to detect hyper-threading
+    const auto num_cores_phys = getNumberOfCPUCores();
+    const auto num_big_cores_phys = getNumberOfCPUCores(true);  // presumably the physical big (P) core count; confirm against getNumberOfCPUCores
+    const auto num_big_cores = num_cores > num_cores_phys ? num_big_cores_phys * 2 : num_big_cores_phys;  // doubled when more threads than physical cores are reported
+    const auto num_small_cores_phys = num_cores_phys - num_big_cores_phys;
+    const auto threads = config._threads ? config._threads : num_cores;  // fall back to num_cores when _threads is 0
+    const auto streams = config._streams > 0 ? config._streams : 1;      // non-positive _streams is treated as 1
+
+    config._small_core_offset = num_big_cores;  // offset of the small cores; assumes big cores are enumerated first (TODO confirm)
+    const int threads_per_stream = std::max(1, threads / streams);
+    const int threads_per_stream_big = std::min(num_big_cores_phys, threads_per_stream);  // NOTE(review): 0 if num_big_cores_phys == 0, which makes the divisions below divide by zero; confirm that cannot happen here
+    const int threads_per_stream_small = std::min(num_small_cores_phys, threads_per_stream);
+    const int base_big_streams = num_cores > num_cores_phys  // ceil(num_big_cores_phys / threads_per_stream_big), doubled with hyper-threading
+                                     ? (num_big_cores_phys + threads_per_stream_big - 1) / threads_per_stream_big * 2
+                                     : (num_big_cores_phys + threads_per_stream_big - 1) / threads_per_stream_big;
+    const int base_small_streams = num_small_cores_phys > 0 ? num_small_cores_phys / threads_per_stream_small : 0;  // floor division; guarded so no small cores gives 0 streams
+    const int base_streams = base_big_streams + base_small_streams;
+    // big_streams = all_streams * base_big_streams / base_streams, rounded up; the remaining streams go to the small cores
+    config._big_core_streams = (streams * base_big_streams + base_streams - 1) / base_streams;
+    config._small_core_streams = config._streams - config._big_core_streams;  // NOTE(review): uses config._streams, not the clamped `streams`; can go negative when config._streams <= 0 - confirm intended
+    // With more than 2 big-core streams, threads_per_stream_big must divide num_big_cores_phys; otherwise use num_big_cores / base_big_streams, capped at num_big_cores_phys
+    config._threads_per_stream_big = (config._big_core_streams > 2 && num_big_cores_phys % threads_per_stream_big != 0)
+                                         ? std::min(num_big_cores_phys, num_big_cores / base_big_streams)
+                                         : threads_per_stream_big;
+    config._threads_per_stream_small = config._small_core_streams > 0 ? threads_per_stream_small : 0;  // 0 when no stream is assigned to the small cores
+}
+
 IStreamsExecutor::Config IStreamsExecutor::Config::MakeDefaultMultiThreaded(const IStreamsExecutor::Config& initial,
                                                                            const bool fp_intesive) {
    const auto envThreads = parallel_get_env_threads();
@ -369,6 +397,10 @@ IStreamsExecutor::Config IStreamsExecutor::Config::MakeDefaultMultiThreaded(cons
                custom::info::default_concurrency(custom::task_arena::constraints{}.set_core_type(core_types.back()));
            num_cores_default = (num_big_cores_phys <= hyper_threading_threshold) ? num_big_cores : num_big_cores_phys;
        }
+        // if nstreams or nthreads are set, need to calculate the Hybrid aware parameters here
+        if (streamExecutorConfig._big_core_streams == 0 || streamExecutorConfig._threads) {
+            UpdateHybridCustomThreads(streamExecutorConfig);
+        }
        OPENVINO_DEBUG << "[ p_e_core_info ] streams (threads): " << streamExecutorConfig._streams << "("
                       << streamExecutorConfig._threads_per_stream_big * streamExecutorConfig._big_core_streams +
                              streamExecutorConfig._threads_per_stream_small * streamExecutorConfig._small_core_streams