Xiaoxia/fix performance regression (#17036)

* add _streams_info_table in Executor config

* change useHyperThreading init value

* restore cmake

* fix comments

* add calling enableCpuPinning property

* fix judgment about number of sockets in init_stream

* fix test case compile issue

* fix ci test case fail issue

* modify GetPerformanceStreams calling position

* add affinity in get_cpu_pinning

* modify ecore judgment

* add no binding core on ADL

* fix ci issue, add get_num_numa_nodes()

* fix code style

* fix StreamsHasHigherPriority issue

* fix according to comments

* fix performance degradation

* fix code style

* code style

* fix warning

* fix ci test failure

* fix ImportNetwork issue

* fix ci test case issue

* fix smoke_CachingSupportCase_CPU issue

* add ExportOptimalNumStreamsTest test

* modify test name

* modify ExportOptimalNumStreams test

---------

Co-authored-by: Chen Peter <peter.chen@intel.com>
Sun Xiaoxia 2023-04-25 04:35:47 +00:00 committed by GitHub
parent 28e54e75ea
commit 6663367183
8 changed files with 67 additions and 96 deletions


@@ -173,19 +173,9 @@ using ov::set_cpu_used;
 using ov::get_proc_type_table;
 
 /**
- * @brief Returns corresponding logical cores
+ * @brief Get and reserve available cpu ids
  * @ingroup ie_dev_api_system_conf
- * @param[in] cpu_ids physical cores
- * @return logical cores corresponding to physical core.
- */
-using ov::get_logical_cores;
-
-/**
- * @brief Returns available cpu ids
- * @ingroup ie_dev_api_system_conf
- * @param[in] core_type core type.
- * @param[in] num_cpus number of cpus.
- * @param[in] cpu_task is cpu task, not other plugin tasks
  * @param[in] streams_info_table streams information table.
  * @return Array of available cpu ids.
  */
 using ov::reserve_available_cpus;


@@ -162,14 +162,6 @@ OPENVINO_RUNTIME_API int get_num_numa_nodes();
  */
 OPENVINO_RUNTIME_API std::vector<std::vector<int>> get_proc_type_table();
 
-/**
- * @brief Returns corresponding logical cores
- * @ingroup ie_dev_api_system_conf
- * @param[in] cpu_ids physical cores
- * @return logical cores corresponding to physical core.
- */
-OPENVINO_RUNTIME_API std::vector<int> get_logical_cores(const std::vector<int> cpu_ids);
-
 /**
  * @enum ColumnOfProcessorTypeTable
  * @brief This enum contains definition of each columns in processor type table which bases on cpu core types. Will
@@ -210,17 +202,11 @@ enum ProcessorUseStatus {
 /**
  * @brief Get and reserve available cpu ids
  * @ingroup ie_dev_api_system_conf
- * @param[in] core_type core type.
- * @param[in] num_cpus number of cpus.
- * @param[in] seek_status look for CPU_MAP_USED_FLAG of seek_status in CPU mapping table
- * @param[in] reset_status reset CPU_MAP_USED_FLAG with reset_status.
+ * @param[in] streams_info_table streams information table.
  * @return Array of available cpu ids.
  */
-OPENVINO_RUNTIME_API std::vector<int> reserve_available_cpus(const ColumnOfProcessorTypeTable core_type,
-                                                             const int num_cpus,
-                                                             const int seek_status = NOT_USED,
-                                                             const int reset_status = CPU_USED,
-                                                             const bool reserve_logic_core = false);
+OPENVINO_RUNTIME_API std::vector<std::vector<int>> reserve_available_cpus(
+    const std::vector<std::vector<int>> streams_info_table);
 
 /**
  * @brief Set CPU_MAP_USED_FLAG of cpu_mapping
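
A minimal usage sketch of the new signature (not code from this commit): the row layout {NUMBER_OF_STREAMS, PROC_TYPE, THREADS_PER_STREAM} follows ie_cpu_streams_info.hpp, and the machine is assumed to have 8 idle performance cores, so the processor ids shown are illustrative.

    // Ask for 2 streams of 4 performance-core threads each.
    std::vector<std::vector<int>> streams_info_table = {{2, MAIN_CORE_PROC, 4}};
    // One vector of processor ids per stream, e.g. {{0, 1, 2, 3}, {4, 5, 6, 7}};
    // the returned ids are also marked CPU_USED in the CPU mapping table.
    std::vector<std::vector<int>> stream_core_ids = ov::reserve_available_cpus(streams_info_table);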


@@ -105,6 +105,7 @@ public:
         std::vector<std::vector<int>> _orig_proc_type_table;
         std::vector<std::vector<int>> _proc_type_table;
         std::vector<std::vector<int>> _streams_info_table;
+        std::vector<std::vector<int>> _stream_core_ids;
         std::vector<int> _stream_ids;
         bool _cpu_pinning = false;
         bool _streams_changed = false;


@@ -132,7 +132,7 @@ struct CPUStreamsExecutor::Impl {
             std::lock_guard<std::mutex> lock{_impl->_cpumap_mutex};
             const auto stream_id = _streamId >= _impl->_config._streams ? _impl->_config._streams - 1 : _streamId;
             const auto concurrency =
-                _impl->_config._streams_info_table.size() > 0
+                (_impl->_config._streams_info_table.size() > 0 && _impl->_config._stream_ids.size() > 0)
                     ? _impl->_config._streams_info_table[_impl->_config._stream_ids[stream_id]][THREADS_PER_STREAM]
                     : 0;
             const auto cpu_core_type =
@@ -166,13 +166,10 @@ struct CPUStreamsExecutor::Impl {
             } else {
                 _taskArena.reset(new custom::task_arena{concurrency});
             }
-            if (_impl->_config._cpu_pinning && _streamId < _impl->_config._streams) {
-                // Handle special case: reserve 4 cores when threads is 3 in ECore
-                const auto num_cpus =
-                    cpu_core_type == EFFICIENT_CORE_PROC && concurrency == 3 && _impl->_config._small_core_streams > 1
-                        ? concurrency + 1
-                        : concurrency;
-                _cpu_ids = reserve_available_cpus(cpu_core_type, num_cpus, _impl->_config._plugin_task);
+            if (_impl->_config._cpu_pinning) {
+                _cpu_ids = static_cast<int>(_impl->_config._stream_core_ids.size()) == _impl->_config._streams
+                               ? _impl->_config._stream_core_ids[stream_id]
+                               : _cpu_ids;
                 if (_cpu_ids.size() > 0) {
                     CpuSet processMask;
                     int ncpus = 0;
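
For context, the per-stream ids selected above end up as a thread affinity mask. A minimal Linux-only sketch of that final pinning step, using raw pthread affinity instead of the executor's CpuSet helpers:

    #include <pthread.h>
    #include <sched.h>
    #include <vector>

    // Pin the calling thread to the given logical processors (Linux, glibc).
    static bool pin_current_thread(const std::vector<int>& cpu_ids) {
        cpu_set_t mask;
        CPU_ZERO(&mask);
        for (int id : cpu_ids) {
            CPU_SET(id, &mask);
        }
        return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) == 0;
    }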


@@ -17,11 +17,14 @@
 #include "ie_common.h"
 #include "openvino/core/visibility.hpp"
 #include "streams_executor.hpp"
+#include "threading/ie_cpu_streams_info.hpp"
 
 #define XBYAK_NO_OP_NAMES
 #define XBYAK_UNDEF_JNL
 #include <xbyak/xbyak_util.h>
 
+using namespace InferenceEngine;
+
 namespace ov {
 
 #if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
@@ -171,15 +174,8 @@ bool is_cpu_map_available() {
 int get_num_numa_nodes() {
     return -1;
 }
-std::vector<int> reserve_available_cpus(const ColumnOfProcessorTypeTable core_type,
-                                        const int num_cpus,
-                                        const int seek_status,
-                                        const int reset_status,
-                                        const bool reserve_logic_core) {
-    return {};
-}
-std::vector<int> get_logical_cores(const std::vector<int> cpu_ids) {
-    return {};
+std::vector<std::vector<int>> reserve_available_cpus(const std::vector<std::vector<int>> streams_info_table) {
+    return {{-1}};
 }
 
 void set_cpu_used(const std::vector<int>& cpu_ids, const int used) {}
@@ -240,59 +236,41 @@ int get_num_numa_nodes() {
     return cpu._numa_nodes;
 }
 
-std::vector<int> reserve_available_cpus(const ColumnOfProcessorTypeTable core_type,
-                                        const int num_cpus,
-                                        const int seek_status,
-                                        const int reset_status,
-                                        const bool reserve_logic_core) {
-    std::lock_guard<std::mutex> lock{cpu._cpu_mutex};
+std::vector<std::vector<int>> reserve_available_cpus(const std::vector<std::vector<int>> streams_info_table) {
     std::vector<int> cpu_ids;
-    int socket = -1;
-    if (reset_status >= PLUGIN_USED_START && cpu._numa_nodes > 1) {
-        socket = cpu._socket_idx;
-        cpu._socket_idx = (cpu._socket_idx + 1) % cpu._numa_nodes;
-    }
-    if (core_type < PROC_TYPE_TABLE_SIZE && core_type >= ALL_PROC) {
-        for (int i = 0; i < cpu._processors; i++) {
-            if (cpu._cpu_mapping_table[i][CPU_MAP_CORE_TYPE] == core_type &&
-                cpu._cpu_mapping_table[i][CPU_MAP_USED_FLAG] == seek_status &&
-                (socket < 0 || (socket >= 0 && cpu._cpu_mapping_table[i][CPU_MAP_SOCKET_ID] == socket))) {
-                cpu_ids.push_back(cpu._cpu_mapping_table[i][CPU_MAP_PROCESSOR_ID]);
-            }
-            if (static_cast<int>(cpu_ids.size()) == num_cpus) {
-                break;
-            }
-        }
-        if (reserve_logic_core) {
-            auto logic_ids = get_logical_cores(cpu_ids);
-            cpu_ids.insert(cpu_ids.end(), logic_ids.begin(), logic_ids.end());
-        }
-        set_cpu_used(cpu_ids, reset_status);
-    } else {
-        IE_THROW() << "Wrong value for core_type " << core_type;
-    }
-    return cpu_ids;
-}
+    int info_table_size = static_cast<int>(streams_info_table.size());
+    std::vector<std::vector<int>> stream_ids;
+    std::vector<std::vector<std::vector<int>>> res_stream_ids;
+    stream_ids.assign(info_table_size, std::vector<int>());
+    res_stream_ids.assign(info_table_size, std::vector<std::vector<int>>());
 
-std::vector<int> get_logical_cores(const std::vector<int> cpu_ids) {
-    std::vector<int> logic_cores;
-    if (cpu._proc_type_table[0][HYPER_THREADING_PROC] > 0) {
-        int cpu_size = static_cast<int>(cpu_ids.size());
-        for (int i = 0; i < cpu._processors; i++) {
-            for (int j = 0; j < cpu_size; j++) {
-                if (cpu_ids[j] >= 0 &&
-                    cpu._cpu_mapping_table[i][CPU_MAP_CORE_ID] == cpu._cpu_mapping_table[cpu_ids[j]][CPU_MAP_CORE_ID] &&
-                    cpu._cpu_mapping_table[i][CPU_MAP_CORE_TYPE] == HYPER_THREADING_PROC) {
-                    logic_cores.push_back(cpu._cpu_mapping_table[i][CPU_MAP_PROCESSOR_ID]);
+    for (int i = 0; i < cpu._processors; i++) {
+        for (int j = 0; j < info_table_size; j++) {
+            if (static_cast<int>(res_stream_ids[j].size()) < streams_info_table[j][NUMBER_OF_STREAMS]) {
+                if (cpu._cpu_mapping_table[i][CPU_MAP_CORE_TYPE] == streams_info_table[j][PROC_TYPE] &&
+                    cpu._cpu_mapping_table[i][CPU_MAP_USED_FLAG] == NOT_USED) {
+                    stream_ids[j].push_back(cpu._cpu_mapping_table[i][CPU_MAP_PROCESSOR_ID]);
+                    cpu_ids.push_back(cpu._cpu_mapping_table[i][CPU_MAP_PROCESSOR_ID]);
+                }
+                if (static_cast<int>(stream_ids[j].size()) == streams_info_table[j][THREADS_PER_STREAM]) {
+                    std::vector<int> stream_group(stream_ids[j].begin(), stream_ids[j].end());
+                    res_stream_ids[j].push_back(stream_group);
+                    stream_ids[j].clear();
                 }
             }
-            if (cpu_ids.size() == logic_cores.size()) {
-                break;
-            }
         }
     }
-    return logic_cores;
+    set_cpu_used(cpu_ids, CPU_USED);
+    auto flatten_stream_ids =
+        std::accumulate(res_stream_ids.begin(),
+                        res_stream_ids.end(),
+                        decltype(res_stream_ids)::value_type{},
+                        [](std::vector<std::vector<int>>& pre, std::vector<std::vector<int>>& cur) {
+                            pre.insert(pre.end(), cur.begin(), cur.end());
+                            return pre;
+                        });
+    return flatten_stream_ids;
 }
 
 void set_cpu_used(const std::vector<int>& cpu_ids, const int used) {
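
To make the grouping pass concrete, here is a self-contained toy version of the same loop structure on a mock mapping table (plain column indices instead of the library enums; not the library code):

    #include <iostream>
    #include <vector>

    int main() {
        const int NOT_USED = -1;
        const int MAIN_CORE_PROC = 1;
        // Mock mapping rows: {processor_id, core_type, used_flag}; 4 idle P-cores.
        std::vector<std::vector<int>> mapping = {
            {0, MAIN_CORE_PROC, NOT_USED},
            {1, MAIN_CORE_PROC, NOT_USED},
            {2, MAIN_CORE_PROC, NOT_USED},
            {3, MAIN_CORE_PROC, NOT_USED},
        };
        // One streams_info row: {num_streams, proc_type, threads_per_stream}.
        std::vector<std::vector<int>> streams_info = {{2, MAIN_CORE_PROC, 2}};

        std::vector<std::vector<int>> groups;   // res_stream_ids, flattened
        std::vector<int> current;               // stream_ids accumulator
        for (const auto& proc : mapping) {
            const auto& row = streams_info[0];
            if (static_cast<int>(groups.size()) < row[0] && proc[1] == row[1] && proc[2] == NOT_USED) {
                current.push_back(proc[0]);
                if (static_cast<int>(current.size()) == row[2]) {
                    groups.push_back(current);  // one full stream worth of cores
                    current.clear();
                }
            }
        }
        // Prints "stream 0 -> 0 1" and "stream 1 -> 2 3".
        for (size_t s = 0; s < groups.size(); ++s) {
            std::cout << "stream " << s << " ->";
            for (int id : groups[s]) std::cout << ' ' << id;
            std::cout << '\n';
        }
        return 0;
    }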


@@ -283,6 +283,7 @@ void get_num_streams(const int streams,
                                                                   config.perfHintsConfig.ovPerfHintNumRequests,
                                                                   model_prefer,
                                                                   proc_type_table);
+    executor_config._stream_core_ids = reserve_available_cpus(executor_config._streams_info_table);
     executor_config._threadsPerStream = executor_config._streams_info_table[0][THREADS_PER_STREAM];
     executor_config._streams = 0;
     executor_config._threads = 0;
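
The intended invariant of this wiring, shown with hypothetical numbers (a table row {2, MAIN_CORE_PROC, 4} should yield two 4-element core groups, later indexed by stream id in CPUStreamsExecutor):

    // Illustration only; the ids are made up, the names come from the diff above.
    std::vector<std::vector<int>> table = {{2 /*streams*/, MAIN_CORE_PROC, 4 /*threads*/}};
    auto groups = reserve_available_cpus(table);           // e.g. {{0,1,2,3}, {4,5,6,7}}
    assert(groups.size() == 2 && groups[0].size() == 4);   // one core group per stream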


@@ -273,18 +273,30 @@ void Engine::ApplyPerformanceHints(std::map<std::string, std::string> &config, c
 void Engine::GetPerformanceStreams(Config& config, const std::shared_ptr<ngraph::Function>& ngraphFunc) {
     const auto perf_hint_name = config.perfHintsConfig.ovPerfHint;
+    // save hints parameters to model rt_info
+    ov::AnyMap hints_props;
+    std::string hint_name;
+    const int latency_streams = get_num_numa_nodes();
     int streams;
     if (config.streamExecutorConfig._streams_changed) {
         streams = config.streamExecutorConfig._streams;
     } else if (perf_hint_name == CONFIG_VALUE(LATENCY)) {
-        streams = get_num_numa_nodes();
+        streams = latency_streams;
     } else if (perf_hint_name == CONFIG_VALUE(THROUGHPUT)) {
         streams = 0;
     } else {
         streams = config.streamExecutorConfig._streams == 1 ? 0 : config.streamExecutorConfig._streams;
     }
+    const auto latency_name = std::string(CONFIG_VALUE(LATENCY)) + "_" + std::string(ov::num_streams.name());
+    const auto tput_name = std::string(CONFIG_VALUE(THROUGHPUT)) + "_" + std::string(ov::num_streams.name());
 
     get_num_streams(streams, ngraphFunc, config);
 
+    hints_props.insert({latency_name, std::to_string(latency_streams)});
+    hints_props.insert({tput_name, std::to_string(config.streamExecutorConfig._streams)});
+    ngraphFunc->set_rt_info(hints_props, "intel_cpu_hints_config");
     config._config[CONFIG_KEY(CPU_THROUGHPUT_STREAMS)] = std::to_string(config.streamExecutorConfig._streams);
 }
 
 StreamCfg Engine::GetNumStreams(InferenceEngine::IStreamsExecutor::ThreadBindingType thread_binding_type,
@@ -780,6 +792,9 @@ InferenceEngine::IExecutableNetworkInternal::Ptr Engine::ImportNetwork(std::istr
     if (conf.enableDynamicBatch) {
         conf.batchLimit = static_cast<int>(cnnnetwork.getBatchSize());
     }
+    if (is_cpu_map_available()) {
+        get_num_streams(conf.streamExecutorConfig._streams, function, conf);
+    }
 
     auto execNetwork = std::make_shared<ExecNetwork>(cnnnetwork, conf, extensionManager, shared_from_this());
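
Export/import round-trip: GetPerformanceStreams stores the hint-derived stream counts in the model's rt_info under "intel_cpu_hints_config", and the ImportNetwork change above recomputes streams on the importing machine. A small sketch of reading those stored hints back from a deserialized model ("model.xml" is a placeholder path):

    #include <openvino/openvino.hpp>
    #include <iostream>

    int main() {
        ov::Core core;
        auto model = core.read_model("model.xml");
        const ov::AnyMap& rt = model->get_rt_info();
        auto it = rt.find("intel_cpu_hints_config");
        if (it != rt.end()) {
            // Keys look like "LATENCY_NUM_STREAMS" / "THROUGHPUT_NUM_STREAMS",
            // values are the stream counts serialized as strings.
            for (const auto& kv : it->second.as<ov::AnyMap>()) {
                std::cout << kv.first << " = " << kv.second.as<std::string>() << '\n';
            }
        }
        return 0;
    }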


@@ -15,7 +15,7 @@
 
 namespace {
 
-class ExportImportTest : public CommonTestUtils::TestsCommon {};
+class ExportOptimalNumStreams : public ::testing::TestWithParam<std::string> {};
 
 std::shared_ptr<ov::Model> MakeMatMulModel() {
     const ov::Shape input_shape = {1, 4096};
@@ -33,9 +33,9 @@ std::shared_ptr<ov::Model> MakeMatMulModel() {
     return std::make_shared<ov::Model>(results, params, "MatMulModel");
 }
 
-TEST(ExportImportTest, ExportOptimalNumStreams) {
+TEST_P(ExportOptimalNumStreams, OptimalNumStreams) {
     auto original_model = MakeMatMulModel();
-    std::string deviceName = "CPU";
+    std::string deviceName = GetParam();
     ov::Core core;
 
     auto tput_mode = ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT);
     auto latency_mode = ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY);
@@ -62,4 +62,7 @@ TEST(ExportImportTest, ExportOptimalNumStreams) {
         EXPECT_EQ(nstreams_latency_original, nstreams_latency_imported);
     }
 }
+
+INSTANTIATE_TEST_CASE_P(smoke_ExportImportTest, ExportOptimalNumStreams, ::testing::Values(std::string("CPU")));
+
 } // namespace