[CPU] Change default latency mode of MTL to Pcore + Ecore (#20659)

2023-12-14 14:19:56 +08:00
parent a04d59d779
commit 5f2045909e
9 changed files with 131 additions and 11 deletions
--- a/src/inference/dev_api/ie_system_conf.h
+++ b/src/inference/dev_api/ie_system_conf.h
@@ -73,6 +73,14 @@ inline int getNumberOfLogicalCPUCores(bool bigCoresOnly = false) {
    return ov::get_number_of_logical_cpu_cores(bigCoresOnly);
 }

+/**
+ * @brief      Returns the number of blocked CPU cores. Note that this is a temporary interface for performance
+ * optimization on a specific platform and may be removed in a future release.
+ * @ingroup    ie_dev_api_system_conf
+ * @return     Number of blocked CPU cores.
+ */
+using ov::get_number_of_blocked_cores;
+
 /**
 * @brief      Checks whether CPU supports SSE 4.2 capability
 * @ingroup    ie_dev_api_system_conf
--- a/src/inference/dev_api/openvino/runtime/system_conf.hpp
+++ b/src/inference/dev_api/openvino/runtime/system_conf.hpp
@@ -61,6 +61,14 @@ OPENVINO_RUNTIME_API int get_number_of_cpu_cores(bool big_cores_only = false);
 */
 OPENVINO_RUNTIME_API int get_number_of_logical_cpu_cores(bool big_cores_only = false);

+/**
+ * @brief      Returns the number of blocked CPU cores. Note that this is a temporary interface for performance
+ * optimization on a specific platform and may be removed in a future release.
+ * @ingroup    ov_dev_api_system_conf
+ * @return     Number of blocked CPU cores.
+ */
+OPENVINO_RUNTIME_API int get_number_of_blocked_cores();
+
 /**
 * @brief      Checks whether CPU supports SSE 4.2 capability
 * @ingroup    ov_dev_api_system_conf
--- a/src/inference/src/os/cpu_map_info.hpp
+++ b/src/inference/src/os/cpu_map_info.hpp
@@ -24,6 +24,7 @@ public:
    int _numa_nodes = 0;
    int _sockets = 0;
    int _cores = 0;
+    int _blocked_cores = 0;
    std::vector<std::vector<int>> _org_proc_type_table;
    std::vector<std::vector<int>> _proc_type_table;
    std::vector<std::vector<int>> _cpu_mapping_table;
@@ -134,6 +135,7 @@ void get_cpu_mapping_from_cores(const int _processors,
 * @param[out] _numa_nodes total number for nodes in system
 * @param[out] _sockets total number for sockets in system
 * @param[out] _cores total number for physical CPU cores in system
+ * @param[out] _blocked_cores total number for blocked processors in system
 * @param[out] _proc_type_table summary table of number of processors per type
 * @param[out] _cpu_mapping_table CPU mapping table for each processor
 * @return
@@ -144,6 +146,7 @@ void parse_processor_info_win(const char* base_ptr,
                              int& _numa_nodes,
                              int& _sockets,
                              int& _cores,
+                              int& _blocked_cores,
                              std::vector<std::vector<int>>& _proc_type_table,
                              std::vector<std::vector<int>>& _cpu_mapping_table);
 #endif
--- a/src/inference/src/os/win/win_system_conf.cpp
+++ b/src/inference/src/os/win/win_system_conf.cpp
@@ -35,6 +35,7 @@ CPU::CPU() {
                             _numa_nodes,
                             _sockets,
                             _cores,
+                             _blocked_cores,
                             _proc_type_table,
                             _cpu_mapping_table);
    _org_proc_type_table = _proc_type_table;
@@ -46,6 +47,7 @@ void parse_processor_info_win(const char* base_ptr,
                              int& _numa_nodes,
                              int& _sockets,
                              int& _cores,
+                              int& _blocked_cores,
                              std::vector<std::vector<int>>& _proc_type_table,
                              std::vector<std::vector<int>>& _cpu_mapping_table) {
    std::vector<int> list;
@@ -63,13 +65,13 @@ void parse_processor_info_win(const char* base_ptr,
    int group_end = 0;
    int group_id = 0;
    int group_type = 0;
-    int num_blocked = 0;

    int num_package = 0;

    _processors = 0;
    _sockets = 0;
    _cores = 0;
+    _blocked_cores = 0;

    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = NULL;

@@ -144,7 +146,7 @@ void parse_processor_info_win(const char* base_ptr,
                    proc_info[CPU_MAP_GROUP_ID] = group_id;
                    if (group_id == CPU_BLOCKED) {
                        proc_info[CPU_MAP_USED_FLAG] = CPU_BLOCKED;
-                        num_blocked++;
+                        _blocked_cores++;
                    } else {
                        _proc_type_table[0][group_type]++;
                    }
@@ -183,7 +185,7 @@ void parse_processor_info_win(const char* base_ptr,
                    _cpu_mapping_table[list[m] + base_proc][CPU_MAP_GROUP_ID] = group_id;
                    _cpu_mapping_table[list[m] + base_proc][CPU_MAP_USED_FLAG] = CPU_BLOCKED;
                }
-                num_blocked++;
+                _blocked_cores++;
            } else if (1 == list_len) {
                if ((_cpu_mapping_table.size() > list[0]) &&
                    (_cpu_mapping_table[list[0] + base_proc][CPU_MAP_CORE_TYPE] == -1)) {
@@ -196,9 +198,9 @@ void parse_processor_info_win(const char* base_ptr,
        }
    }
    _sockets++;
-    _processors -= num_blocked;
-    _cores -= num_blocked;
-    _proc_type_table[0][ALL_PROC] -= num_blocked;
+    _processors -= _blocked_cores;
+    _cores -= _blocked_cores;
+    _proc_type_table[0][ALL_PROC] -= _blocked_cores;
    if (_sockets > 1) {
        _proc_type_table.push_back(_proc_type_table[0]);
        _proc_type_table[0] = proc_init_line;
--- a/src/inference/src/system_conf.cpp
+++ b/src/inference/src/system_conf.cpp
@@ -191,6 +191,11 @@ std::vector<int> get_available_numa_nodes() {
 int get_number_of_logical_cpu_cores(bool) {
    return parallel_get_max_threads();
 }
+
+int get_number_of_blocked_cores() {
+    return 0;
+}
+
 std::vector<std::vector<int>> get_proc_type_table() {
    return {{-1}};
 }
@@ -238,6 +243,11 @@ int get_number_of_logical_cpu_cores(bool) {
    return parallel_get_max_threads();
 }

+int get_number_of_blocked_cores() {
+    CPU& cpu = cpu_info();
+    return cpu._blocked_cores;
+}
+
 bool is_cpu_map_available() {
    CPU& cpu = cpu_info();
    return cpu._proc_type_table.size() > 0;
@@ -444,6 +454,11 @@ int get_number_of_logical_cpu_cores(bool bigCoresOnly) {
    return logical_cores;
 }

+int get_number_of_blocked_cores() {
+    CPU& cpu = cpu_info();
+    return cpu._blocked_cores;
+}
+
 int get_org_socket_id(int socket_id) {
    CPU& cpu = cpu_info();
    auto iter = cpu._socketid_mapping_table.find(socket_id);
@@ -461,7 +476,6 @@ int get_org_numa_id(int numa_node_id) {
    }
    return -1;
 }
-
 #endif

 #if ((OV_THREAD == OV_THREAD_TBB) || (OV_THREAD == OV_THREAD_TBB_AUTO))
--- a/src/inference/tests/unit/cpu_map_parser/parser_windows.cpp
+++ b/src/inference/tests/unit/cpu_map_parser/parser_windows.cpp
@@ -33,6 +33,7 @@ struct WinCpuMapTestCase {
    int _numa_nodes;
    int _sockets;
    int _cores;
+    int _blocked_cores;
    std::vector<std::vector<int>> _proc_type_table;
    std::vector<std::vector<int>> _cpu_mapping_table;
    std::string system_info;
@@ -56,6 +57,7 @@ public:
        int test_numa_nodes = 0;
        int test_sockets = 0;
        int test_cores = 0;
+        int test_blocked_cores = 0;
        unsigned long len = (unsigned long)(test_len / 2);
        std::vector<std::vector<int>> test_proc_type_table;
        std::vector<std::vector<int>> test_cpu_mapping_table;
@@ -66,6 +68,7 @@ public:
                                     test_numa_nodes,
                                     test_sockets,
                                     test_cores,
+                                     test_blocked_cores,
                                     test_proc_type_table,
                                     test_cpu_mapping_table);

@@ -73,6 +76,7 @@ public:
        ASSERT_EQ(test_data._numa_nodes, test_numa_nodes);
        ASSERT_EQ(test_data._sockets, test_sockets);
        ASSERT_EQ(test_data._cores, test_cores);
+        ASSERT_EQ(test_data._blocked_cores, test_blocked_cores);
        ASSERT_EQ(test_data._proc_type_table, test_proc_type_table);
        ASSERT_EQ(test_data._cpu_mapping_table, test_cpu_mapping_table);
    }
@@ -83,6 +87,7 @@ WinCpuMapTestCase _2sockets_104cores_hyperthreading = {
    2,    // param[expected out]: total 2 numa nodes on this simulated platform
    2,    // param[expected out]: total 2 sockets on this simulated platform
    104,  // param[expected out]: total 104 CPU cores on this simulated platform
+    0,    // param[expected out]: total 0 processors on this simulated platform are blocked
    {{208, 104, 0, 104, -1, -1},
     {104, 52, 0, 52, 0, 0},
     {104, 52, 0, 52, 1, 1}},  // param[expected out]: The proc_type_table of this simulated platform
@@ -697,6 +702,7 @@ WinCpuMapTestCase _2sockets_48cores_hyperthreading = {
    2,
    2,
    48,
+    0,
    {{96, 48, 0, 48, -1, -1}, {48, 24, 0, 24, 0, 0}, {48, 24, 0, 24, 1, 1}},
    {
        {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},    {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -987,6 +993,7 @@ WinCpuMapTestCase _2sockets_36cores_hyperthreading = {
    2,
    2,
    36,
+    0,
    {{72, 36, 0, 36, -1, -1}, {36, 18, 0, 18, 0, 0}, {36, 18, 0, 18, 1, 1}},
    {
        {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},    {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1179,6 +1186,7 @@ WinCpuMapTestCase _2sockets_48cores = {
    2,
    2,
    48,
+    0,
    {{48, 48, 0, 0, -1, -1}, {24, 24, 0, 0, 0, 0}, {24, 24, 0, 0, 1, 1}},
    {
        {0, 0, 0, 0, MAIN_CORE_PROC, 0, -1},    {1, 0, 0, 1, MAIN_CORE_PROC, 1, -1},
@@ -1445,6 +1453,7 @@ WinCpuMapTestCase _1sockets_24cores_hyperthreading_set1 = {
    1,
    1,
    24,
+    0,
    {{32, 8, 16, 8, 0, 0}},
    {
        {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},   {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1555,6 +1564,7 @@ WinCpuMapTestCase _1sockets_24cores_hyperthreading_set2 = {
    1,
    1,
    24,
+    0,
    {{32, 8, 16, 8, 0, 0}},
    {
        {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},   {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1676,6 +1686,7 @@ WinCpuMapTestCase _1sockets_22cores_hyperthreading = {
    1,
    1,
    14,
+    2,
    {{20, 6, 8, 6, 0, 0}},
    {
        {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},       {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1763,6 +1774,7 @@ WinCpuMapTestCase _1sockets_14cores_hyperthreading_set1 = {
    1,
    1,
    14,
+    0,
    {{20, 6, 8, 6, 0, 0}},
    {
        {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},  {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1835,6 +1847,7 @@ WinCpuMapTestCase _1sockets_14cores_hyperthreading_set2 = {
    1,
    1,
    14,
+    0,
    {{20, 6, 8, 6, 0, 0}},
    {
        {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},   {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1914,6 +1927,7 @@ WinCpuMapTestCase _1sockets_14cores_hyperthreading_set3 = {
    1,
    1,
    14,
+    0,
    {{20, 6, 8, 6, 0, 0}},
    {
        {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},  {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1993,6 +2007,7 @@ WinCpuMapTestCase _1sockets_10cores_hyperthreading = {
    1,
    1,
    10,
+    0,
    {{12, 2, 8, 2, 0, 0}},
    {
        {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},
@@ -2055,6 +2070,7 @@ WinCpuMapTestCase _1sockets_6cores_hyperthreading_FMT7 = {
    1,
    1,
    6,
+    0,
    {{12, 6, 0, 6, 0, 0}},
    {
        {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},
@@ -2133,6 +2149,7 @@ WinCpuMapTestCase _1sockets_4cores_hyperthreading = {
    1,
    1,
    4,
+    0,
    {{8, 4, 0, 4, 0, 0}},
    {
        {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},
@@ -2171,6 +2188,7 @@ WinCpuMapTestCase _1sockets_4cores_hyperthreading_1_FMT7 = {
    1,
    1,
    4,
+    0,
    {{8, 4, 0, 4, 0, 0}},
    {
        {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},
@@ -2209,6 +2227,7 @@ WinCpuMapTestCase _1sockets_4cores_hyperthreading_2_FMT7 = {
    1,
    1,
    4,
+    0,
    {{8, 4, 0, 4, 0, 0}},
    {
        {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},
@@ -2247,6 +2266,7 @@ WinCpuMapTestCase _1sockets_2cores_hyperthreading_FMT7 = {
    1,
    1,
    2,
+    0,
    {{4, 2, 0, 2, 0, 0}},
    {
        {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},
--- a/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp
+++ b/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp
@@ -15,6 +15,7 @@
 #include "openvino/runtime/threading/cpu_streams_info.hpp"
 #include "openvino/runtime/threading/istreams_executor.hpp"
 #include "performance_heuristics.hpp"
+#include "transformations/utils.hpp"

 using namespace ov;
 using namespace ov::threading;
@@ -477,13 +478,18 @@ int get_model_prefer_threads(const int num_streams,
                model_prefer = proc_type_table[0][ALL_PROC];
            }
 #else
-            bool fp_intesive = !ov::op::util::has_op_with_type<ov::op::v0::FakeQuantize>(model);
+            bool llm_related = has_matmul_with_compressed_weights(model);
+            bool int8_intensive = ov::op::util::has_op_with_type<ov::op::v0::FakeQuantize>(model) || llm_related;
            const int int8_threshold = 4;  // ~relative efficiency of the VNNI-intensive code for Big vs Little cores;
            const int fp32_threshold = 2;  // ~relative efficiency of the AVX2 fp32 code for Big vs Little cores;
-            // by default the latency case uses (faster) Big cores only, depending on the compute ratio
+            // By default the latency case uses the (faster) Big cores only, depending on the compute ratio.
+            // On MTL (detected via a non-zero ov::get_number_of_blocked_cores()), Big and Little cores are used
+            // together in the cases that would otherwise be Big-cores-only, except for LLM models.
            model_prefer = proc_type_table[0][MAIN_CORE_PROC] > (proc_type_table[0][EFFICIENT_CORE_PROC] /
-                                                                 (fp_intesive ? fp32_threshold : int8_threshold))
-                               ? proc_type_table[0][MAIN_CORE_PROC]
+                                                                 (int8_intensive ? int8_threshold : fp32_threshold))
+                               ? ((!llm_related && ov::get_number_of_blocked_cores())
+                                      ? proc_type_table[0][MAIN_CORE_PROC] + proc_type_table[0][EFFICIENT_CORE_PROC]
+                                      : proc_type_table[0][MAIN_CORE_PROC])
                               : proc_type_table[0][MAIN_CORE_PROC] + proc_type_table[0][EFFICIENT_CORE_PROC];
 #endif
        }
--- a/src/plugins/intel_cpu/src/transformations/utils.cpp
+++ b/src/plugins/intel_cpu/src/transformations/utils.cpp
@@ -0,0 +1,44 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "utils.hpp"
+#include "openvino/opsets/opset1.hpp"
+#include "cpu_opset/common/op/fully_connected.hpp"
+#include "transformations/rt_info/dequantization_node.hpp"
+#include "transformations/utils/utils.hpp"
+
+namespace ov {
+namespace intel_cpu {
+
+bool has_matmul_with_compressed_weights(const std::shared_ptr<const ov::Model>& model) {
+    bool has_decompression_multiply = false;
+    auto is_decompression_multiply = [&](ov::Node* node) {
+        if (auto multiply = ov::as_type<ov::op::v1::Multiply>(node)) {
+            if (ov::is_dequantization_node(multiply->shared_from_this()))
+                has_decompression_multiply = true;
+        }
+    };
+
+    for (const auto& op : model->get_ops()) {
+        if (!ov::is_type<ov::op::v0::MatMul>(op) && !ov::is_type<FullyConnectedNode>(op))
+            continue;
+
+        if (!op->get_input_element_type(0).is_real())
+            continue;
+
+        auto weights = op->input_value(1);
+        if (!ov::op::util::is_on_constant_path(weights))
+            continue;
+
+        std::unordered_set<Node*> visited;
+        ov::op::util::visit_constant_path(weights.get_node(), visited, is_decompression_multiply);
+
+        if (has_decompression_multiply)
+            return true;
+    }
+    return false;
+}
+
+}   // namespace intel_cpu
+}   // namespace ov
--- a/src/plugins/intel_cpu/src/transformations/utils.hpp
+++ b/src/plugins/intel_cpu/src/transformations/utils.hpp
@@ -0,0 +1,15 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "openvino/core/model.hpp"
+
+namespace ov {
+namespace intel_cpu {
+
+bool has_matmul_with_compressed_weights(const std::shared_ptr<const ov::Model>& model);
+
+}   // namespace intel_cpu
+}   // namespace ov