[CPU] Change default latency mode of MTL to Pcore + Ecore (#20659)
This commit is contained in:
@@ -73,6 +73,14 @@ inline int getNumberOfLogicalCPUCores(bool bigCoresOnly = false) {
|
||||
return ov::get_number_of_logical_cpu_cores(bigCoresOnly);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Returns the number of blocked CPU cores. Please note that this is a temporary interface for performance
|
||||
* optimization on a specific platform. It may be removed in a future release.
|
||||
* @ingroup ov_dev_api_system_conf
|
||||
* @return Number of blocked CPU cores.
|
||||
*/
|
||||
using ov::get_number_of_blocked_cores;
|
||||
|
||||
/**
|
||||
* @brief Checks whether CPU supports SSE 4.2 capability
|
||||
* @ingroup ie_dev_api_system_conf
|
||||
|
||||
@@ -61,6 +61,14 @@ OPENVINO_RUNTIME_API int get_number_of_cpu_cores(bool big_cores_only = false);
|
||||
*/
|
||||
OPENVINO_RUNTIME_API int get_number_of_logical_cpu_cores(bool big_cores_only = false);
|
||||
|
||||
/**
|
||||
* @brief Returns the number of blocked CPU cores. Please note that this is a temporary interface for performance
|
||||
* optimization on a specific platform. It may be removed in a future release.
|
||||
* @ingroup ov_dev_api_system_conf
|
||||
* @return Number of blocked CPU cores.
|
||||
*/
|
||||
OPENVINO_RUNTIME_API int get_number_of_blocked_cores();
|
||||
|
||||
/**
|
||||
* @brief Checks whether CPU supports SSE 4.2 capability
|
||||
* @ingroup ov_dev_api_system_conf
|
||||
|
||||
@@ -24,6 +24,7 @@ public:
|
||||
int _numa_nodes = 0;
|
||||
int _sockets = 0;
|
||||
int _cores = 0;
|
||||
int _blocked_cores = 0;
|
||||
std::vector<std::vector<int>> _org_proc_type_table;
|
||||
std::vector<std::vector<int>> _proc_type_table;
|
||||
std::vector<std::vector<int>> _cpu_mapping_table;
|
||||
@@ -134,6 +135,7 @@ void get_cpu_mapping_from_cores(const int _processors,
|
||||
* @param[out] _numa_nodes total number of NUMA nodes in the system
|
||||
* @param[out] _sockets total number of sockets in the system
|
||||
* @param[out] _cores total number of physical CPU cores in the system
|
||||
* @param[out] _blocked_cores total number of blocked processors in the system
|
||||
* @param[out] _proc_type_table summary table of number of processors per type
|
||||
* @param[out] _cpu_mapping_table CPU mapping table for each processor
|
||||
* @return
|
||||
@@ -144,6 +146,7 @@ void parse_processor_info_win(const char* base_ptr,
|
||||
int& _numa_nodes,
|
||||
int& _sockets,
|
||||
int& _cores,
|
||||
int& _blocked_cores,
|
||||
std::vector<std::vector<int>>& _proc_type_table,
|
||||
std::vector<std::vector<int>>& _cpu_mapping_table);
|
||||
#endif
|
||||
|
||||
@@ -35,6 +35,7 @@ CPU::CPU() {
|
||||
_numa_nodes,
|
||||
_sockets,
|
||||
_cores,
|
||||
_blocked_cores,
|
||||
_proc_type_table,
|
||||
_cpu_mapping_table);
|
||||
_org_proc_type_table = _proc_type_table;
|
||||
@@ -46,6 +47,7 @@ void parse_processor_info_win(const char* base_ptr,
|
||||
int& _numa_nodes,
|
||||
int& _sockets,
|
||||
int& _cores,
|
||||
int& _blocked_cores,
|
||||
std::vector<std::vector<int>>& _proc_type_table,
|
||||
std::vector<std::vector<int>>& _cpu_mapping_table) {
|
||||
std::vector<int> list;
|
||||
@@ -63,13 +65,13 @@ void parse_processor_info_win(const char* base_ptr,
|
||||
int group_end = 0;
|
||||
int group_id = 0;
|
||||
int group_type = 0;
|
||||
int num_blocked = 0;
|
||||
|
||||
int num_package = 0;
|
||||
|
||||
_processors = 0;
|
||||
_sockets = 0;
|
||||
_cores = 0;
|
||||
_blocked_cores = 0;
|
||||
|
||||
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = NULL;
|
||||
|
||||
@@ -144,7 +146,7 @@ void parse_processor_info_win(const char* base_ptr,
|
||||
proc_info[CPU_MAP_GROUP_ID] = group_id;
|
||||
if (group_id == CPU_BLOCKED) {
|
||||
proc_info[CPU_MAP_USED_FLAG] = CPU_BLOCKED;
|
||||
num_blocked++;
|
||||
_blocked_cores++;
|
||||
} else {
|
||||
_proc_type_table[0][group_type]++;
|
||||
}
|
||||
@@ -183,7 +185,7 @@ void parse_processor_info_win(const char* base_ptr,
|
||||
_cpu_mapping_table[list[m] + base_proc][CPU_MAP_GROUP_ID] = group_id;
|
||||
_cpu_mapping_table[list[m] + base_proc][CPU_MAP_USED_FLAG] = CPU_BLOCKED;
|
||||
}
|
||||
num_blocked++;
|
||||
_blocked_cores++;
|
||||
} else if (1 == list_len) {
|
||||
if ((_cpu_mapping_table.size() > list[0]) &&
|
||||
(_cpu_mapping_table[list[0] + base_proc][CPU_MAP_CORE_TYPE] == -1)) {
|
||||
@@ -196,9 +198,9 @@ void parse_processor_info_win(const char* base_ptr,
|
||||
}
|
||||
}
|
||||
_sockets++;
|
||||
_processors -= num_blocked;
|
||||
_cores -= num_blocked;
|
||||
_proc_type_table[0][ALL_PROC] -= num_blocked;
|
||||
_processors -= _blocked_cores;
|
||||
_cores -= _blocked_cores;
|
||||
_proc_type_table[0][ALL_PROC] -= _blocked_cores;
|
||||
if (_sockets > 1) {
|
||||
_proc_type_table.push_back(_proc_type_table[0]);
|
||||
_proc_type_table[0] = proc_init_line;
|
||||
|
||||
@@ -191,6 +191,11 @@ std::vector<int> get_available_numa_nodes() {
|
||||
// Variant that ignores its bool argument (the parameter is unnamed) and simply
// reports whatever parallel_get_max_threads() returns.
// NOTE(review): which build configuration selects this variant is not visible here — confirm
// against the enclosing preprocessor branch.
int get_number_of_logical_cpu_cores(bool) {
|
||||
return parallel_get_max_threads();
|
||||
}
|
||||
|
||||
// Variant that always reports zero blocked cores; it does not consult any CPU information.
// NOTE(review): presumably the branch for platforms without a CPU mapping table — confirm
// against the enclosing preprocessor branch, which is not visible here.
int get_number_of_blocked_cores() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Variant that returns a fixed table with a single row holding the single value -1.
// NOTE(review): -1 presumably marks "processor type information unavailable" — confirm with callers.
std::vector<std::vector<int>> get_proc_type_table() {
|
||||
return {{-1}};
|
||||
}
|
||||
@@ -238,6 +243,11 @@ int get_number_of_logical_cpu_cores(bool) {
|
||||
return parallel_get_max_threads();
|
||||
}
|
||||
|
||||
// Returns the _blocked_cores value stored in the CPU object obtained from cpu_info().
int get_number_of_blocked_cores() {
|
||||
CPU& cpu = cpu_info();
|
||||
return cpu._blocked_cores;
|
||||
}
|
||||
|
||||
bool is_cpu_map_available() {
|
||||
CPU& cpu = cpu_info();
|
||||
return cpu._proc_type_table.size() > 0;
|
||||
@@ -444,6 +454,11 @@ int get_number_of_logical_cpu_cores(bool bigCoresOnly) {
|
||||
return logical_cores;
|
||||
}
|
||||
|
||||
// Returns the _blocked_cores value stored in the CPU object obtained from cpu_info().
int get_number_of_blocked_cores() {
|
||||
CPU& cpu = cpu_info();
|
||||
return cpu._blocked_cores;
|
||||
}
|
||||
|
||||
int get_org_socket_id(int socket_id) {
|
||||
CPU& cpu = cpu_info();
|
||||
auto iter = cpu._socketid_mapping_table.find(socket_id);
|
||||
@@ -461,7 +476,6 @@ int get_org_numa_id(int numa_node_id) {
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if ((OV_THREAD == OV_THREAD_TBB) || (OV_THREAD == OV_THREAD_TBB_AUTO))
|
||||
|
||||
@@ -33,6 +33,7 @@ struct WinCpuMapTestCase {
|
||||
int _numa_nodes;
|
||||
int _sockets;
|
||||
int _cores;
|
||||
int _blocked_cores;
|
||||
std::vector<std::vector<int>> _proc_type_table;
|
||||
std::vector<std::vector<int>> _cpu_mapping_table;
|
||||
std::string system_info;
|
||||
@@ -56,6 +57,7 @@ public:
|
||||
int test_numa_nodes = 0;
|
||||
int test_sockets = 0;
|
||||
int test_cores = 0;
|
||||
int test_blocked_cores = 0;
|
||||
unsigned long len = (unsigned long)(test_len / 2);
|
||||
std::vector<std::vector<int>> test_proc_type_table;
|
||||
std::vector<std::vector<int>> test_cpu_mapping_table;
|
||||
@@ -66,6 +68,7 @@ public:
|
||||
test_numa_nodes,
|
||||
test_sockets,
|
||||
test_cores,
|
||||
test_blocked_cores,
|
||||
test_proc_type_table,
|
||||
test_cpu_mapping_table);
|
||||
|
||||
@@ -73,6 +76,7 @@ public:
|
||||
ASSERT_EQ(test_data._numa_nodes, test_numa_nodes);
|
||||
ASSERT_EQ(test_data._sockets, test_sockets);
|
||||
ASSERT_EQ(test_data._cores, test_cores);
|
||||
ASSERT_EQ(test_data._blocked_cores, test_blocked_cores);
|
||||
ASSERT_EQ(test_data._proc_type_table, test_proc_type_table);
|
||||
ASSERT_EQ(test_data._cpu_mapping_table, test_cpu_mapping_table);
|
||||
}
|
||||
@@ -83,6 +87,7 @@ WinCpuMapTestCase _2sockets_104cores_hyperthreading = {
|
||||
2, // param[expected out]: total 2 numa nodes on this simulated platform
|
||||
2, // param[expected out]: total 2 sockets on this simulated platform
|
||||
104, // param[expected out]: total 104 CPU cores on this simulated platform
|
||||
0, // param[expected out]: total 0 processors on this simulated platform are blocked
|
||||
{{208, 104, 0, 104, -1, -1},
|
||||
{104, 52, 0, 52, 0, 0},
|
||||
{104, 52, 0, 52, 1, 1}}, // param[expected out]: The proc_type_table of this simulated platform
|
||||
@@ -697,6 +702,7 @@ WinCpuMapTestCase _2sockets_48cores_hyperthreading = {
|
||||
2,
|
||||
2,
|
||||
48,
|
||||
0,
|
||||
{{96, 48, 0, 48, -1, -1}, {48, 24, 0, 24, 0, 0}, {48, 24, 0, 24, 1, 1}},
|
||||
{
|
||||
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
|
||||
@@ -987,6 +993,7 @@ WinCpuMapTestCase _2sockets_36cores_hyperthreading = {
|
||||
2,
|
||||
2,
|
||||
36,
|
||||
0,
|
||||
{{72, 36, 0, 36, -1, -1}, {36, 18, 0, 18, 0, 0}, {36, 18, 0, 18, 1, 1}},
|
||||
{
|
||||
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
|
||||
@@ -1179,6 +1186,7 @@ WinCpuMapTestCase _2sockets_48cores = {
|
||||
2,
|
||||
2,
|
||||
48,
|
||||
0,
|
||||
{{48, 48, 0, 0, -1, -1}, {24, 24, 0, 0, 0, 0}, {24, 24, 0, 0, 1, 1}},
|
||||
{
|
||||
{0, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, {1, 0, 0, 1, MAIN_CORE_PROC, 1, -1},
|
||||
@@ -1445,6 +1453,7 @@ WinCpuMapTestCase _1sockets_24cores_hyperthreading_set1 = {
|
||||
1,
|
||||
1,
|
||||
24,
|
||||
0,
|
||||
{{32, 8, 16, 8, 0, 0}},
|
||||
{
|
||||
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
|
||||
@@ -1555,6 +1564,7 @@ WinCpuMapTestCase _1sockets_24cores_hyperthreading_set2 = {
|
||||
1,
|
||||
1,
|
||||
24,
|
||||
0,
|
||||
{{32, 8, 16, 8, 0, 0}},
|
||||
{
|
||||
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
|
||||
@@ -1676,6 +1686,7 @@ WinCpuMapTestCase _1sockets_22cores_hyperthreading = {
|
||||
1,
|
||||
1,
|
||||
14,
|
||||
2,
|
||||
{{20, 6, 8, 6, 0, 0}},
|
||||
{
|
||||
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
|
||||
@@ -1763,6 +1774,7 @@ WinCpuMapTestCase _1sockets_14cores_hyperthreading_set1 = {
|
||||
1,
|
||||
1,
|
||||
14,
|
||||
0,
|
||||
{{20, 6, 8, 6, 0, 0}},
|
||||
{
|
||||
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
|
||||
@@ -1835,6 +1847,7 @@ WinCpuMapTestCase _1sockets_14cores_hyperthreading_set2 = {
|
||||
1,
|
||||
1,
|
||||
14,
|
||||
0,
|
||||
{{20, 6, 8, 6, 0, 0}},
|
||||
{
|
||||
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
|
||||
@@ -1914,6 +1927,7 @@ WinCpuMapTestCase _1sockets_14cores_hyperthreading_set3 = {
|
||||
1,
|
||||
1,
|
||||
14,
|
||||
0,
|
||||
{{20, 6, 8, 6, 0, 0}},
|
||||
{
|
||||
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
|
||||
@@ -1993,6 +2007,7 @@ WinCpuMapTestCase _1sockets_10cores_hyperthreading = {
|
||||
1,
|
||||
1,
|
||||
10,
|
||||
0,
|
||||
{{12, 2, 8, 2, 0, 0}},
|
||||
{
|
||||
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},
|
||||
@@ -2055,6 +2070,7 @@ WinCpuMapTestCase _1sockets_6cores_hyperthreading_FMT7 = {
|
||||
1,
|
||||
1,
|
||||
6,
|
||||
0,
|
||||
{{12, 6, 0, 6, 0, 0}},
|
||||
{
|
||||
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},
|
||||
@@ -2133,6 +2149,7 @@ WinCpuMapTestCase _1sockets_4cores_hyperthreading = {
|
||||
1,
|
||||
1,
|
||||
4,
|
||||
0,
|
||||
{{8, 4, 0, 4, 0, 0}},
|
||||
{
|
||||
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},
|
||||
@@ -2171,6 +2188,7 @@ WinCpuMapTestCase _1sockets_4cores_hyperthreading_1_FMT7 = {
|
||||
1,
|
||||
1,
|
||||
4,
|
||||
0,
|
||||
{{8, 4, 0, 4, 0, 0}},
|
||||
{
|
||||
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},
|
||||
@@ -2209,6 +2227,7 @@ WinCpuMapTestCase _1sockets_4cores_hyperthreading_2_FMT7 = {
|
||||
1,
|
||||
1,
|
||||
4,
|
||||
0,
|
||||
{{8, 4, 0, 4, 0, 0}},
|
||||
{
|
||||
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},
|
||||
@@ -2247,6 +2266,7 @@ WinCpuMapTestCase _1sockets_2cores_hyperthreading_FMT7 = {
|
||||
1,
|
||||
1,
|
||||
2,
|
||||
0,
|
||||
{{4, 2, 0, 2, 0, 0}},
|
||||
{
|
||||
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
#include "openvino/runtime/threading/cpu_streams_info.hpp"
|
||||
#include "openvino/runtime/threading/istreams_executor.hpp"
|
||||
#include "performance_heuristics.hpp"
|
||||
#include "transformations/utils.hpp"
|
||||
|
||||
using namespace ov;
|
||||
using namespace ov::threading;
|
||||
@@ -477,13 +478,18 @@ int get_model_prefer_threads(const int num_streams,
|
||||
model_prefer = proc_type_table[0][ALL_PROC];
|
||||
}
|
||||
#else
|
||||
bool fp_intesive = !ov::op::util::has_op_with_type<ov::op::v0::FakeQuantize>(model);
|
||||
bool llm_related = has_matmul_with_compressed_weights(model);
|
||||
bool int8_intensive = ov::op::util::has_op_with_type<ov::op::v0::FakeQuantize>(model) || llm_related;
|
||||
const int int8_threshold = 4; // ~relative efficiency of the VNNI-intensive code for Big vs Little cores;
|
||||
const int fp32_threshold = 2; // ~relative efficiency of the AVX2 fp32 code for Big vs Little cores;
|
||||
// by default the latency case uses (faster) Big cores only, depending on the compute ratio
|
||||
// By default the latency case uses (faster) Big cores only, depending on the compute ratio
|
||||
// However, on MTL (detected by a non-zero ov::get_number_of_blocked_cores()), use Big and Little cores together
|
||||
// in the cases that would otherwise use Big cores only, except for LLM-related models.
|
||||
model_prefer = proc_type_table[0][MAIN_CORE_PROC] > (proc_type_table[0][EFFICIENT_CORE_PROC] /
|
||||
(fp_intesive ? fp32_threshold : int8_threshold))
|
||||
? proc_type_table[0][MAIN_CORE_PROC]
|
||||
(int8_intensive ? int8_threshold : fp32_threshold))
|
||||
? ((!llm_related && ov::get_number_of_blocked_cores())
|
||||
? proc_type_table[0][MAIN_CORE_PROC] + proc_type_table[0][EFFICIENT_CORE_PROC]
|
||||
: proc_type_table[0][MAIN_CORE_PROC])
|
||||
: proc_type_table[0][MAIN_CORE_PROC] + proc_type_table[0][EFFICIENT_CORE_PROC];
|
||||
#endif
|
||||
}
|
||||
|
||||
44
src/plugins/intel_cpu/src/transformations/utils.cpp
Normal file
44
src/plugins/intel_cpu/src/transformations/utils.cpp
Normal file
@@ -0,0 +1,44 @@
|
||||
// Copyright (C) 2018-2023 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "utils.hpp"
|
||||
#include "openvino/opsets/opset1.hpp"
|
||||
#include "cpu_opset/common/op/fully_connected.hpp"
|
||||
#include "transformations/rt_info/dequantization_node.hpp"
|
||||
#include "transformations/utils/utils.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace intel_cpu {
|
||||
|
||||
// Reports whether the model contains a MatMul or FullyConnectedNode op whose weights
// (input 1) are on a constant path on which a dequantization Multiply is found.
//
// @param model Model whose ops are scanned via model->get_ops().
// @return true as soon as one qualifying op is found, false if none is found.
//
// NOTE(review): std::unordered_set is used below but <unordered_set> is not among the
// includes visible in this file — presumably pulled in transitively; confirm.
bool has_matmul_with_compressed_weights(const std::shared_ptr<const ov::Model>& model) {
|
||||
// Flag written by the visitor lambda (captured by reference). It is never reset, but the
// loop below returns as soon as it becomes true.
bool has_decompression_multiply = false;
|
||||
// Visitor: sets the flag when the visited node is a v1::Multiply that
// ov::is_dequantization_node() accepts.
auto is_decompression_multiply = [&](ov::Node* node) {
|
||||
if (auto multiply = ov::as_type<ov::op::v1::Multiply>(node)) {
|
||||
if (ov::is_dequantization_node(multiply->shared_from_this()))
|
||||
has_decompression_multiply = true;
|
||||
}
|
||||
};
|
||||
|
||||
for (const auto& op : model->get_ops()) {
|
||||
// Only MatMul and FullyConnectedNode ops are considered.
if (!ov::is_type<ov::op::v0::MatMul>(op) && !ov::is_type<FullyConnectedNode>(op))
|
||||
continue;
|
||||
|
||||
// Skip ops whose input 0 element type is not real (is_real() returns false).
if (!op->get_input_element_type(0).is_real())
|
||||
continue;
|
||||
|
||||
// Weights are taken from input 1; skip the op unless they are on a constant path.
auto weights = op->input_value(1);
|
||||
if (!ov::op::util::is_on_constant_path(weights))
|
||||
continue;
|
||||
|
||||
// Apply the visitor along the weights' constant path, using a fresh visited set per op.
// NOTE(review): presumably visit_constant_path walks the whole constant subgraph feeding
// the weights — confirm against its definition.
std::unordered_set<Node*> visited;
|
||||
ov::op::util::visit_constant_path(weights.get_node(), visited, is_decompression_multiply);
|
||||
|
||||
if (has_decompression_multiply)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
} // namespace intel_cpu
|
||||
} // namespace ov
|
||||
15
src/plugins/intel_cpu/src/transformations/utils.hpp
Normal file
15
src/plugins/intel_cpu/src/transformations/utils.hpp
Normal file
@@ -0,0 +1,15 @@
|
||||
// Copyright (C) 2018-2023 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "openvino/core/model.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace intel_cpu {
|
||||
|
||||
bool has_matmul_with_compressed_weights(const std::shared_ptr<const ov::Model>& model);
|
||||
|
||||
} // namespace intel_cpu
|
||||
} // namespace ov
|
||||
Reference in New Issue
Block a user