[CPU] Change default latency mode of MTL to Pcore + Ecore (#20659)

This commit is contained in:
Wanglei Shen
2023-12-14 14:19:56 +08:00
committed by GitHub
parent a04d59d779
commit 5f2045909e
9 changed files with 131 additions and 11 deletions

View File

@@ -73,6 +73,14 @@ inline int getNumberOfLogicalCPUCores(bool bigCoresOnly = false) {
return ov::get_number_of_logical_cpu_cores(bigCoresOnly);
}
/**
* @brief Returns the number of blocked CPU cores.
* @note This is a temporary interface for performance optimization on a specific platform.
* It may be removed in a future release.
* @ingroup ov_dev_api_system_conf
* @return Number of blocked CPU cores.
*/
using ov::get_number_of_blocked_cores;
/**
* @brief Checks whether CPU supports SSE 4.2 capability
* @ingroup ie_dev_api_system_conf

View File

@@ -61,6 +61,14 @@ OPENVINO_RUNTIME_API int get_number_of_cpu_cores(bool big_cores_only = false);
*/
OPENVINO_RUNTIME_API int get_number_of_logical_cpu_cores(bool big_cores_only = false);
/**
* @brief Returns the number of blocked CPU cores.
* @note This is a temporary interface for performance optimization on a specific platform.
* It may be removed in a future release.
* @ingroup ov_dev_api_system_conf
* @return Number of blocked CPU cores.
*/
OPENVINO_RUNTIME_API int get_number_of_blocked_cores();
/**
* @brief Checks whether CPU supports SSE 4.2 capability
* @ingroup ov_dev_api_system_conf

View File

@@ -24,6 +24,7 @@ public:
int _numa_nodes = 0;
int _sockets = 0;
int _cores = 0;
int _blocked_cores = 0;
std::vector<std::vector<int>> _org_proc_type_table;
std::vector<std::vector<int>> _proc_type_table;
std::vector<std::vector<int>> _cpu_mapping_table;
@@ -134,6 +135,7 @@ void get_cpu_mapping_from_cores(const int _processors,
* @param[out] _numa_nodes total number of nodes in system
* @param[out] _sockets total number of sockets in system
* @param[out] _cores total number of physical CPU cores in system
* @param[out] _blocked_cores total number of blocked processors in system
* @param[out] _proc_type_table summary table of number of processors per type
* @param[out] _cpu_mapping_table CPU mapping table for each processor
* @return
@@ -144,6 +146,7 @@ void parse_processor_info_win(const char* base_ptr,
int& _numa_nodes,
int& _sockets,
int& _cores,
int& _blocked_cores,
std::vector<std::vector<int>>& _proc_type_table,
std::vector<std::vector<int>>& _cpu_mapping_table);
#endif

View File

@@ -35,6 +35,7 @@ CPU::CPU() {
_numa_nodes,
_sockets,
_cores,
_blocked_cores,
_proc_type_table,
_cpu_mapping_table);
_org_proc_type_table = _proc_type_table;
@@ -46,6 +47,7 @@ void parse_processor_info_win(const char* base_ptr,
int& _numa_nodes,
int& _sockets,
int& _cores,
int& _blocked_cores,
std::vector<std::vector<int>>& _proc_type_table,
std::vector<std::vector<int>>& _cpu_mapping_table) {
std::vector<int> list;
@@ -63,13 +65,13 @@ void parse_processor_info_win(const char* base_ptr,
int group_end = 0;
int group_id = 0;
int group_type = 0;
int num_blocked = 0;
int num_package = 0;
_processors = 0;
_sockets = 0;
_cores = 0;
_blocked_cores = 0;
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = NULL;
@@ -144,7 +146,7 @@ void parse_processor_info_win(const char* base_ptr,
proc_info[CPU_MAP_GROUP_ID] = group_id;
if (group_id == CPU_BLOCKED) {
proc_info[CPU_MAP_USED_FLAG] = CPU_BLOCKED;
num_blocked++;
_blocked_cores++;
} else {
_proc_type_table[0][group_type]++;
}
@@ -183,7 +185,7 @@ void parse_processor_info_win(const char* base_ptr,
_cpu_mapping_table[list[m] + base_proc][CPU_MAP_GROUP_ID] = group_id;
_cpu_mapping_table[list[m] + base_proc][CPU_MAP_USED_FLAG] = CPU_BLOCKED;
}
num_blocked++;
_blocked_cores++;
} else if (1 == list_len) {
if ((_cpu_mapping_table.size() > list[0]) &&
(_cpu_mapping_table[list[0] + base_proc][CPU_MAP_CORE_TYPE] == -1)) {
@@ -196,9 +198,9 @@ void parse_processor_info_win(const char* base_ptr,
}
}
_sockets++;
_processors -= num_blocked;
_cores -= num_blocked;
_proc_type_table[0][ALL_PROC] -= num_blocked;
_processors -= _blocked_cores;
_cores -= _blocked_cores;
_proc_type_table[0][ALL_PROC] -= _blocked_cores;
if (_sockets > 1) {
_proc_type_table.push_back(_proc_type_table[0]);
_proc_type_table[0] = proc_init_line;

View File

@@ -191,6 +191,11 @@ std::vector<int> get_available_numa_nodes() {
// Reports the value of parallel_get_max_threads(); the unnamed bool argument is not used.
int get_number_of_logical_cpu_cores(bool) {
    const auto max_threads = parallel_get_max_threads();
    return max_threads;
}
// This variant always reports zero blocked cores.
int get_number_of_blocked_cores() {
    constexpr int blocked_core_count = 0;
    return blocked_core_count;
}
// Returns a table holding a single row whose only entry is -1.
std::vector<std::vector<int>> get_proc_type_table() {
    std::vector<std::vector<int>> placeholder_table;
    placeholder_table.push_back(std::vector<int>{-1});
    return placeholder_table;
}
@@ -238,6 +243,11 @@ int get_number_of_logical_cpu_cores(bool) {
return parallel_get_max_threads();
}
// Returns the _blocked_cores value held by the CPU object obtained from cpu_info().
int get_number_of_blocked_cores() {
    return cpu_info()._blocked_cores;
}
bool is_cpu_map_available() {
CPU& cpu = cpu_info();
return cpu._proc_type_table.size() > 0;
@@ -444,6 +454,11 @@ int get_number_of_logical_cpu_cores(bool bigCoresOnly) {
return logical_cores;
}
// Returns the _blocked_cores value held by the CPU object obtained from cpu_info().
int get_number_of_blocked_cores() {
    return cpu_info()._blocked_cores;
}
int get_org_socket_id(int socket_id) {
CPU& cpu = cpu_info();
auto iter = cpu._socketid_mapping_table.find(socket_id);
@@ -461,7 +476,6 @@ int get_org_numa_id(int numa_node_id) {
}
return -1;
}
#endif
#if ((OV_THREAD == OV_THREAD_TBB) || (OV_THREAD == OV_THREAD_TBB_AUTO))

View File

@@ -33,6 +33,7 @@ struct WinCpuMapTestCase {
int _numa_nodes;
int _sockets;
int _cores;
int _blocked_cores;
std::vector<std::vector<int>> _proc_type_table;
std::vector<std::vector<int>> _cpu_mapping_table;
std::string system_info;
@@ -56,6 +57,7 @@ public:
int test_numa_nodes = 0;
int test_sockets = 0;
int test_cores = 0;
int test_blocked_cores = 0;
unsigned long len = (unsigned long)(test_len / 2);
std::vector<std::vector<int>> test_proc_type_table;
std::vector<std::vector<int>> test_cpu_mapping_table;
@@ -66,6 +68,7 @@ public:
test_numa_nodes,
test_sockets,
test_cores,
test_blocked_cores,
test_proc_type_table,
test_cpu_mapping_table);
@@ -73,6 +76,7 @@ public:
ASSERT_EQ(test_data._numa_nodes, test_numa_nodes);
ASSERT_EQ(test_data._sockets, test_sockets);
ASSERT_EQ(test_data._cores, test_cores);
ASSERT_EQ(test_data._blocked_cores, test_blocked_cores);
ASSERT_EQ(test_data._proc_type_table, test_proc_type_table);
ASSERT_EQ(test_data._cpu_mapping_table, test_cpu_mapping_table);
}
@@ -83,6 +87,7 @@ WinCpuMapTestCase _2sockets_104cores_hyperthreading = {
2, // param[expected out]: total 2 numa nodes on this simulated platform
2, // param[expected out]: total 2 sockets on this simulated platform
104, // param[expected out]: total 104 CPU cores on this simulated platform
0, // param[expected out]: total 0 processors on this simulated platform are blocked
{{208, 104, 0, 104, -1, -1},
{104, 52, 0, 52, 0, 0},
{104, 52, 0, 52, 1, 1}}, // param[expected out]: The proc_type_table of this simulated platform
@@ -697,6 +702,7 @@ WinCpuMapTestCase _2sockets_48cores_hyperthreading = {
2,
2,
48,
0,
{{96, 48, 0, 48, -1, -1}, {48, 24, 0, 24, 0, 0}, {48, 24, 0, 24, 1, 1}},
{
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -987,6 +993,7 @@ WinCpuMapTestCase _2sockets_36cores_hyperthreading = {
2,
2,
36,
0,
{{72, 36, 0, 36, -1, -1}, {36, 18, 0, 18, 0, 0}, {36, 18, 0, 18, 1, 1}},
{
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1179,6 +1186,7 @@ WinCpuMapTestCase _2sockets_48cores = {
2,
2,
48,
0,
{{48, 48, 0, 0, -1, -1}, {24, 24, 0, 0, 0, 0}, {24, 24, 0, 0, 1, 1}},
{
{0, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, {1, 0, 0, 1, MAIN_CORE_PROC, 1, -1},
@@ -1445,6 +1453,7 @@ WinCpuMapTestCase _1sockets_24cores_hyperthreading_set1 = {
1,
1,
24,
0,
{{32, 8, 16, 8, 0, 0}},
{
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1555,6 +1564,7 @@ WinCpuMapTestCase _1sockets_24cores_hyperthreading_set2 = {
1,
1,
24,
0,
{{32, 8, 16, 8, 0, 0}},
{
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1676,6 +1686,7 @@ WinCpuMapTestCase _1sockets_22cores_hyperthreading = {
1,
1,
14,
2,
{{20, 6, 8, 6, 0, 0}},
{
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1763,6 +1774,7 @@ WinCpuMapTestCase _1sockets_14cores_hyperthreading_set1 = {
1,
1,
14,
0,
{{20, 6, 8, 6, 0, 0}},
{
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1835,6 +1847,7 @@ WinCpuMapTestCase _1sockets_14cores_hyperthreading_set2 = {
1,
1,
14,
0,
{{20, 6, 8, 6, 0, 0}},
{
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1914,6 +1927,7 @@ WinCpuMapTestCase _1sockets_14cores_hyperthreading_set3 = {
1,
1,
14,
0,
{{20, 6, 8, 6, 0, 0}},
{
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1},
@@ -1993,6 +2007,7 @@ WinCpuMapTestCase _1sockets_10cores_hyperthreading = {
1,
1,
10,
0,
{{12, 2, 8, 2, 0, 0}},
{
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},
@@ -2055,6 +2070,7 @@ WinCpuMapTestCase _1sockets_6cores_hyperthreading_FMT7 = {
1,
1,
6,
0,
{{12, 6, 0, 6, 0, 0}},
{
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},
@@ -2133,6 +2149,7 @@ WinCpuMapTestCase _1sockets_4cores_hyperthreading = {
1,
1,
4,
0,
{{8, 4, 0, 4, 0, 0}},
{
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},
@@ -2171,6 +2188,7 @@ WinCpuMapTestCase _1sockets_4cores_hyperthreading_1_FMT7 = {
1,
1,
4,
0,
{{8, 4, 0, 4, 0, 0}},
{
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},
@@ -2209,6 +2227,7 @@ WinCpuMapTestCase _1sockets_4cores_hyperthreading_2_FMT7 = {
1,
1,
4,
0,
{{8, 4, 0, 4, 0, 0}},
{
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},
@@ -2247,6 +2266,7 @@ WinCpuMapTestCase _1sockets_2cores_hyperthreading_FMT7 = {
1,
1,
2,
0,
{{4, 2, 0, 2, 0, 0}},
{
{0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1},

View File

@@ -15,6 +15,7 @@
#include "openvino/runtime/threading/cpu_streams_info.hpp"
#include "openvino/runtime/threading/istreams_executor.hpp"
#include "performance_heuristics.hpp"
#include "transformations/utils.hpp"
using namespace ov;
using namespace ov::threading;
@@ -477,13 +478,18 @@ int get_model_prefer_threads(const int num_streams,
model_prefer = proc_type_table[0][ALL_PROC];
}
#else
bool fp_intesive = !ov::op::util::has_op_with_type<ov::op::v0::FakeQuantize>(model);
bool llm_related = has_matmul_with_compressed_weights(model);
bool int8_intensive = ov::op::util::has_op_with_type<ov::op::v0::FakeQuantize>(model) || llm_related;
const int int8_threshold = 4; // ~relative efficiency of the VNNI-intensive code for Big vs Little cores;
const int fp32_threshold = 2; // ~relative efficiency of the AVX2 fp32 code for Big vs Little cores;
// by default the latency case uses (faster) Big cores only, depending on the compute ratio
// By default the latency case uses the (faster) Big cores only, depending on the compute ratio.
// On MTL (detected via ov::get_number_of_blocked_cores()), the cases that would otherwise use Big cores only
// use Big and Little cores together, except for LLMs.
model_prefer = proc_type_table[0][MAIN_CORE_PROC] > (proc_type_table[0][EFFICIENT_CORE_PROC] /
(fp_intesive ? fp32_threshold : int8_threshold))
? proc_type_table[0][MAIN_CORE_PROC]
(int8_intensive ? int8_threshold : fp32_threshold))
? ((!llm_related && ov::get_number_of_blocked_cores())
? proc_type_table[0][MAIN_CORE_PROC] + proc_type_table[0][EFFICIENT_CORE_PROC]
: proc_type_table[0][MAIN_CORE_PROC])
: proc_type_table[0][MAIN_CORE_PROC] + proc_type_table[0][EFFICIENT_CORE_PROC];
#endif
}

View File

@@ -0,0 +1,44 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "utils.hpp"
#include "openvino/opsets/opset1.hpp"
#include "cpu_opset/common/op/fully_connected.hpp"
#include "transformations/rt_info/dequantization_node.hpp"
#include "transformations/utils/utils.hpp"
namespace ov {
namespace intel_cpu {
// Reports whether the model contains a MatMul or FullyConnectedNode operation whose first input has a real
// element type and whose second input (the weights) lies on a constant path that includes a Multiply
// recognized by ov::is_dequantization_node(). Returns true for the first such operation, false otherwise.
bool has_matmul_with_compressed_weights(const std::shared_ptr<const ov::Model>& model) {
    // Set by the visitor below as soon as a dequantization Multiply is seen on a weights path.
    bool found_decompression = false;
    auto mark_decompression = [&found_decompression](ov::Node* candidate) {
        auto mul = ov::as_type<ov::op::v1::Multiply>(candidate);
        if (mul && ov::is_dequantization_node(mul->shared_from_this())) {
            found_decompression = true;
        }
    };

    for (const auto& node : model->get_ops()) {
        const bool matmul_like = ov::is_type<ov::op::v0::MatMul>(node) || ov::is_type<FullyConnectedNode>(node);
        // The element type is queried only for MatMul-like operations (short-circuit evaluation).
        if (!matmul_like || !node->get_input_element_type(0).is_real()) {
            continue;
        }

        auto weights_source = node->input_value(1);
        if (ov::op::util::is_on_constant_path(weights_source)) {
            // A fresh visited set is used for every candidate operation.
            std::unordered_set<Node*> seen;
            ov::op::util::visit_constant_path(weights_source.get_node(), seen, mark_decompression);
            if (found_decompression) {
                return true;
            }
        }
    }
    return false;
}
} // namespace intel_cpu
} // namespace ov

View File

@@ -0,0 +1,15 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "openvino/core/model.hpp"
namespace ov {
namespace intel_cpu {
/**
 * @brief Checks whether a model contains a MatMul-like operation with compressed weights.
 *
 * An operation matches when it is a MatMul or FullyConnectedNode, its first input element type reports
 * is_real(), and its second input (the weights) lies on a constant path that includes a Multiply recognized
 * as a dequantization node.
 *
 * @param model Model whose operations are inspected.
 * @return true if at least one matching operation is found, false otherwise.
 */
bool has_matmul_with_compressed_weights(const std::shared_ptr<const ov::Model>& model);
} // namespace intel_cpu
} // namespace ov