Xiaoxia/fix performance regression (#17036)
* add _streams_info_table in Executor config
* change useHyperThreading init value
* restore cmake
* fix comments
* add calling enableCpuPinning property
* fix judgment about number of sockets in init_stream
* fix test case compile issue
* fix CI test case failures
* move GetPerformanceStreams calling position
* add affinity in get_cpu_pinning
* modify E-core judgment
* add no core binding on ADL
* fix CI issue, add get_num_numa_nodes()
* fix StreamsHasHigherPriority issue
* fix according to review comments
* fix performance regression
* fix code style and warnings
* fix ImportNetwork issue
* fix smoke_CachingSupportCase_CPU issue
* add and rename ExportOptimalNumStreams test

---------

Co-authored-by: Chen Peter <peter.chen@intel.com>
This commit is contained in: parent 28e54e75ea, commit 6663367183
@@ -173,19 +173,9 @@ using ov::set_cpu_used;
 using ov::get_proc_type_table;
 
 /**
- * @brief Returns corresponding logical cores
+ * @brief Get and reserve available cpu ids
  * @ingroup ie_dev_api_system_conf
- * @param[in] cpu_ids physical cores
- * @return logical cores corresponding to physical core.
- */
-using ov::get_logical_cores;
-
-/**
- * @brief Returns available cpu ids
- * @ingroup ie_dev_api_system_conf
- * @param[in] core_type core type.
- * @param[in] num_cpus number of cpus.
- * @param[in] cpu_task is cpu task, not other plugin tasks
+ * @param[in] streams_info_table streams information table.
  * @return Array of available cpu ids.
  */
 using ov::reserve_available_cpus;
@@ -162,14 +162,6 @@ OPENVINO_RUNTIME_API int get_num_numa_nodes();
 */
 OPENVINO_RUNTIME_API std::vector<std::vector<int>> get_proc_type_table();
 
-/**
- * @brief Returns corresponding logical cores
- * @ingroup ie_dev_api_system_conf
- * @param[in] cpu_ids physical cores
- * @return logical cores corresponding to physical core.
- */
-OPENVINO_RUNTIME_API std::vector<int> get_logical_cores(const std::vector<int> cpu_ids);
-
 /**
  * @enum ColumnOfProcessorTypeTable
  * @brief This enum contains definition of each columns in processor type table which bases on cpu core types. Will
@@ -210,17 +202,11 @@ enum ProcessorUseStatus {
 /**
  * @brief Get and reserve available cpu ids
  * @ingroup ie_dev_api_system_conf
- * @param[in] core_type core type.
- * @param[in] num_cpus number of cpus.
- * @param[in] seek_status look for CPU_MAP_USED_FLAG of seek_status in CPU mapping table
- * @param[in] reset_status reset CPU_MAP_USED_FLAG with reset_status.
+ * @param[in] streams_info_table streams information table.
  * @return Array of available cpu ids.
  */
-OPENVINO_RUNTIME_API std::vector<int> reserve_available_cpus(const ColumnOfProcessorTypeTable core_type,
-                                                             const int num_cpus,
-                                                             const int seek_status = NOT_USED,
-                                                             const int reset_status = CPU_USED,
-                                                             const bool reserve_logic_core = false);
+OPENVINO_RUNTIME_API std::vector<std::vector<int>> reserve_available_cpus(
+    const std::vector<std::vector<int>> streams_info_table);
 
 /**
  * @brief Set CPU_MAP_USED_FLAG of cpu_mapping
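Note: each row of streams_info_table uses the {NUMBER_OF_STREAMS, PROC_TYPE, THREADS_PER_STREAM} column order from ie_cpu_streams_info.hpp, which is how the new implementation further down indexes it. A minimal usage sketch, with made-up numbers (the function name reserve_example and the table values are hypothetical, not from this PR):

    #include <vector>
    #include "ie_system_conf.h"  // dev-API header that declares ov::reserve_available_cpus

    // MAIN_CORE_PROC and EFFICIENT_CORE_PROC come from ov::ColumnOfProcessorTypeTable.
    void reserve_example() {
        std::vector<std::vector<int>> streams_info_table = {
            {2, ov::MAIN_CORE_PROC, 4},       // 2 streams on performance cores, 4 threads each
            {2, ov::EFFICIENT_CORE_PROC, 2},  // 2 streams on E-cores, 2 threads each
        };
        // Returns one group of processor ids per stream, in table order.
        auto stream_core_ids = ov::reserve_available_cpus(streams_info_table);
        (void)stream_core_ids;
    }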
@@ -105,6 +105,7 @@ public:
     std::vector<std::vector<int>> _orig_proc_type_table;
     std::vector<std::vector<int>> _proc_type_table;
     std::vector<std::vector<int>> _streams_info_table;
+    std::vector<std::vector<int>> _stream_core_ids;
     std::vector<int> _stream_ids;
     bool _cpu_pinning = false;
     bool _streams_changed = false;
@@ -132,7 +132,7 @@ struct CPUStreamsExecutor::Impl {
         std::lock_guard<std::mutex> lock{_impl->_cpumap_mutex};
         const auto stream_id = _streamId >= _impl->_config._streams ? _impl->_config._streams - 1 : _streamId;
         const auto concurrency =
-            _impl->_config._streams_info_table.size() > 0
+            (_impl->_config._streams_info_table.size() > 0 && _impl->_config._stream_ids.size() > 0)
                 ? _impl->_config._streams_info_table[_impl->_config._stream_ids[stream_id]][THREADS_PER_STREAM]
                 : 0;
         const auto cpu_core_type =
@@ -166,13 +166,10 @@ struct CPUStreamsExecutor::Impl {
         } else {
             _taskArena.reset(new custom::task_arena{concurrency});
         }
-        if (_impl->_config._cpu_pinning && _streamId < _impl->_config._streams) {
-            // Handle special case: reserve 4 cores when threads is 3 in ECore
-            const auto num_cpus =
-                cpu_core_type == EFFICIENT_CORE_PROC && concurrency == 3 && _impl->_config._small_core_streams > 1
-                    ? concurrency + 1
-                    : concurrency;
-            _cpu_ids = reserve_available_cpus(cpu_core_type, num_cpus, _impl->_config._plugin_task);
+        if (_impl->_config._cpu_pinning) {
+            _cpu_ids = static_cast<int>(_impl->_config._stream_core_ids.size()) == _impl->_config._streams
+                           ? _impl->_config._stream_core_ids[stream_id]
+                           : _cpu_ids;
             if (_cpu_ids.size() > 0) {
                 CpuSet processMask;
                 int ncpus = 0;
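Note: the executor no longer reserves cores at pinning time; it picks the group that was reserved up front, and only when one group exists per stream. A standalone sketch of just that selection rule (pick_cpu_ids is a made-up name for illustration):

    #include <vector>

    // Prefer the per-stream core ids that were reserved ahead of time;
    // fall back to the previously computed ids when the table is incomplete.
    std::vector<int> pick_cpu_ids(const std::vector<std::vector<int>>& stream_core_ids,
                                  int num_streams,
                                  int stream_id,
                                  std::vector<int> fallback) {
        return static_cast<int>(stream_core_ids.size()) == num_streams ? stream_core_ids[stream_id]
                                                                       : fallback;
    }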
@@ -17,11 +17,14 @@
 #include "ie_common.h"
 #include "openvino/core/visibility.hpp"
 #include "streams_executor.hpp"
+#include "threading/ie_cpu_streams_info.hpp"
 
 #define XBYAK_NO_OP_NAMES
 #define XBYAK_UNDEF_JNL
 #include <xbyak/xbyak_util.h>
 
+using namespace InferenceEngine;
+
 namespace ov {
 
 #if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
@@ -171,15 +174,8 @@ bool is_cpu_map_available() {
 int get_num_numa_nodes() {
     return -1;
 }
-std::vector<int> reserve_available_cpus(const ColumnOfProcessorTypeTable core_type,
-                                        const int num_cpus,
-                                        const int seek_status,
-                                        const int reset_status,
-                                        const bool reserve_logic_core) {
-    return {};
-}
-std::vector<int> get_logical_cores(const std::vector<int> cpu_ids) {
-    return {};
+std::vector<std::vector<int>> reserve_available_cpus(const std::vector<std::vector<int>> streams_info_table) {
+    return {{-1}};
 }
 
 void set_cpu_used(const std::vector<int>& cpu_ids, const int used) {}
@@ -240,59 +236,41 @@ int get_num_numa_nodes() {
     return cpu._numa_nodes;
 }
 
-std::vector<int> reserve_available_cpus(const ColumnOfProcessorTypeTable core_type,
-                                        const int num_cpus,
-                                        const int seek_status,
-                                        const int reset_status,
-                                        const bool reserve_logic_core) {
+std::vector<std::vector<int>> reserve_available_cpus(const std::vector<std::vector<int>> streams_info_table) {
     std::lock_guard<std::mutex> lock{cpu._cpu_mutex};
     std::vector<int> cpu_ids;
-    int socket = -1;
-    if (reset_status >= PLUGIN_USED_START && cpu._numa_nodes > 1) {
-        socket = cpu._socket_idx;
-        cpu._socket_idx = (cpu._socket_idx + 1) % cpu._numa_nodes;
-    }
-    if (core_type < PROC_TYPE_TABLE_SIZE && core_type >= ALL_PROC) {
-        for (int i = 0; i < cpu._processors; i++) {
-            if (cpu._cpu_mapping_table[i][CPU_MAP_CORE_TYPE] == core_type &&
-                cpu._cpu_mapping_table[i][CPU_MAP_USED_FLAG] == seek_status &&
-                (socket < 0 || (socket >= 0 && cpu._cpu_mapping_table[i][CPU_MAP_SOCKET_ID] == socket))) {
-                cpu_ids.push_back(cpu._cpu_mapping_table[i][CPU_MAP_PROCESSOR_ID]);
-            }
-            if (static_cast<int>(cpu_ids.size()) == num_cpus) {
-                break;
-            }
-        }
-        if (reserve_logic_core) {
-            auto logic_ids = get_logical_cores(cpu_ids);
-            cpu_ids.insert(cpu_ids.end(), logic_ids.begin(), logic_ids.end());
-        }
-        set_cpu_used(cpu_ids, reset_status);
-    } else {
-        IE_THROW() << "Wrong value for core_type " << core_type;
-    }
-    return cpu_ids;
-}
+    int info_table_size = static_cast<int>(streams_info_table.size());
+    std::vector<std::vector<int>> stream_ids;
+    std::vector<std::vector<std::vector<int>>> res_stream_ids;
+    stream_ids.assign(info_table_size, std::vector<int>());
+    res_stream_ids.assign(info_table_size, std::vector<std::vector<int>>());
 
-std::vector<int> get_logical_cores(const std::vector<int> cpu_ids) {
-    std::vector<int> logic_cores;
-    if (cpu._proc_type_table[0][HYPER_THREADING_PROC] > 0) {
-        int cpu_size = static_cast<int>(cpu_ids.size());
-        for (int i = 0; i < cpu._processors; i++) {
-            for (int j = 0; j < cpu_size; j++) {
-                if (cpu_ids[j] >= 0 &&
-                    cpu._cpu_mapping_table[i][CPU_MAP_CORE_ID] == cpu._cpu_mapping_table[cpu_ids[j]][CPU_MAP_CORE_ID] &&
-                    cpu._cpu_mapping_table[i][CPU_MAP_CORE_TYPE] == HYPER_THREADING_PROC) {
-                    logic_cores.push_back(cpu._cpu_mapping_table[i][CPU_MAP_PROCESSOR_ID]);
+    for (int i = 0; i < cpu._processors; i++) {
+        for (int j = 0; j < info_table_size; j++) {
+            if (static_cast<int>(res_stream_ids[j].size()) < streams_info_table[j][NUMBER_OF_STREAMS]) {
+                if (cpu._cpu_mapping_table[i][CPU_MAP_CORE_TYPE] == streams_info_table[j][PROC_TYPE] &&
+                    cpu._cpu_mapping_table[i][CPU_MAP_USED_FLAG] == NOT_USED) {
+                    stream_ids[j].push_back(cpu._cpu_mapping_table[i][CPU_MAP_PROCESSOR_ID]);
+                    cpu_ids.push_back(cpu._cpu_mapping_table[i][CPU_MAP_PROCESSOR_ID]);
+                }
+                if (static_cast<int>(stream_ids[j].size()) == streams_info_table[j][THREADS_PER_STREAM]) {
+                    std::vector<int> stream_group(stream_ids[j].begin(), stream_ids[j].end());
+                    res_stream_ids[j].push_back(stream_group);
+                    stream_ids[j].clear();
                 }
             }
-            if (cpu_ids.size() == logic_cores.size()) {
-                break;
-            }
         }
     }
+    set_cpu_used(cpu_ids, CPU_USED);
+    auto flatten_stream_ids =
+        std::accumulate(res_stream_ids.begin(),
+                        res_stream_ids.end(),
+                        decltype(res_stream_ids)::value_type{},
+                        [](std::vector<std::vector<int>>& pre, std::vector<std::vector<int>>& cur) {
+                            pre.insert(pre.end(), cur.begin(), cur.end());
+                            return pre;
+                        });
 
-    return logic_cores;
+    return flatten_stream_ids;
 }
 
 void set_cpu_used(const std::vector<int>& cpu_ids, const int used) {
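Note: the reservation walk fills res_stream_ids with one bucket of core-id groups per table row, then flattens the buckets into one group per stream. The flattening idiom in isolation, as a runnable sketch with made-up ids (taking pre by value, which also stays valid under C++20's move-based std::accumulate, unlike the non-const reference in the patch):

    #include <iostream>
    #include <numeric>
    #include <vector>

    int main() {
        // Per-row buckets of reserved core-id groups, as res_stream_ids holds them.
        std::vector<std::vector<std::vector<int>>> res_stream_ids = {
            {{0, 1}, {2, 3}},  // groups reserved for table row 0
            {{8, 9}},          // groups reserved for table row 1
        };
        // Concatenate the buckets into one list of stream core-id groups, in table order.
        auto flat = std::accumulate(res_stream_ids.begin(),
                                    res_stream_ids.end(),
                                    std::vector<std::vector<int>>{},
                                    [](std::vector<std::vector<int>> pre, const std::vector<std::vector<int>>& cur) {
                                        pre.insert(pre.end(), cur.begin(), cur.end());
                                        return pre;
                                    });
        for (const auto& group : flat) {
            for (int id : group)
                std::cout << id << ' ';
            std::cout << '\n';  // prints "0 1", "2 3", "8 9"
        }
        return 0;
    }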
@@ -283,6 +283,7 @@ void get_num_streams(const int streams,
                                                       config.perfHintsConfig.ovPerfHintNumRequests,
                                                       model_prefer,
                                                       proc_type_table);
+    executor_config._stream_core_ids = reserve_available_cpus(executor_config._streams_info_table);
     executor_config._threadsPerStream = executor_config._streams_info_table[0][THREADS_PER_STREAM];
     executor_config._streams = 0;
     executor_config._threads = 0;
@@ -273,18 +273,30 @@ void Engine::ApplyPerformanceHints(std::map<std::string, std::string> &config, c
 void Engine::GetPerformanceStreams(Config& config, const std::shared_ptr<ngraph::Function>& ngraphFunc) {
     const auto perf_hint_name = config.perfHintsConfig.ovPerfHint;
+    // save hints parameters to model rt_info
+    ov::AnyMap hints_props;
+    std::string hint_name;
+    const int latency_streams = get_num_numa_nodes();
     int streams;
     if (config.streamExecutorConfig._streams_changed) {
         streams = config.streamExecutorConfig._streams;
     } else if (perf_hint_name == CONFIG_VALUE(LATENCY)) {
-        streams = get_num_numa_nodes();
+        streams = latency_streams;
     } else if (perf_hint_name == CONFIG_VALUE(THROUGHPUT)) {
         streams = 0;
     } else {
         streams = config.streamExecutorConfig._streams == 1 ? 0 : config.streamExecutorConfig._streams;
     }
 
+    const auto latency_name = std::string(CONFIG_VALUE(LATENCY)) + "_" + std::string(ov::num_streams.name());
+    const auto tput_name = std::string(CONFIG_VALUE(THROUGHPUT)) + "_" + std::string(ov::num_streams.name());
+
     get_num_streams(streams, ngraphFunc, config);
 
+    hints_props.insert({latency_name, std::to_string(latency_streams)});
+    hints_props.insert({tput_name, std::to_string(config.streamExecutorConfig._streams)});
+    ngraphFunc->set_rt_info(hints_props, "intel_cpu_hints_config");
     config._config[CONFIG_KEY(CPU_THROUGHPUT_STREAMS)] = std::to_string(config.streamExecutorConfig._streams);
 }
 
 StreamCfg Engine::GetNumStreams(InferenceEngine::IStreamsExecutor::ThreadBindingType thread_binding_type,
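Note: the effect of this change is that the per-hint stream counts are stashed in the model's rt_info under "intel_cpu_hints_config", so they survive export and can be checked after import. A hedged sketch of reading them back (print_saved_hints is a made-up helper; the key names are what latency_name/tput_name above expand to, assuming ov::num_streams.name() is "NUM_STREAMS"):

    #include <iostream>
    #include <memory>
    #include "openvino/core/model.hpp"

    void print_saved_hints(const std::shared_ptr<ov::Model>& model) {
        // Read the hint map this PR stores on the model.
        ov::AnyMap hints = model->get_rt_info<ov::AnyMap>("intel_cpu_hints_config");
        std::cout << hints["LATENCY_NUM_STREAMS"].as<std::string>() << "\n"
                  << hints["THROUGHPUT_NUM_STREAMS"].as<std::string>() << "\n";
    }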
@@ -780,6 +792,9 @@ InferenceEngine::IExecutableNetworkInternal::Ptr Engine::ImportNetwork(std::istr
     if (conf.enableDynamicBatch) {
         conf.batchLimit = static_cast<int>(cnnnetwork.getBatchSize());
     }
+    if (is_cpu_map_available()) {
+        get_num_streams(conf.streamExecutorConfig._streams, function, conf);
+    }
 
     auto execNetwork = std::make_shared<ExecNetwork>(cnnnetwork, conf, extensionManager, shared_from_this());
 
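Note: the ImportNetwork fix mirrors the regular compile path, so stream settings are recomputed when a blob is imported. From the public API this is observable roughly the way the updated test in the next hunks does it; a sketch using standard OpenVINO 2.0 calls (model is assumed to be built elsewhere, e.g. by a helper like MakeMatMulModel() below):

    #include <memory>
    #include <sstream>
    #include "openvino/runtime/core.hpp"

    void check_streams_roundtrip(const std::shared_ptr<ov::Model>& model) {
        ov::Core core;
        auto compiled = core.compile_model(model, "CPU",
                                           ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY));
        std::stringstream blob;
        compiled.export_model(blob);
        auto imported = core.import_model(blob, "CPU");
        // With the fix, the imported network reports the same stream count.
        auto n_original = compiled.get_property(ov::num_streams);
        auto n_imported = imported.get_property(ov::num_streams);
        (void)n_original;
        (void)n_imported;
    }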
@@ -15,7 +15,7 @@
 
 namespace {
 
-class ExportImportTest : public CommonTestUtils::TestsCommon {};
+class ExportOptimalNumStreams : public ::testing::TestWithParam<std::string> {};
 
 std::shared_ptr<ov::Model> MakeMatMulModel() {
     const ov::Shape input_shape = {1, 4096};
@@ -33,9 +33,9 @@ std::shared_ptr<ov::Model> MakeMatMulModel() {
     return std::make_shared<ov::Model>(results, params, "MatMulModel");
 }
 
-TEST(ExportImportTest, ExportOptimalNumStreams) {
+TEST_P(ExportOptimalNumStreams, OptimalNumStreams) {
     auto original_model = MakeMatMulModel();
-    std::string deviceName = "CPU";
+    std::string deviceName = GetParam();
     ov::Core core;
     auto tput_mode = ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT);
     auto latency_mode = ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY);
@@ -62,4 +62,7 @@ TEST(ExportImportTest, ExportOptimalNumStreams) {
         EXPECT_EQ(nstreams_latency_original, nstreams_latency_imported);
     }
 }
+
+INSTANTIATE_TEST_CASE_P(smoke_ExportImportTest, ExportOptimalNumStreams, ::testing::Values(std::string("CPU")));
+
 } // namespace
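Note: switching to TEST_P with an INSTANTIATE_* macro parameterizes the device name, so more devices can be added to the Values list later. In newer googletest, INSTANTIATE_TEST_CASE_P is a deprecated alias of INSTANTIATE_TEST_SUITE_P. The same pattern in a minimal standalone form (DeviceNameTest is an illustrative name):

    #include <gtest/gtest.h>

    class DeviceNameTest : public ::testing::TestWithParam<std::string> {};

    TEST_P(DeviceNameTest, IsNotEmpty) {
        // GetParam() yields one value from the instantiation below per run.
        EXPECT_FALSE(GetParam().empty());
    }

    // One instantiation fans the test out over every listed device name.
    INSTANTIATE_TEST_SUITE_P(smoke, DeviceNameTest, ::testing::Values(std::string("CPU")));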