Xiaoxia/fix performance regression (#17036)

* add _streams_info_table in Executor config

* change useHyperThreading init value

* restore cmake

* fix comments

* add calling enableCpuPinning property

* fix judgment about number of sockets in init_stream

* fix test case compile issue

* fix ci test case fail issue

* modify GetPerformanceStreams calling position

* add affinity in get_cpu_pinning

* modify ecore judgment

* add no binding core on ADL

* fix ci issue, add get_num_numa_nodes()

* fix code style

* fix StreamsHasHigherPriority issue

* fix according to comments

* fix performance degradation

* fix code style

* code style

* fix warning

* fix ci test failure

* fix ImportNetwork issue

* fix ci test case issue

* fix smoke_CachingSupportCase_CPU issue

* add ExportOptimalNumStreamsTest test

* modify test name

* modify ExportOptimalNumStreams test

---------

Co-authored-by: Chen Peter <peter.chen@intel.com>
Sun Xiaoxia 2023-04-25 04:35:47 +00:00 committed by GitHub
parent 28e54e75ea
commit 6663367183
8 changed files with 67 additions and 96 deletions


@@ -173,19 +173,9 @@ using ov::set_cpu_used;
 using ov::get_proc_type_table;
 
 /**
- * @brief Returns corresponding logical cores
+ * @brief Get and reserve available cpu ids
  * @ingroup ie_dev_api_system_conf
- * @param[in] cpu_ids physical cores
- * @return logical cores corresponding to physical core.
- */
-using ov::get_logical_cores;
-
-/**
- * @brief Returns available cpu ids
- * @ingroup ie_dev_api_system_conf
- * @param[in] core_type core type.
- * @param[in] num_cpus number of cpus.
- * @param[in] cpu_task is cpu task, not other plugin tasks
  * @param[in] streams_info_table streams information table.
  * @return Array of available cpu ids.
  */
 using ov::reserve_available_cpus;


@@ -162,14 +162,6 @@ OPENVINO_RUNTIME_API int get_num_numa_nodes();
  */
 OPENVINO_RUNTIME_API std::vector<std::vector<int>> get_proc_type_table();
 
-/**
- * @brief Returns corresponding logical cores
- * @ingroup ie_dev_api_system_conf
- * @param[in] cpu_ids physical cores
- * @return logical cores corresponding to physical core.
- */
-OPENVINO_RUNTIME_API std::vector<int> get_logical_cores(const std::vector<int> cpu_ids);
-
 /**
  * @enum ColumnOfProcessorTypeTable
  * @brief This enum contains definition of each columns in processor type table which bases on cpu core types. Will
@@ -210,17 +202,11 @@ enum ProcessorUseStatus {
 /**
  * @brief Get and reserve available cpu ids
  * @ingroup ie_dev_api_system_conf
- * @param[in] core_type core type.
- * @param[in] num_cpus number of cpus.
- * @param[in] seek_status look for CPU_MAP_USED_FLAG of seek_status in CPU mapping table
- * @param[in] reset_status reset CPU_MAP_USED_FLAG with reset_status.
+ * @param[in] streams_info_table streams information table.
  * @return Array of available cpu ids.
  */
-OPENVINO_RUNTIME_API std::vector<int> reserve_available_cpus(const ColumnOfProcessorTypeTable core_type,
-                                                             const int num_cpus,
-                                                             const int seek_status = NOT_USED,
-                                                             const int reset_status = CPU_USED,
-                                                             const bool reserve_logic_core = false);
+OPENVINO_RUNTIME_API std::vector<std::vector<int>> reserve_available_cpus(
+    const std::vector<std::vector<int>> streams_info_table);
 
 /**
  * @brief Set CPU_MAP_USED_FLAG of cpu_mapping
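
A minimal usage sketch of the new signature (not code from this commit): the row layout {NUMBER_OF_STREAMS, PROC_TYPE, THREADS_PER_STREAM} follows ie_cpu_streams_info.hpp, and the machine is assumed to have 8 idle performance cores, so the processor ids shown are illustrative.

    // Ask for 2 streams of 4 performance-core threads each.
    std::vector<std::vector<int>> streams_info_table = {{2, MAIN_CORE_PROC, 4}};
    // One vector of processor ids per stream, e.g. {{0, 1, 2, 3}, {4, 5, 6, 7}};
    // the returned ids are also marked CPU_USED in the CPU mapping table.
    std::vector<std::vector<int>> stream_core_ids = ov::reserve_available_cpus(streams_info_table);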


@@ -105,6 +105,7 @@ public:
         std::vector<std::vector<int>> _orig_proc_type_table;
         std::vector<std::vector<int>> _proc_type_table;
         std::vector<std::vector<int>> _streams_info_table;
+        std::vector<std::vector<int>> _stream_core_ids;
         std::vector<int> _stream_ids;
         bool _cpu_pinning = false;
         bool _streams_changed = false;


@@ -132,7 +132,7 @@ struct CPUStreamsExecutor::Impl {
             std::lock_guard<std::mutex> lock{_impl->_cpumap_mutex};
             const auto stream_id = _streamId >= _impl->_config._streams ? _impl->_config._streams - 1 : _streamId;
             const auto concurrency =
-                _impl->_config._streams_info_table.size() > 0
+                (_impl->_config._streams_info_table.size() > 0 && _impl->_config._stream_ids.size() > 0)
                     ? _impl->_config._streams_info_table[_impl->_config._stream_ids[stream_id]][THREADS_PER_STREAM]
                     : 0;
             const auto cpu_core_type =
@@ -166,13 +166,10 @@ struct CPUStreamsExecutor::Impl {
             } else {
                 _taskArena.reset(new custom::task_arena{concurrency});
             }
-            if (_impl->_config._cpu_pinning && _streamId < _impl->_config._streams) {
-                // Handle special case: reserve 4 cores when threads is 3 in ECore
-                const auto num_cpus =
-                    cpu_core_type == EFFICIENT_CORE_PROC && concurrency == 3 && _impl->_config._small_core_streams > 1
-                        ? concurrency + 1
-                        : concurrency;
-                _cpu_ids = reserve_available_cpus(cpu_core_type, num_cpus, _impl->_config._plugin_task);
+            if (_impl->_config._cpu_pinning) {
+                _cpu_ids = static_cast<int>(_impl->_config._stream_core_ids.size()) == _impl->_config._streams
+                               ? _impl->_config._stream_core_ids[stream_id]
+                               : _cpu_ids;
                 if (_cpu_ids.size() > 0) {
                     CpuSet processMask;
                     int ncpus = 0;
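
For context, the per-stream ids selected above end up as a thread affinity mask. A minimal Linux-only sketch of that final pinning step, using raw pthread affinity instead of the executor's CpuSet helpers:

    #include <pthread.h>
    #include <sched.h>
    #include <vector>

    // Pin the calling thread to the given logical processors (Linux, glibc).
    static bool pin_current_thread(const std::vector<int>& cpu_ids) {
        cpu_set_t mask;
        CPU_ZERO(&mask);
        for (int id : cpu_ids) {
            CPU_SET(id, &mask);
        }
        return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) == 0;
    }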


@@ -17,11 +17,14 @@
 #include "ie_common.h"
 #include "openvino/core/visibility.hpp"
 #include "streams_executor.hpp"
+#include "threading/ie_cpu_streams_info.hpp"
 
 #define XBYAK_NO_OP_NAMES
 #define XBYAK_UNDEF_JNL
 #include <xbyak/xbyak_util.h>
 
+using namespace InferenceEngine;
+
 namespace ov {
 
 #if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
@@ -171,15 +174,8 @@ bool is_cpu_map_available() {
 int get_num_numa_nodes() {
     return -1;
 }
-std::vector<int> reserve_available_cpus(const ColumnOfProcessorTypeTable core_type,
-                                        const int num_cpus,
-                                        const int seek_status,
-                                        const int reset_status,
-                                        const bool reserve_logic_core) {
-    return {};
-}
-std::vector<int> get_logical_cores(const std::vector<int> cpu_ids) {
-    return {};
+std::vector<std::vector<int>> reserve_available_cpus(const std::vector<std::vector<int>> streams_info_table) {
+    return {{-1}};
 }
 
 void set_cpu_used(const std::vector<int>& cpu_ids, const int used) {}
@@ -240,59 +236,41 @@ int get_num_numa_nodes() {
     return cpu._numa_nodes;
 }
 
-std::vector<int> reserve_available_cpus(const ColumnOfProcessorTypeTable core_type,
-                                        const int num_cpus,
-                                        const int seek_status,
-                                        const int reset_status,
-                                        const bool reserve_logic_core) {
-    std::lock_guard<std::mutex> lock{cpu._cpu_mutex};
+std::vector<std::vector<int>> reserve_available_cpus(const std::vector<std::vector<int>> streams_info_table) {
     std::vector<int> cpu_ids;
-    int socket = -1;
-    if (reset_status >= PLUGIN_USED_START && cpu._numa_nodes > 1) {
-        socket = cpu._socket_idx;
-        cpu._socket_idx = (cpu._socket_idx + 1) % cpu._numa_nodes;
-    }
-    if (core_type < PROC_TYPE_TABLE_SIZE && core_type >= ALL_PROC) {
-        for (int i = 0; i < cpu._processors; i++) {
-            if (cpu._cpu_mapping_table[i][CPU_MAP_CORE_TYPE] == core_type &&
-                cpu._cpu_mapping_table[i][CPU_MAP_USED_FLAG] == seek_status &&
-                (socket < 0 || (socket >= 0 && cpu._cpu_mapping_table[i][CPU_MAP_SOCKET_ID] == socket))) {
-                cpu_ids.push_back(cpu._cpu_mapping_table[i][CPU_MAP_PROCESSOR_ID]);
-            }
-            if (static_cast<int>(cpu_ids.size()) == num_cpus) {
-                break;
-            }
-        }
-        if (reserve_logic_core) {
-            auto logic_ids = get_logical_cores(cpu_ids);
-            cpu_ids.insert(cpu_ids.end(), logic_ids.begin(), logic_ids.end());
-        }
-        set_cpu_used(cpu_ids, reset_status);
-    } else {
-        IE_THROW() << "Wrong value for core_type " << core_type;
-    }
-    return cpu_ids;
-}
+    int info_table_size = static_cast<int>(streams_info_table.size());
+    std::vector<std::vector<int>> stream_ids;
+    std::vector<std::vector<std::vector<int>>> res_stream_ids;
+    stream_ids.assign(info_table_size, std::vector<int>());
+    res_stream_ids.assign(info_table_size, std::vector<std::vector<int>>());
 
-std::vector<int> get_logical_cores(const std::vector<int> cpu_ids) {
-    std::vector<int> logic_cores;
-    if (cpu._proc_type_table[0][HYPER_THREADING_PROC] > 0) {
-        int cpu_size = static_cast<int>(cpu_ids.size());
-        for (int i = 0; i < cpu._processors; i++) {
-            for (int j = 0; j < cpu_size; j++) {
-                if (cpu_ids[j] >= 0 &&
-                    cpu._cpu_mapping_table[i][CPU_MAP_CORE_ID] == cpu._cpu_mapping_table[cpu_ids[j]][CPU_MAP_CORE_ID] &&
-                    cpu._cpu_mapping_table[i][CPU_MAP_CORE_TYPE] == HYPER_THREADING_PROC) {
-                    logic_cores.push_back(cpu._cpu_mapping_table[i][CPU_MAP_PROCESSOR_ID]);
+    for (int i = 0; i < cpu._processors; i++) {
+        for (int j = 0; j < info_table_size; j++) {
+            if (static_cast<int>(res_stream_ids[j].size()) < streams_info_table[j][NUMBER_OF_STREAMS]) {
+                if (cpu._cpu_mapping_table[i][CPU_MAP_CORE_TYPE] == streams_info_table[j][PROC_TYPE] &&
+                    cpu._cpu_mapping_table[i][CPU_MAP_USED_FLAG] == NOT_USED) {
+                    stream_ids[j].push_back(cpu._cpu_mapping_table[i][CPU_MAP_PROCESSOR_ID]);
+                    cpu_ids.push_back(cpu._cpu_mapping_table[i][CPU_MAP_PROCESSOR_ID]);
+                }
+                if (static_cast<int>(stream_ids[j].size()) == streams_info_table[j][THREADS_PER_STREAM]) {
+                    std::vector<int> stream_group(stream_ids[j].begin(), stream_ids[j].end());
+                    res_stream_ids[j].push_back(stream_group);
+                    stream_ids[j].clear();
                 }
             }
-            if (cpu_ids.size() == logic_cores.size()) {
-                break;
-            }
         }
     }
-    return logic_cores;
+    set_cpu_used(cpu_ids, CPU_USED);
+    auto flatten_stream_ids =
+        std::accumulate(res_stream_ids.begin(),
+                        res_stream_ids.end(),
+                        decltype(res_stream_ids)::value_type{},
+                        [](std::vector<std::vector<int>>& pre, std::vector<std::vector<int>>& cur) {
+                            pre.insert(pre.end(), cur.begin(), cur.end());
+                            return pre;
+                        });
+    return flatten_stream_ids;
 }
 
 void set_cpu_used(const std::vector<int>& cpu_ids, const int used) {
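
To make the grouping pass concrete, here is a self-contained toy version of the same loop structure on a mock mapping table (plain column indices instead of the library enums; not the library code):

    #include <iostream>
    #include <vector>

    int main() {
        const int NOT_USED = -1;
        const int MAIN_CORE_PROC = 1;
        // Mock mapping rows: {processor_id, core_type, used_flag}; 4 idle P-cores.
        std::vector<std::vector<int>> mapping = {
            {0, MAIN_CORE_PROC, NOT_USED},
            {1, MAIN_CORE_PROC, NOT_USED},
            {2, MAIN_CORE_PROC, NOT_USED},
            {3, MAIN_CORE_PROC, NOT_USED},
        };
        // One streams_info row: {num_streams, proc_type, threads_per_stream}.
        std::vector<std::vector<int>> streams_info = {{2, MAIN_CORE_PROC, 2}};

        std::vector<std::vector<int>> groups;   // res_stream_ids, flattened
        std::vector<int> current;               // stream_ids accumulator
        for (const auto& proc : mapping) {
            const auto& row = streams_info[0];
            if (static_cast<int>(groups.size()) < row[0] && proc[1] == row[1] && proc[2] == NOT_USED) {
                current.push_back(proc[0]);
                if (static_cast<int>(current.size()) == row[2]) {
                    groups.push_back(current);  // one full stream worth of cores
                    current.clear();
                }
            }
        }
        // Prints "stream 0 -> 0 1" and "stream 1 -> 2 3".
        for (size_t s = 0; s < groups.size(); ++s) {
            std::cout << "stream " << s << " ->";
            for (int id : groups[s]) std::cout << ' ' << id;
            std::cout << '\n';
        }
        return 0;
    }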


@@ -283,6 +283,7 @@ void get_num_streams(const int streams,
                                                                   config.perfHintsConfig.ovPerfHintNumRequests,
                                                                   model_prefer,
                                                                   proc_type_table);
+    executor_config._stream_core_ids = reserve_available_cpus(executor_config._streams_info_table);
     executor_config._threadsPerStream = executor_config._streams_info_table[0][THREADS_PER_STREAM];
     executor_config._streams = 0;
     executor_config._threads = 0;
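
The intended invariant of this wiring, shown with hypothetical numbers (a table row {2, MAIN_CORE_PROC, 4} should yield two 4-element core groups, later indexed by stream id in CPUStreamsExecutor):

    // Illustration only; the ids are made up, the names come from the diff above.
    std::vector<std::vector<int>> table = {{2 /*streams*/, MAIN_CORE_PROC, 4 /*threads*/}};
    auto groups = reserve_available_cpus(table);           // e.g. {{0,1,2,3}, {4,5,6,7}}
    assert(groups.size() == 2 && groups[0].size() == 4);   // one core group per stream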


@@ -273,18 +273,30 @@ void Engine::ApplyPerformanceHints(std::map<std::string, std::string> &config, c
 void Engine::GetPerformanceStreams(Config& config, const std::shared_ptr<ngraph::Function>& ngraphFunc) {
     const auto perf_hint_name = config.perfHintsConfig.ovPerfHint;
+    // save hints parameters to model rt_info
+    ov::AnyMap hints_props;
+    std::string hint_name;
+    const int latency_streams = get_num_numa_nodes();
     int streams;
     if (config.streamExecutorConfig._streams_changed) {
         streams = config.streamExecutorConfig._streams;
     } else if (perf_hint_name == CONFIG_VALUE(LATENCY)) {
-        streams = get_num_numa_nodes();
+        streams = latency_streams;
     } else if (perf_hint_name == CONFIG_VALUE(THROUGHPUT)) {
         streams = 0;
     } else {
         streams = config.streamExecutorConfig._streams == 1 ? 0 : config.streamExecutorConfig._streams;
     }
+    const auto latency_name = std::string(CONFIG_VALUE(LATENCY)) + "_" + std::string(ov::num_streams.name());
+    const auto tput_name = std::string(CONFIG_VALUE(THROUGHPUT)) + "_" + std::string(ov::num_streams.name());
 
     get_num_streams(streams, ngraphFunc, config);
 
+    hints_props.insert({latency_name, std::to_string(latency_streams)});
+    hints_props.insert({tput_name, std::to_string(config.streamExecutorConfig._streams)});
+    ngraphFunc->set_rt_info(hints_props, "intel_cpu_hints_config");
     config._config[CONFIG_KEY(CPU_THROUGHPUT_STREAMS)] = std::to_string(config.streamExecutorConfig._streams);
 }
 
 StreamCfg Engine::GetNumStreams(InferenceEngine::IStreamsExecutor::ThreadBindingType thread_binding_type,
@@ -780,6 +792,9 @@ InferenceEngine::IExecutableNetworkInternal::Ptr Engine::ImportNetwork(std::istr
     if (conf.enableDynamicBatch) {
         conf.batchLimit = static_cast<int>(cnnnetwork.getBatchSize());
     }
+    if (is_cpu_map_available()) {
+        get_num_streams(conf.streamExecutorConfig._streams, function, conf);
+    }
 
     auto execNetwork = std::make_shared<ExecNetwork>(cnnnetwork, conf, extensionManager, shared_from_this());
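
Export/import round-trip: GetPerformanceStreams stores the hint-derived stream counts in the model's rt_info under "intel_cpu_hints_config", and the ImportNetwork change above recomputes streams on the importing machine. A small sketch of reading those stored hints back from a deserialized model ("model.xml" is a placeholder path):

    #include <openvino/openvino.hpp>
    #include <iostream>

    int main() {
        ov::Core core;
        auto model = core.read_model("model.xml");
        const ov::AnyMap& rt = model->get_rt_info();
        auto it = rt.find("intel_cpu_hints_config");
        if (it != rt.end()) {
            // Keys look like "LATENCY_NUM_STREAMS" / "THROUGHPUT_NUM_STREAMS",
            // values are the stream counts serialized as strings.
            for (const auto& kv : it->second.as<ov::AnyMap>()) {
                std::cout << kv.first << " = " << kv.second.as<std::string>() << '\n';
            }
        }
        return 0;
    }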


@@ -15,7 +15,7 @@
 
 namespace {
 
-class ExportImportTest : public CommonTestUtils::TestsCommon {};
+class ExportOptimalNumStreams : public ::testing::TestWithParam<std::string> {};
 
 std::shared_ptr<ov::Model> MakeMatMulModel() {
     const ov::Shape input_shape = {1, 4096};
@@ -33,9 +33,9 @@ std::shared_ptr<ov::Model> MakeMatMulModel() {
     return std::make_shared<ov::Model>(results, params, "MatMulModel");
 }
 
-TEST(ExportImportTest, ExportOptimalNumStreams) {
+TEST_P(ExportOptimalNumStreams, OptimalNumStreams) {
     auto original_model = MakeMatMulModel();
-    std::string deviceName = "CPU";
+    std::string deviceName = GetParam();
     ov::Core core;
 
     auto tput_mode = ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT);
     auto latency_mode = ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY);
@@ -62,4 +62,7 @@ TEST(ExportImportTest, ExportOptimalNumStreams) {
         EXPECT_EQ(nstreams_latency_original, nstreams_latency_imported);
     }
 }
+
+INSTANTIATE_TEST_CASE_P(smoke_ExportImportTest, ExportOptimalNumStreams, ::testing::Values(std::string("CPU")));
+
 } // namespace