Hybrid-aware P/E Core optimization (#13435)
* add HybridAware stream setting and core binding
* fix clang-format issue
* unify code style, add parameter check
* correct input affinity (skip NUMA), modify function name
* remove unnecessary floor
* fix CI compile issue on Mac/Windows platforms
* modify smoke_SetConfigAffinity test
* modify ov_core_set_and_get_property_enum test; affinity HYBRID_AWARE is changed to NUMA
* remove affinity correcting in this PR
* revert ov_core_test.cpp
* merge functions per review comments
* fix code style issue

Co-authored-by: Shen, Wanglei <wanglei.shen@intel.com>
commit ec14dd3523 (parent 086bc00d4c)
@@ -83,6 +83,7 @@ public:
      */
     static Config MakeDefaultMultiThreaded(const Config& initial, const bool fp_intesive = true);
     static int GetDefaultNumStreams();  // no network specifics considered (only CPU's caps);
+    static int GetHybridNumStreams(const Config& config, const int stream_mode);
 
     std::string _name;  //!< Used by `ITT` to name executor threads
     int _streams = 1;   //!< Number of streams.
@@ -91,10 +92,16 @@ public:
                                    //!< No binding by default
     int _threadBindingStep = 1;    //!< In case of @ref CORES binding offset type
                                    //!< thread binded to cores with defined step
     int _threadBindingOffset = 0;  //!< In case of @ref CORES binding offset type thread binded to cores
                                    //!< starting from offset
     int _threads = 0;              //!< Number of threads distributed between streams.
                                    //!< Reserved. Should not be used.
+    mutable int _big_core_streams = 0;          // Number of streams in Performance-core(big core)
+    mutable int _small_core_streams = 0;        // Number of streams in Efficient-core(small core)
+    mutable int _threads_per_stream_big = 0;    // Threads per stream in big cores
+    mutable int _threads_per_stream_small = 0;  // Threads per stream in small cores
+    mutable int _small_core_offset = 0;         // Calculate small core start offset when binding cpu cores
+    enum StreamMode { DEFAULT, AGGRESSIVE, LESSAGGRESSIVE };
     enum PreferredCoreType {
         ANY,
         LITTLE,
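Note (not part of the diff): a minimal usage sketch of the new API, assuming the InferenceEngine dev headers and the Config constructor that takes an executor name; only names declared in the hunks above are used.

    using namespace InferenceEngine;

    IStreamsExecutor::Config cfg{"CPUStreamsExecutor"};
    cfg._threadBindingType = IStreamsExecutor::ThreadBindingType::HYBRID_AWARE;

    // Fills the mutable _big_core_streams / _small_core_streams /
    // _threads_per_stream_* fields as a side effect and returns the
    // total stream count for the chosen mode.
    const int total_streams =
        IStreamsExecutor::Config::GetHybridNumStreams(cfg, IStreamsExecutor::Config::StreamMode::DEFAULT);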
@@ -34,23 +34,27 @@ struct CPUStreamsExecutor::Impl {
         int _ncpus = 0;
         int _threadBindingStep = 0;
         int _offset = 0;
+        int _cpuIdxOffset = 0;
         Observer(custom::task_arena& arena,
                  CpuSet mask,
                  int ncpus,
                  const int streamId,
                  const int threadsPerStream,
                  const int threadBindingStep,
-                 const int threadBindingOffset)
+                 const int threadBindingOffset,
+                 const int cpuIdxOffset = 0)
             : custom::task_scheduler_observer(arena),
               _mask{std::move(mask)},
               _ncpus(ncpus),
               _threadBindingStep(threadBindingStep),
-              _offset{streamId * threadsPerStream + threadBindingOffset} {}
+              _offset{streamId * threadsPerStream + threadBindingOffset},
+              _cpuIdxOffset(cpuIdxOffset) {}
         void on_scheduler_entry(bool) override {
             PinThreadToVacantCore(_offset + tbb::this_task_arena::current_thread_index(),
                                   _threadBindingStep,
                                   _ncpus,
-                                  _mask);
+                                  _mask,
+                                  _cpuIdxOffset);
         }
         void on_scheduler_exit(bool) override {
             PinCurrentThreadByMask(_ncpus, _mask);
@@ -103,9 +107,29 @@ struct CPUStreamsExecutor::Impl {
                                                return p.second > streamId_wrapped;
                                            })
                                 ->first;
-                _taskArena.reset(new custom::task_arena{custom::task_arena::constraints{}
-                                                            .set_core_type(selected_core_type)
-                                                            .set_max_concurrency(concurrency)});
+                const auto max_concurrency = selected_core_type == 0 ? _impl->_config._threads_per_stream_small
+                                                                     : _impl->_config._threads_per_stream_big;
+                const auto stream_id =
+                    selected_core_type == 0 ? _streamId - _impl->_config._big_core_streams : _streamId;
+                const auto thread_binding_step = selected_core_type == 0 ? _impl->_config._threadBindingStep : 2;
+                // Prevent conflicts with system scheduling, so default cpu id on big core starts from 1
+                const auto cpu_idx_offset = selected_core_type == 0 ? _impl->_config._small_core_offset : 1;
+
+                _taskArena.reset(new custom::task_arena{max_concurrency});
+                CpuSet processMask;
+                int ncpus = 0;
+                std::tie(processMask, ncpus) = GetProcessMask();
+                if (nullptr != processMask) {
+                    _observer.reset(new Observer{*_taskArena,
+                                                 std::move(processMask),
+                                                 ncpus,
+                                                 stream_id,
+                                                 max_concurrency,
+                                                 thread_binding_step,
+                                                 _impl->_config._threadBindingOffset,
+                                                 cpu_idx_offset});
+                    _observer->observe(true);
+                }
             }
         } else if (ThreadBindingType::NUMA == _impl->_config._threadBindingType) {
             _taskArena.reset(new custom::task_arena{custom::task_arena::constraints{_numaNodeId, concurrency}});
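Note: to make the mapping above concrete, a hypothetical walk-through with assumed numbers (4 big-core streams, 2 small-core streams, _small_core_offset = 16, i.e. 8 hyperthreaded P-cores occupying cpu ids 0..15):

    // streams 0..3 -> big cores:  thread_binding_step = 2 (skip HT siblings),
    //                cpu_idx_offset = 1, so threads land on cpu ids 1, 3, 5, ...
    // streams 4..5 -> small cores (selected_core_type == 0): stream_id is
    //                rebased to 0..1, step = _threadBindingStep, and
    //                cpu_idx_offset = _small_core_offset counts cpus from 16.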
@@ -198,17 +222,13 @@ struct CPUStreamsExecutor::Impl {
 #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
         if (ThreadBindingType::HYBRID_AWARE == config._threadBindingType) {
             const auto core_types = custom::info::core_types();
-            const int threadsPerStream =
-                (0 == config._threadsPerStream) ? std::thread::hardware_concurrency() : config._threadsPerStream;
             int sum = 0;
             // reversed order, so BIG cores are first
             for (auto iter = core_types.rbegin(); iter < core_types.rend(); iter++) {
                 const auto& type = *iter;
                 // calculating the #streams per core type
                 const int num_streams_for_core_type =
-                    std::max(1,
-                             custom::info::default_concurrency(custom::task_arena::constraints{}.set_core_type(type)) /
-                                 threadsPerStream);
+                    type == 0 ? std::max(1, config._small_core_streams) : std::max(1, config._big_core_streams);
                 sum += num_streams_for_core_type;
                 // prefix sum, so the core type for a given stream id will be deduced just as a upper_bound
                 // (notice that the map keeps the elements in the descending order, so the big cores are populated
@@ -46,6 +46,53 @@ int IStreamsExecutor::Config::GetDefaultNumStreams() {
     return 1;
 }
 
+int IStreamsExecutor::Config::GetHybridNumStreams(const Config& config, const int stream_mode) {
+    const int num_phy_cores = getNumberOfCPUCores();
+    const int num_big_cores = getNumberOfCPUCores(true);
+    const int num_small_cores = num_phy_cores - num_big_cores;
+
+    if (stream_mode == DEFAULT) {
+        // bare minimum of streams (that evenly divides available number of core)
+        if (0 == num_big_cores % 4) {
+            config._big_core_streams = std::max(4, num_big_cores / 4);
+        } else if (0 == num_big_cores % 5) {
+            config._big_core_streams = std::max(5, num_big_cores / 5);
+        } else if (0 == num_big_cores % 3) {
+            config._big_core_streams = std::max(3, num_big_cores / 3);
+        } else {  // if user disables some cores say in BIOS, so we got weird #cores which is not easy to divide
+            config._big_core_streams = 1;
+        }
+
+        config._threads_per_stream_big = num_big_cores / config._big_core_streams;
+        config._threads_per_stream_small = config._threads_per_stream_big * 2;
+        if (num_small_cores == 0) {
+            config._big_core_streams = num_big_cores / config._threads_per_stream_big;
+            config._threads_per_stream_small = 0;
+        } else if (num_small_cores < config._threads_per_stream_small) {
+            config._small_core_streams = 1;
+            config._threads_per_stream_small = num_small_cores;
+            config._threads_per_stream_big = config._threads_per_stream_small / 2;
+            config._big_core_streams = num_big_cores / config._threads_per_stream_big;
+        } else {
+            config._small_core_streams = num_small_cores / config._threads_per_stream_small;
+        }
+    } else if (stream_mode == AGGRESSIVE) {
+        config._big_core_streams = num_big_cores;
+        config._small_core_streams = num_small_cores / 2;
+        config._threads_per_stream_big = num_big_cores / config._big_core_streams;
+        config._threads_per_stream_small = num_small_cores == 0 ? 0 : num_small_cores / config._small_core_streams;
+    } else if (stream_mode == LESSAGGRESSIVE) {
+        config._big_core_streams = num_big_cores / 2;
+        config._small_core_streams = num_small_cores / 4;
+        config._threads_per_stream_big = num_big_cores / config._big_core_streams;
+        config._threads_per_stream_small = num_small_cores == 0 ? 0 : num_small_cores / config._small_core_streams;
+    } else {
+        IE_THROW() << "Wrong stream mode to get num of streams: " << stream_mode;
+    }
+    config._small_core_offset = num_small_cores == 0 ? 0 : num_big_cores * 2;
+    return config._big_core_streams + config._small_core_streams;
+}
+
 void IStreamsExecutor::Config::SetConfig(const std::string& key, const std::string& value) {
     if (key == CONFIG_KEY(CPU_BIND_THREAD)) {
         if (value == CONFIG_VALUE(YES) || value == CONFIG_VALUE(NUMA)) {
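Note: a worked example of the three modes for an assumed hybrid CPU with 8 P-cores and 8 E-cores (numbers are illustrative, not from the patch):

    // num_big_cores = 8, num_small_cores = 8
    // DEFAULT:        8 % 4 == 0 -> _big_core_streams = max(4, 8/4) = 4,
    //                 _threads_per_stream_big = 8/4 = 2,
    //                 _threads_per_stream_small = 2*2 = 4,
    //                 8 >= 4 -> _small_core_streams = 8/4 = 2 -> total 6 streams
    // AGGRESSIVE:     _big_core_streams = 8, _small_core_streams = 8/2 = 4,
    //                 1 thread/stream on big, 2 on small -> total 12 streams
    // LESSAGGRESSIVE: _big_core_streams = 8/2 = 4, _small_core_streams = 8/4 = 2,
    //                 2 threads/stream on big, 4 on small -> total 6 streams
    // In every mode _small_core_offset = 8 * 2 = 16; the factor 2 assumes
    // hyperthreaded P-cores, so E-core cpu ids start after the 16 P-core siblings.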
@@ -46,14 +46,14 @@ bool PinCurrentThreadByMask(int ncores, const CpuSet& procMask) {
     return 0 == sched_setaffinity(0, CPU_ALLOC_SIZE(ncores), procMask.get());
 }
 
-bool PinThreadToVacantCore(int thrIdx, int hyperthreads, int ncores, const CpuSet& procMask) {
+bool PinThreadToVacantCore(int thrIdx, int hyperthreads, int ncores, const CpuSet& procMask, int cpuIdxOffset) {
     if (procMask == nullptr)
         return false;
     const size_t size = CPU_ALLOC_SIZE(ncores);
     const int num_cpus = CPU_COUNT_S(size, procMask.get());
     thrIdx %= num_cpus;  // To limit unique number in [0; num_cpus-1] range
     // Place threads with specified step
-    int cpu_idx = 0;
+    int cpu_idx = cpuIdxOffset;
     for (int i = 0, offset = 0; i < thrIdx; ++i) {
         cpu_idx += hyperthreads;
         if (cpu_idx >= num_cpus)
@@ -61,8 +61,8 @@ bool PinThreadToVacantCore(int thrIdx, int hyperthreads, int ncores, const CpuSe
     }
 
     // Find index of 'cpu_idx'-th bit that equals to 1
-    int mapped_idx = -1;
-    while (cpu_idx >= 0) {
+    int mapped_idx = cpuIdxOffset - 1;
+    while (cpu_idx >= cpuIdxOffset) {
         mapped_idx++;
         if (CPU_ISSET_S(mapped_idx, size, procMask.get()))
             --cpu_idx;
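Note: a self-contained sketch (Linux glibc; MapToVacantCpu is a hypothetical name, extracted here only to make the arithmetic checkable in isolation) of the offset-aware bit scan above:

    #include <sched.h>
    #include <stddef.h>

    // Returns the position of the set bit that cpu_idx resolves to, counting
    // set bits only at positions >= cpuIdxOffset (mirrors the patched loop).
    static int MapToVacantCpu(int cpu_idx, int ncores, const cpu_set_t* mask, int cpuIdxOffset) {
        const size_t size = CPU_ALLOC_SIZE(ncores);
        int mapped_idx = cpuIdxOffset - 1;
        while (cpu_idx >= cpuIdxOffset) {
            mapped_idx++;
            if (CPU_ISSET_S(mapped_idx, size, mask))
                --cpu_idx;
        }
        return mapped_idx;
    }

    // With a full 32-cpu mask and cpuIdxOffset = 16, cpu_idx 16 maps to cpu 16,
    // 17 to 17, and so on: small-core streams stay inside the E-core block.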
@@ -104,7 +104,7 @@ std::tuple<CpuSet, int> GetProcessMask() {
 }
 void ReleaseProcessMask(cpu_set_t*) {}
 
-bool PinThreadToVacantCore(int thrIdx, int hyperthreads, int ncores, const CpuSet& procMask) {
+bool PinThreadToVacantCore(int thrIdx, int hyperthreads, int ncores, const CpuSet& procMask, int cpuIdxOffset) {
     return false;
 }
 bool PinCurrentThreadByMask(int ncores, const CpuSet& procMask) {
@@ -64,7 +64,7 @@ std::tuple<CpuSet, int> GetProcessMask();
  * @param[in] processMask The process mask
  * @return `True` in case of success, `false` otherwise
  */
-bool PinThreadToVacantCore(int thrIdx, int hyperThreads, int ncores, const CpuSet& processMask);
+bool PinThreadToVacantCore(int thrIdx, int hyperThreads, int ncores, const CpuSet& processMask, int cpuIdxOffset = 0);
 
 /**
  * @brief Pins thread to a spare core in the round-robin scheme, while respecting the given process mask.
@@ -730,20 +730,40 @@ void Engine::ApplyPerformanceHints(std::map<std::string, std::string> &config, c
         // less aggressive
         const auto num_streams_less_aggressive = num_cores / 2;
         // default #streams value (most conservative)
-        const auto default_num_streams = IStreamsExecutor::Config::GetDefaultNumStreams();
+        const auto default_num_streams =
+            engConfig.streamExecutorConfig._threadBindingType ==
+                    InferenceEngine::IStreamsExecutor::ThreadBindingType::HYBRID_AWARE
+                ? IStreamsExecutor::Config::GetHybridNumStreams(engConfig.streamExecutorConfig,
+                                                                IStreamsExecutor::Config::StreamMode::DEFAULT)
+                : IStreamsExecutor::Config::GetDefaultNumStreams();
         int num_streams = default_num_streams;
         if (networkToleranceForLowCache.max_mem_tolerance == ov::MemBandwidthPressure::UNKNOWN) {
             if ((networkToleranceForLowCache.ratio_compute_convs == ov::MemBandwidthPressure::ALL)
                 || (networkToleranceForLowCache.ratio_compute_deconvs == ov::MemBandwidthPressure::ALL)) {
                 // all relevant layers (convs, etc) are compute-limited, the most aggressive val for #streams
-                num_streams = num_cores;
+                num_streams = engConfig.streamExecutorConfig._threadBindingType ==
+                                      InferenceEngine::IStreamsExecutor::ThreadBindingType::HYBRID_AWARE
+                                  ? IStreamsExecutor::Config::GetHybridNumStreams(
+                                        engConfig.streamExecutorConfig,
+                                        IStreamsExecutor::Config::StreamMode::AGGRESSIVE)
+                                  : num_cores;
             }  // otherwise (no recognized layers) falling back to the default value
         } else if (networkToleranceForLowCache.max_mem_tolerance > memThresholdAssumeLimitedForISA) {
             // network is below the ISA-specific threshold
-            num_streams = num_cores;
+            num_streams = engConfig.streamExecutorConfig._threadBindingType ==
+                                  InferenceEngine::IStreamsExecutor::ThreadBindingType::HYBRID_AWARE
+                              ? IStreamsExecutor::Config::GetHybridNumStreams(
+                                    engConfig.streamExecutorConfig,
+                                    IStreamsExecutor::Config::StreamMode::AGGRESSIVE)
+                              : num_cores;
         } else if (networkToleranceForLowCache.max_mem_tolerance > ov::MemBandwidthPressure::LIMITED) {
            // network is below general threshold
-            num_streams = std::max(default_num_streams, num_streams_less_aggressive);
+            num_streams = engConfig.streamExecutorConfig._threadBindingType ==
+                                  InferenceEngine::IStreamsExecutor::ThreadBindingType::HYBRID_AWARE
+                              ? IStreamsExecutor::Config::GetHybridNumStreams(
+                                    engConfig.streamExecutorConfig,
+                                    IStreamsExecutor::Config::StreamMode::LESSAGGRESSIVE)
+                              : std::max(default_num_streams, num_streams_less_aggressive);
         }
         auto num_requests = config.find(CONFIG_KEY(PERFORMANCE_HINT_NUM_REQUESTS));
         if (num_requests != config.end()) {  // arrived with config to the LoadNetwork (and thus higher pri)
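Note: condensed, the hybrid-aware path of the performance-hint logic above maps memory-bandwidth pressure to a stream mode rather than a raw core count:

    // ratio_compute_convs/deconvs == ALL (compute-limited) -> StreamMode::AGGRESSIVE
    // max_mem_tolerance > memThresholdAssumeLimitedForISA  -> StreamMode::AGGRESSIVE
    // max_mem_tolerance > MemBandwidthPressure::LIMITED    -> StreamMode::LESSAGGRESSIVE
    // otherwise                                            -> StreamMode::DEFAULT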