diff --git a/inference-engine/include/ie_plugin_config.hpp b/inference-engine/include/ie_plugin_config.hpp index 645307b9ab4..9b2e5769af5 100644 --- a/inference-engine/include/ie_plugin_config.hpp +++ b/inference-engine/include/ie_plugin_config.hpp @@ -205,11 +205,16 @@ DECLARE_CONFIG_KEY(CPU_THREADS_NUM); * @brief The name for setting CPU affinity per thread option. * * It is passed to Core::SetConfig(), this option should be used with values: - * PluginConfigParams::YES (pinning threads to cores, best for static benchmarks), - * PluginConfigParams::NUMA (pinning threads to NUMA nodes, best for real-life, contented cases) - * this is TBB-specific knob, and the only pinning option (beyond 'NO', below) on the Windows* * PluginConfigParams::NO (no pinning for CPU inference threads) - * All settings are ignored, if the OpenVINO compiled with OpenMP threading and any affinity-related OpenMP's + * PluginConfigParams::YES, which is the default on conventional CPUs (pinning threads to cores, best for static benchmarks), + * + * the following options are implemented only when TBB is used as the threading option + * PluginConfigParams::NUMA (pinning threads to NUMA nodes, best for real-life, contented cases) + * on Windows* and MacOS* this option behaves as YES + * PluginConfigParams::HYBRID_AWARE (lets the runtime do the pinning to core types, e.g. prefer the "big" cores for latency tasks) + * on hybrid CPUs this option is the default + * + * Also, the settings are ignored if OpenVINO was compiled with OpenMP and any affinity-related OpenMP + * environment variable is set (as affinity is configured explicitly) */ DECLARE_CONFIG_KEY(CPU_BIND_THREAD); diff --git a/inference-engine/samples/benchmark_app/README.md b/inference-engine/samples/benchmark_app/README.md index 49154897462..432e8b28ec0 100644 --- a/inference-engine/samples/benchmark_app/README.md +++ b/inference-engine/samples/benchmark_app/README.md @@ -104,14 +104,16 @@ Options: estimations the number of streams should be set to 1. -nthreads "" Optional. Number of threads to use for inference on the CPU (including HETERO and MULTI cases). -enforcebf16="" Optional. By default floating point operations execution in bfloat16 precision are enforced if supported by platform. - 'true' - enable bfloat16 regardless of platform support - 'false' - disable bfloat16 regardless of platform support. - -pin "YES"/"NO"/"NUMA" Optional. Enable threads->cores ("YES", default), threads->(NUMA)nodes ("NUMA") or completely disable ("NO") CPU threads pinning for CPU-involved inference. + -pin "YES"/"HYBRID_AWARE"/"NUMA"/"NO" + Optional. Explicit inference threads binding options (leave empty to let OpenVINO make the choice): + enabling threads->cores pinning ("YES", which is already the default on a conventional CPU), + letting the runtime decide on the threads->different core types ("HYBRID_AWARE", which is the default on hybrid CPUs), + threads->(NUMA)nodes ("NUMA"), or + completely disabling ("NO") CPU inference threads pinning. -ip "U8"/"FP16"/"FP32" Optional. Specifies precision for all input layers of the network. -op "U8"/"FP16"/"FP32" Optional. Specifies precision for all output layers of the network. -iop Optional. Specifies precision for input and output layers by name. Example: -iop "input:FP16, output:FP16". Notice that quotes are required. Overwrites precision from ip and op options for specified layers. - Statistics dumping options: -report_type "" Optional. Enable collecting statistics report.
"no_counters" report contains configuration options specified, resulting FPS and latency. "average_counters" report extends "no_counters" report and additionally includes average PM counters values for each layer from the network. "detailed_counters" report extends "average_counters" report and additionally includes per-layer PM counters and latency for each executed infer request. -report_folder Optional. Path to a folder where statistics report is stored. diff --git a/inference-engine/samples/benchmark_app/benchmark_app.hpp b/inference-engine/samples/benchmark_app/benchmark_app.hpp index 4abad5ee24f..8cc5d19c781 100644 --- a/inference-engine/samples/benchmark_app/benchmark_app.hpp +++ b/inference-engine/samples/benchmark_app/benchmark_app.hpp @@ -73,10 +73,12 @@ static const char batch_size_message[] = "Optional. Batch size value. If not spe "Intermediate Representation."; // @brief message for CPU threads pinning option -static const char infer_threads_pinning_message[] = "Optional. Enable threads->cores (\"YES\", default), threads->(NUMA)nodes (\"NUMA\") " - "or completely disable (\"NO\") " - "CPU threads pinning for CPU-involved inference."; - +static const char infer_threads_pinning_message[] = + "Optional. Explicit inference threads binding options (leave empty to let the OpenVINO to make a choice):\n" + "\t\t\t\tenabling threads->cores pinning(\"YES\", which is already default for any conventional CPU), \n" + "\t\t\t\tletting the runtime to decide on the threads->different core types(\"HYBRID_AWARE\", which is default on the hybrid CPUs) \n" + "\t\t\t\tthreads->(NUMA)nodes(\"NUMA\") or \n" + "\t\t\t\tcompletely disable(\"NO\") CPU inference threads pinning"; // @brief message for stream_output option static const char stream_output_message[] = "Optional. Print progress as a plain text. When specified, an interactive progress bar is " "replaced with a " @@ -187,7 +189,7 @@ DEFINE_bool(enforcebf16, false, enforce_bf16_message); DEFINE_uint32(b, 0, batch_size_message); // @brief Enable plugin messages -DEFINE_string(pin, "YES", infer_threads_pinning_message); +DEFINE_string(pin, "", infer_threads_pinning_message); /// @brief Enables multiline text output instead of progress bar DEFINE_bool(stream_output, false, stream_output_message); @@ -264,7 +266,7 @@ static void showUsage() { std::cout << " -nstreams \"\" " << infer_num_streams_message << std::endl; std::cout << " -nthreads \"\" " << infer_num_threads_message << std::endl; std::cout << " -enforcebf16= " << enforce_bf16_message << std::endl; - std::cout << " -pin \"YES\"/\"NO\"/\"NUMA\" " << infer_threads_pinning_message << std::endl; + std::cout << " -pin \"YES\"/\"HYBRID_AWARE\"/\"NO\"/\"NUMA\" " << infer_threads_pinning_message << std::endl; std::cout << std::endl << " Statistics dumping options:" << std::endl; std::cout << " -report_type \"\" " << report_type_message << std::endl; std::cout << " -report_folder " << report_folder_message << std::endl; diff --git a/inference-engine/samples/benchmark_app/main.cpp b/inference-engine/samples/benchmark_app/main.cpp index 7b540bb60f3..5a34e558cd3 100644 --- a/inference-engine/samples/benchmark_app/main.cpp +++ b/inference-engine/samples/benchmark_app/main.cpp @@ -267,9 +267,6 @@ int main(int argc, char* argv[]) { if ((device_name.find("MULTI") != std::string::npos) && (device_name.find("GPU") != std::string::npos)) { slog::warn << "Turn off threads pinning for " << device << " device since multi-scenario with GPU device is used." 
<< slog::endl; device_config[CONFIG_KEY(CPU_BIND_THREAD)] = CONFIG_VALUE(NO); - } else { - // set to default value - device_config[CONFIG_KEY(CPU_BIND_THREAD)] = FLAGS_pin; } } diff --git a/inference-engine/src/inference_engine/ie_system_conf.cpp b/inference-engine/src/inference_engine/ie_system_conf.cpp index 8b5bbbb0e79..c4fbe597aa3 100644 --- a/inference-engine/src/inference_engine/ie_system_conf.cpp +++ b/inference-engine/src/inference_engine/ie_system_conf.cpp @@ -90,9 +90,9 @@ bool checkOpenMpEnvVars(bool includeOMPNumThreads) { #if defined(__APPLE__) // for Linux and Windows the getNumberOfCPUCores (that accounts only for physical cores) implementation is OS-specific // (see cpp files in corresponding folders), for __APPLE__ it is default : -int getNumberOfCPUCores() { return parallel_get_max_threads();} +int getNumberOfCPUCores(bool) { return parallel_get_max_threads();} #if !((IE_THREAD == IE_THREAD_TBB) || (IE_THREAD == IE_THREAD_TBB_AUTO)) -std::vector getAvailableNUMANodes() { return {0}; } +std::vector getAvailableNUMANodes() { return {-1}; } #endif #endif @@ -100,6 +100,15 @@ std::vector getAvailableNUMANodes() { return {0}; } std::vector getAvailableNUMANodes() { return custom::info::numa_nodes(); } +// this is impl only with the TBB +std::vector getAvailableCoresTypes() { + return custom::info::core_types(); +} +#else +// as the core types support exists only with the TBB, the fallback is same for any other threading API +std::vector getAvailableCoresTypes() { + return {-1}; +} #endif std::exception_ptr& CurrentException() { diff --git a/inference-engine/src/inference_engine/os/lin/lin_system_conf.cpp b/inference-engine/src/inference_engine/os/lin/lin_system_conf.cpp index c90fe681903..fd33bcf2862 100644 --- a/inference-engine/src/inference_engine/os/lin/lin_system_conf.cpp +++ b/inference-engine/src/inference_engine/os/lin/lin_system_conf.cpp @@ -7,11 +7,12 @@ #include #include #include -#include -#include "ie_system_conf.h" -#include "ie_parallel.hpp" -#include "ie_common.h" #include +#include + +#include "ie_common.h" +#include "ie_system_conf.h" +#include "threading/ie_parallel_custom_arena.hpp" namespace InferenceEngine { @@ -61,7 +62,7 @@ std::vector getAvailableNUMANodes() { return nodes; } #endif -int getNumberOfCPUCores() { +int getNumberOfCPUCores(bool bigCoresOnly) { unsigned numberOfProcessors = cpu._processors; unsigned totalNumberOfCpuCores = cpu._cores; IE_ASSERT(totalNumberOfCpuCores != 0); @@ -81,7 +82,16 @@ int getNumberOfCPUCores() { } } } - return CPU_COUNT(¤tCoreSet); + int phys_cores = CPU_COUNT(¤tCoreSet); + #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) + auto core_types = custom::info::core_types(); + if (bigCoresOnly && core_types.size() > 1) /*Hybrid CPU*/ { + phys_cores = custom::info::default_concurrency(custom::task_arena::constraints{} + .set_core_type(core_types.back()) + .set_max_threads_per_core(1)); + } + #endif + return phys_cores; } } // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/os/win/win_system_conf.cpp b/inference-engine/src/inference_engine/os/win/win_system_conf.cpp index 97860492c0a..0ba0c9636d8 100644 --- a/inference-engine/src/inference_engine/os/win/win_system_conf.cpp +++ b/inference-engine/src/inference_engine/os/win/win_system_conf.cpp @@ -10,10 +10,10 @@ #include #include #include "ie_system_conf.h" -#include "ie_parallel.hpp" +#include "threading/ie_parallel_custom_arena.hpp" namespace InferenceEngine { -int getNumberOfCPUCores() { +int getNumberOfCPUCores(bool 
bigCoresOnly) { const int fallback_val = parallel_get_max_threads(); DWORD sz = 0; // querying the size of the resulting structure, passing the nullptr for the buffer @@ -32,12 +32,21 @@ int getNumberOfCPUCores() { offset += reinterpret_cast(ptr.get() + offset)->Size; phys_cores++; } while (offset < sz); + + #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) + auto core_types = custom::info::core_types(); + if (bigCoresOnly && core_types.size() > 1) /*Hybrid CPU*/ { + phys_cores = custom::info::default_concurrency(custom::task_arena::constraints{} + .set_core_type(core_types.back()) + .set_max_threads_per_core(1)); + } + #endif return phys_cores; } #if !(IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) // OMP/SEQ threading on the Windows doesn't support NUMA -std::vector getAvailableNUMANodes() { return std::vector(1, 0); } +std::vector getAvailableNUMANodes() { return {-1}; } #endif } // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/threading/ie_cpu_streams_executor.cpp b/inference-engine/src/inference_engine/threading/ie_cpu_streams_executor.cpp index 5c9bdde9b61..ddd4f9cb2c4 100644 --- a/inference-engine/src/inference_engine/threading/ie_cpu_streams_executor.cpp +++ b/inference-engine/src/inference_engine/threading/ie_cpu_streams_executor.cpp @@ -71,19 +71,30 @@ struct CPUStreamsExecutor::Impl { ((_impl->_config._streams + _impl->_usedNumaNodes.size() - 1)/_impl->_usedNumaNodes.size())) : _impl->_usedNumaNodes.at(_streamId % _impl->_usedNumaNodes.size()); #if IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO - auto concurrency = (0 == _impl->_config._threadsPerStream) ? custom::task_arena::automatic : _impl->_config._threadsPerStream; + const auto concurrency = (0 == _impl->_config._threadsPerStream) ? custom::task_arena::automatic : _impl->_config._threadsPerStream; if (ThreadBindingType::HYBRID_AWARE == _impl->_config._threadBindingType) { - _taskArena.reset(new custom::task_arena{ - custom::task_arena::constraints{} - .set_core_type(custom::info::core_types().back()) - .set_max_concurrency(concurrency) - }); + if (Config::PreferredCoreType::ROUND_ROBIN != _impl->_config._threadPreferredCoreType) { + if (Config::PreferredCoreType::ANY == _impl->_config._threadPreferredCoreType) { + _taskArena.reset(new custom::task_arena{concurrency}); + } else { + const auto selected_core_type = Config::PreferredCoreType::BIG == _impl->_config._threadPreferredCoreType + ? custom::info::core_types().back() // running on Big cores only + : custom::info::core_types().front(); // running on Little cores only + _taskArena.reset(new custom::task_arena{ + custom::task_arena::constraints{}.set_core_type(selected_core_type).set_max_concurrency(concurrency)}); + } + } else { + // assigning the stream to the core type in the round-robin fashion + // wrapping around total_streams (i.e. 
how many streams all different core types can handle together) + const auto total_streams = _impl->total_streams_on_core_types.back().second; + const auto streamId_wrapped = _streamId % total_streams; + const auto& selected_core_type = std::find_if(_impl->total_streams_on_core_types.cbegin(), _impl->total_streams_on_core_types.cend(), + [streamId_wrapped](const decltype(_impl->total_streams_on_core_types)::value_type & p) { return p.second > streamId_wrapped; })->first; + _taskArena.reset(new custom::task_arena{ + custom::task_arena::constraints{}.set_core_type(selected_core_type).set_max_concurrency(concurrency)}); + } } else if (ThreadBindingType::NUMA == _impl->_config._threadBindingType) { - _taskArena.reset(new custom::task_arena{ - custom::task_arena::constraints{} - .set_numa_id(_numaNodeId) - .set_max_concurrency(concurrency) - }); + _taskArena.reset(new custom::task_arena{custom::task_arena::constraints{_numaNodeId, concurrency}}); } else if ((0 != _impl->_config._threadsPerStream) || (ThreadBindingType::CORES == _impl->_config._threadBindingType)) { _taskArena.reset(new custom::task_arena{concurrency}); if (ThreadBindingType::CORES == _impl->_config._threadBindingType) { @@ -164,6 +175,25 @@ struct CPUStreamsExecutor::Impl { } else { _usedNumaNodes = numaNodes; } + #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) + if (ThreadBindingType::HYBRID_AWARE == config._threadBindingType) { + const auto core_types = custom::info::core_types(); + const int threadsPerStream = (0 == config._threadsPerStream) ? std::thread::hardware_concurrency() : config._threadsPerStream; + int sum = 0; + // reversed order, so BIG cores are first + for (auto iter = core_types.rbegin(); iter < core_types.rend(); iter++) { + const auto& type = *iter; + // calculating the #streams per core type + const int num_streams_for_core_type = std::max(1, + custom::info::default_concurrency( + custom::task_arena::constraints{}.set_core_type(type)) / threadsPerStream); + sum += num_streams_for_core_type; + // prefix sum, so the core type for a given stream id will be deduced just as a upper_bound + // (notice that the map keeps the elements in the descending order, so the big cores are populated first) + total_streams_on_core_types.push_back({type, sum}); + } + } + #endif for (auto streamId = 0; streamId < _config._streams; ++streamId) { _threads.emplace_back([this, streamId] { openvino::itt::threadName(_config._name + "_" + std::to_string(streamId)); @@ -232,6 +262,14 @@ struct CPUStreamsExecutor::Impl { bool _isStopped = false; std::vector _usedNumaNodes; ThreadLocal> _streams; + #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) + // stream id mapping to the core type + // stored in the reversed order (so the big cores, with the highest core_type_id value, are populated first) + // every entry is the core type and #streams that this AND ALL EARLIER entries can handle (prefix sum) + // (so mapping is actually just an upper_bound: core type is deduced from the entry for which the id < #streams) + using StreamIdToCoreTypes = std::vector>; + StreamIdToCoreTypes total_streams_on_core_types; + #endif }; diff --git a/inference-engine/src/inference_engine/threading/ie_executor_manager.cpp b/inference-engine/src/inference_engine/threading/ie_executor_manager.cpp index 98b4dab8ecc..0393b7732f5 100644 --- a/inference-engine/src/inference_engine/threading/ie_executor_manager.cpp +++ b/inference-engine/src/inference_engine/threading/ie_executor_manager.cpp @@ -36,6 +36,8 @@ IStreamsExecutor::Ptr 
ExecutorManagerImpl::getIdleCPUStreamsExecutor(const IStre executorConfig._threadBindingType == config._threadBindingType && executorConfig._threadBindingStep == config._threadBindingStep && executorConfig._threadBindingOffset == config._threadBindingOffset) + if (executorConfig._threadBindingType != IStreamsExecutor::ThreadBindingType::HYBRID_AWARE + || executorConfig._threadPreferredCoreType == config._threadPreferredCoreType) return executor; } auto newExec = std::make_shared(config); diff --git a/inference-engine/src/inference_engine/threading/ie_istreams_executor.cpp b/inference-engine/src/inference_engine/threading/ie_istreams_executor.cpp index 96d3860001d..1a2993f3365 100644 --- a/inference-engine/src/inference_engine/threading/ie_istreams_executor.cpp +++ b/inference-engine/src/inference_engine/threading/ie_istreams_executor.cpp @@ -6,6 +6,7 @@ #include "ie_plugin_config.hpp" #include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" #include "ie_parallel.hpp" +#include "ie_parallel_custom_arena.hpp" #include "ie_system_conf.h" #include "ie_parameter.hpp" #include @@ -29,32 +30,27 @@ std::vector IStreamsExecutor::Config::SupportedKeys() { void IStreamsExecutor::Config::SetConfig(const std::string& key, const std::string& value) { if (key == CONFIG_KEY(CPU_BIND_THREAD)) { if (value == CONFIG_VALUE(YES) || value == CONFIG_VALUE(NUMA)) { -#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) && (TBB_INTERFACE_VERSION < 11100) - if (value == CONFIG_VALUE(NUMA)) - IE_THROW() << CONFIG_KEY(CPU_BIND_THREAD) << " property value was set to NUMA. But IE was built with " - << "TBB version without NUMA-aware API. Current TBB API version is " << TBB_INTERFACE_VERSION - << ", required API version 11100 or greater."; -#endif - -#if (defined(__APPLE__) || defined(_WIN32)) - // on the Windows and Apple the CORES and NUMA pinning options are the same + #if (defined(__APPLE__) || defined(_WIN32)) _threadBindingType = IStreamsExecutor::ThreadBindingType::NUMA; -#else + #else _threadBindingType = (value == CONFIG_VALUE(YES)) ? IStreamsExecutor::ThreadBindingType::CORES : IStreamsExecutor::ThreadBindingType::NUMA; -#endif + #endif + } else if (value == CONFIG_VALUE(HYBRID_AWARE)) { + _threadBindingType = IStreamsExecutor::ThreadBindingType::HYBRID_AWARE; } else if (value == CONFIG_VALUE(NO)) { _threadBindingType = IStreamsExecutor::ThreadBindingType::NONE; } else { IE_THROW() << "Wrong value for property key " << CONFIG_KEY(CPU_BIND_THREAD) - << ". Expected only YES(binds to cores) / NO(no binding) / NUMA(binds to NUMA nodes)"; + << ". Expected only YES(binds to cores) / NO(no binding) / NUMA(binds to NUMA nodes) / " + "HYBRID_AWARE (let the runtime recognize and use the hybrid cores)"; } } else if (key == CONFIG_KEY(CPU_THROUGHPUT_STREAMS)) { if (value == CONFIG_VALUE(CPU_THROUGHPUT_NUMA)) { _streams = static_cast(getAvailableNUMANodes().size()); } else if (value == CONFIG_VALUE(CPU_THROUGHPUT_AUTO)) { const int sockets = static_cast(getAvailableNUMANodes().size()); - // bare minimum of streams (that evenly divides available number of core) + // bare minimum of streams (that evenly divides available number of cores) const int num_cores = sockets == 1 ? 
std::thread::hardware_concurrency() : getNumberOfCPUCores(); if (0 == num_cores % 4) _streams = std::max(4, num_cores / 4); @@ -138,12 +134,52 @@ Parameter IStreamsExecutor::Config::GetConfig(const std::string& key) { return {}; } -IStreamsExecutor::Config IStreamsExecutor::Config::MakeDefaultMultiThreaded(const IStreamsExecutor::Config& initial) { +IStreamsExecutor::Config IStreamsExecutor::Config::MakeDefaultMultiThreaded(const IStreamsExecutor::Config& initial, const bool fp_intesive) { const auto envThreads = parallel_get_env_threads(); const auto& numaNodes = getAvailableNUMANodes(); - const auto numaNodesNum = numaNodes.size(); + const int numaNodesNum = numaNodes.size(); auto streamExecutorConfig = initial; - const auto hwCores = streamExecutorConfig._streams > 1 && numaNodesNum == 1 ? parallel_get_max_threads() : getNumberOfCPUCores(); + const bool bLatencyCase = streamExecutorConfig._streams <= numaNodesNum; + + // by default, do not use the hyper-threading (to minimize threads synch overheads) + int num_cores_default = getNumberOfCPUCores(); + #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) + //additional latency-case logic for hybrid processors: + if (ThreadBindingType::HYBRID_AWARE == streamExecutorConfig._threadBindingType) { + const auto core_types = custom::info::core_types(); + const auto num_little_cores = custom::info::default_concurrency(custom::task_arena::constraints{}.set_core_type(core_types.front())); + const auto num_big_cores_phys = getNumberOfCPUCores(true); + const int int8_threshold = 4; // ~relative efficiency of the VNNI-intensive code for Big vs Little cores; + const int fp32_threshold = 2; // ~relative efficiency of the AVX2 fp32 code for Big vs Little cores; + // by default the latency case uses (faster) Big cores only, depending on the compute ratio + const bool bLatencyCaseBigOnly = num_big_cores_phys > (num_little_cores / (fp_intesive ? fp32_threshold : int8_threshold)); + // selecting the preferred core type + streamExecutorConfig._threadPreferredCoreType = + bLatencyCase + ? (bLatencyCaseBigOnly + ? IStreamsExecutor::Config::PreferredCoreType::BIG + : IStreamsExecutor::Config::PreferredCoreType::ANY) + : IStreamsExecutor::Config::PreferredCoreType::ROUND_ROBIN; + // additionally selecting the #cores to use in the "Big-only" case + if (bLatencyCaseBigOnly) { + const int hyper_threading_threshold = 2; // min #cores, for which the hyper-threading becomes useful for the latency case + const auto num_big_cores = custom::info::default_concurrency(custom::task_arena::constraints{}.set_core_type(core_types.back())); + num_cores_default = (num_big_cores_phys <= hyper_threading_threshold) ? num_big_cores : num_big_cores_phys; + } + } + #endif + const auto hwCores = !bLatencyCase && numaNodesNum == 1 + // throughput case on a single-NUMA node machine uses all available cores + ? parallel_get_max_threads() + // in the rest of cases: + // multi-node machine + // or + // latency case, single-node yet hybrid case that uses + // all core types + // or + // big-cores only, but the #cores is "enough" (pls see the logic above) + // it is usually beneficial not to use the hyper-threading (which is default) + : num_cores_default; const auto threads = streamExecutorConfig._threads ? streamExecutorConfig._threads : (envThreads ? envThreads : hwCores); streamExecutorConfig._threadsPerStream = streamExecutorConfig._streams ? 
std::max(1, threads/streamExecutorConfig._streams) diff --git a/inference-engine/src/mkldnn_plugin/config.cpp b/inference-engine/src/mkldnn_plugin/config.cpp index 920a5ff2788..16eb4c9e130 100644 --- a/inference-engine/src/mkldnn_plugin/config.cpp +++ b/inference-engine/src/mkldnn_plugin/config.cpp @@ -13,7 +13,6 @@ #include "ie_parallel.hpp" #include "ie_system_conf.h" -#include #include namespace MKLDNNPlugin { @@ -21,16 +20,20 @@ namespace MKLDNNPlugin { using namespace InferenceEngine; Config::Config() { -#if (defined(__APPLE__) || defined(_WIN32)) -#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) && (TBB_INTERFACE_VERSION >= 11100) - // If we sure that TBB has NUMA aware API part. - streamExecutorConfig._threadBindingType = InferenceEngine::IStreamsExecutor::NUMA; -#else - streamExecutorConfig._threadBindingType = InferenceEngine::IStreamsExecutor::NONE; -#endif -#else + // this is default mode streamExecutorConfig._threadBindingType = InferenceEngine::IStreamsExecutor::CORES; -#endif + + // for the TBB code-path, additional configuration depending on the OS and CPU types + #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) + #if defined(__APPLE__) || defined(_WIN32) + // 'CORES' is not implemented for Win/MacOS; so the 'NUMA' is default + streamExecutorConfig._threadBindingType = InferenceEngine::IStreamsExecutor::NUMA; + #endif + + if (getAvailableCoresTypes().size() > 1 /*Hybrid CPU*/) { + streamExecutorConfig._threadBindingType = InferenceEngine::IStreamsExecutor::HYBRID_AWARE; + } + #endif if (!with_cpu_x86_bfloat16()) enforceBF16 = false; @@ -128,7 +131,7 @@ void Config::updateProperties() { _config.insert({ PluginConfigParams::KEY_CPU_BIND_THREAD, PluginConfigParams::NUMA }); break; case IStreamsExecutor::ThreadBindingType::HYBRID_AWARE: - _config.insert({ PluginConfigParams::KEY_CPU_BIND_THREAD, PluginConfigParams::HYBRID_AWARE}); + _config.insert({ PluginConfigParams::KEY_CPU_BIND_THREAD, PluginConfigParams::HYBRID_AWARE }); break; } if (collectPerfCounters == true) diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_exec_network.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_exec_network.cpp index f7f02a57615..156317c1d73 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_exec_network.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_exec_network.cpp @@ -49,11 +49,11 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::CNNNetwork &network, // we are cloning network if we have statistics and we can transform network. _clonedNetwork = cloneNetwork(network); + bool isFloatModel = true; if (_cfg.lpTransformsMode == Config::LPTransformsMode::On) { // Check if network is INT8 or Binary. // BF16 transformations were disabled since CPU plug-in doesn't support mixed precision execution: // BF16 + INT8 or BF16 + BIN. 
- bool isFloatModel = true; CNNNetworkIterator iter(network); while (iter != CNNNetworkIterator()) { if (CaselessEq()((*iter)->type, "FakeQuantize")) { @@ -229,7 +229,7 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::CNNNetwork &network, // special case when all InferRequests are muxed into a single queue _taskExecutor = InferenceEngine::ExecutorManager::getInstance()->getExecutor("CPU"); } else { - auto streamsExecutorConfig = InferenceEngine::IStreamsExecutor::Config::MakeDefaultMultiThreaded(_cfg.streamExecutorConfig); + auto streamsExecutorConfig = InferenceEngine::IStreamsExecutor::Config::MakeDefaultMultiThreaded(_cfg.streamExecutorConfig, isFloatModel); streamsExecutorConfig._name = "CPUStreamsExecutor"; _taskExecutor = InferenceEngine::ExecutorManager::getInstance()->getIdleCPUStreamsExecutor(streamsExecutorConfig); } diff --git a/inference-engine/src/plugin_api/ie_system_conf.h b/inference-engine/src/plugin_api/ie_system_conf.h index 981b4dda961..93d633c35f9 100644 --- a/inference-engine/src/plugin_api/ie_system_conf.h +++ b/inference-engine/src/plugin_api/ie_system_conf.h @@ -37,12 +37,23 @@ INFERENCE_ENGINE_API_CPP(bool) checkOpenMpEnvVars(bool includeOMPNumThreads = tr INFERENCE_ENGINE_API_CPP(std::vector) getAvailableNUMANodes(); /** - * @brief Returns number of CPU physical cores on Linux/Windows (which is considered to be more performance friendly for servers) - * (on other OSes it simply relies on the original parallel API of choice, which usually uses the logical cores ) + * @brief Returns the available CPU core types (on Linux and Windows, and ONLY with TBB); a single core type is assumed otherwise * @ingroup ie_dev_api_system_conf + * @return Vector of core types + */ +INFERENCE_ENGINE_API_CPP(std::vector) getAvailableCoresTypes(); + +/** + * @brief Returns number of CPU physical cores on Linux/Windows (which is considered to be more performance friendly for servers) + * (on other OSes it simply relies on the original parallel API of choice, which usually uses the logical cores). + * call the function with 'false' to get the #phys cores of all types, + * call it with 'true' to get the #phys 'Big' cores only, + * number of 'Little' = 'all' - 'Big' + * @ingroup ie_dev_api_system_conf + * @param[in] bigCoresOnly Additionally limits the number of reported cores to the 'Big' cores only. * @return Number of physical CPU cores.
*/ -INFERENCE_ENGINE_API_CPP(int) getNumberOfCPUCores(); +INFERENCE_ENGINE_API_CPP(int) getNumberOfCPUCores(bool bigCoresOnly = false); /** * @brief Checks whether CPU supports SSE 4.2 capability diff --git a/inference-engine/src/plugin_api/threading/ie_istreams_executor.hpp b/inference-engine/src/plugin_api/threading/ie_istreams_executor.hpp index 67a601bb772..4dd80f411bc 100644 --- a/inference-engine/src/plugin_api/threading/ie_istreams_executor.hpp +++ b/inference-engine/src/plugin_api/threading/ie_istreams_executor.hpp @@ -36,7 +36,7 @@ public: using Ptr = std::shared_ptr; /** - * @brief Defines thread binding type + * @brief Defines inference thread binding type */ enum ThreadBindingType : std::uint8_t { NONE, //!< Don't bind the inference threads @@ -74,9 +74,11 @@ public: * @brief Create appropriate multithreaded configuration * filing unconfigured values from initial configuration using hardware properties * @param initial Inital configuration + * @param fp_intesive additional hint for the (Hybrid) core-types selection logic: + * whether the executor should be configured for floating-point-intensive work (as opposed to int8-intensive) * @return configured values */ - static Config MakeDefaultMultiThreaded(const Config& initial); + static Config MakeDefaultMultiThreaded(const Config& initial, const bool fp_intesive = true); std::string _name; //!< Used by `ITT` to name executor threads int _streams = 1; //!< Number of streams. @@ -85,6 +87,12 @@ public: int _threadBindingStep = 1; //!< In case of @ref CORES binding offset type thread binded to cores with defined step int _threadBindingOffset = 0; //!< In case of @ref CORES binding offset type thread binded to cores starting from offset int _threads = 0; //!< Number of threads distributed between streams. Reserved. Should not be used.
+ enum PreferredCoreType { + ANY, + LITTLE, + BIG, + ROUND_ROBIN // used w/multiple streams to populate the Big cores first, then the Little, then wrap around (for large #streams) + } _threadPreferredCoreType = PreferredCoreType::ANY; //!< In case of @ref HYBRID_AWARE hints the TBB runtime about the preferred core type /** * @brief A constructor with arguments * @param[in] name @copybrief Config::_name * @param[in] streams @copybrief Config::_streams * @param[in] threadsPerStream @copybrief Config::_threadsPerStream * @param[in] threadBindingType @copybrief Config::_threadBindingType * @param[in] threadBindingStep @copybrief Config::_threadBindingStep * @param[in] threadBindingOffset @copybrief Config::_threadBindingOffset * @param[in] threads @copybrief Config::_threads + * @param[in] threadPreferredCoreType @copybrief Config::_threadPreferredCoreType */ Config( std::string name = "StreamsExecutor", int streams = 1, int threadsPerStream = 0, ThreadBindingType threadBindingType = ThreadBindingType::NONE, int threadBindingStep = 1, int threadBindingOffset = 0, - int threads = 0) : + int threads = 0, + PreferredCoreType threadPreferredCoreType = PreferredCoreType::ANY) : _name{name}, _streams{streams}, _threadsPerStream{threadsPerStream}, _threadBindingType{threadBindingType}, _threadBindingStep{threadBindingStep}, _threadBindingOffset{threadBindingOffset}, - _threads{threads} { + _threads{threads}, _threadPreferredCoreType(threadPreferredCoreType) { } }; diff --git a/tools/benchmark/main.py b/tools/benchmark/main.py index bdfe296e331..998e95638fb 100644 --- a/tools/benchmark/main.py +++ b/tools/benchmark/main.py @@ -142,9 +142,6 @@ def run(args): logger.warning(f"Turn off threads pinning for {device} " + "device since multi-scenario with GPU device is used.") config[device]['CPU_BIND_THREAD'] = 'NO' - else: - ## set to default value - config[device]['CPU_BIND_THREAD'] = args.infer_threads_pinning ## for CPU execution, more throughput-oriented execution via streams set_throughput_streams() diff --git a/tools/benchmark/parameters.py b/tools/benchmark/parameters.py index 1a4d8b84d6a..ca29fb5158c 100644 --- a/tools/benchmark/parameters.py +++ b/tools/benchmark/parameters.py @@ -91,8 +91,11 @@ def parse_args(): args.add_argument('-nthreads', '--number_threads', type=int, required=False, default=None, help='Number of threads to use for inference on the CPU, GNA ' '(including HETERO and MULTI cases).') - args.add_argument('-pin', '--infer_threads_pinning', type=str, required=False, default='YES', choices=['YES', 'NO', 'NUMA'], - help='Optional. Enable threads->cores (\'YES\' is default value), threads->(NUMA)nodes (\'NUMA\') or completely disable (\'NO\')' + args.add_argument('-pin', '--infer_threads_pinning', type=str, required=False, choices=['YES', 'NO', 'NUMA', 'HYBRID_AWARE'], + help='Optional. Enable threads->cores (\'YES\', which is OpenVINO runtime\'s default for conventional CPUs), ' + 'threads->(NUMA)nodes (\'NUMA\'), ' + 'threads->appropriate core types (\'HYBRID_AWARE\', which is OpenVINO runtime\'s default for Hybrid CPUs), ' + 'or completely disable (\'NO\') ' 'CPU threads pinning for CPU-involved inference.') args.add_argument('-exec_graph_path', '--exec_graph_path', type=str, required=False, help='Optional. Path to a file where to store executable graph information serialized.')
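
Reviewer note (not part of the patch): the two snippets below are illustrative sketches only; they are not taken from any of the files above, and names such as "model.xml" are placeholders.

First, a minimal sketch of how the new HYBRID_AWARE value of the CPU_BIND_THREAD key introduced in ie_plugin_config.hpp could be requested explicitly through the public Core API (leaving the key unset keeps the automatic choice made in config.cpp: HYBRID_AWARE on hybrid CPUs, CORES/NUMA otherwise):

#include <inference_engine.hpp>

int main() {
    InferenceEngine::Core core;
    // Explicitly request hybrid-aware pinning for the CPU plugin;
    // CONFIG_KEY(CPU_BIND_THREAD) and CONFIG_VALUE(HYBRID_AWARE) come from ie_plugin_config.hpp.
    core.SetConfig({{CONFIG_KEY(CPU_BIND_THREAD), CONFIG_VALUE(HYBRID_AWARE)}}, "CPU");
    auto network = core.ReadNetwork("model.xml");           // placeholder model path
    auto exec_network = core.LoadNetwork(network, "CPU");
    auto request = exec_network.CreateInferRequest();
    request.Infer();
    return 0;
}

Second, a standalone sketch of the round-robin stream-to-core-type mapping used in ie_cpu_streams_executor.cpp: the table keeps {core type, prefix sum of streams} with the Big cores first, and the core type for a stream is the first entry whose running total exceeds the wrapped stream id. The core-type ids and per-type stream counts below are made up for illustration:

#include <algorithm>
#include <iostream>
#include <utility>
#include <vector>

int main() {
    // Hypothetical hybrid CPU: the Big cores (type id 1) fit 4 streams, the Little cores (type id 0) fit 2 more.
    std::vector<std::pair<int, int>> total_streams_on_core_types{{1, 4}, {0, 4 + 2}};
    const int total_streams = total_streams_on_core_types.back().second;
    for (int streamId = 0; streamId < 8; ++streamId) {
        const int wrapped = streamId % total_streams;  // wrap around for large numbers of streams
        const auto it = std::find_if(total_streams_on_core_types.cbegin(), total_streams_on_core_types.cend(),
                                     [wrapped](const std::pair<int, int>& p) { return p.second > wrapped; });
        std::cout << "stream " << streamId << " -> core type " << it->first << "\n";  // streams 0-3 -> Big, 4-5 -> Little, then wraps
    }
    return 0;
}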