diff --git a/inference-engine/include/ie_plugin_config.hpp b/inference-engine/include/ie_plugin_config.hpp index 645307b9ab4..9b2e5769af5 100644 --- a/inference-engine/include/ie_plugin_config.hpp +++ b/inference-engine/include/ie_plugin_config.hpp @@ -205,11 +205,16 @@ DECLARE_CONFIG_KEY(CPU_THREADS_NUM); * @brief The name for setting CPU affinity per thread option. * * It is passed to Core::SetConfig(), this option should be used with values: - * PluginConfigParams::YES (pinning threads to cores, best for static benchmarks), - * PluginConfigParams::NUMA (pinning threads to NUMA nodes, best for real-life, contented cases) - * this is TBB-specific knob, and the only pinning option (beyond 'NO', below) on the Windows* * PluginConfigParams::NO (no pinning for CPU inference threads) - * All settings are ignored, if the OpenVINO compiled with OpenMP threading and any affinity-related OpenMP's + * PluginConfigParams::YES, which is the default on conventional CPUs (pinning threads to cores, best for static benchmarks), + * + * the following options are implemented only when TBB is used as the threading option + * PluginConfigParams::NUMA (pinning threads to NUMA nodes, best for real-life, contented cases) + * on Windows* and MacOS* this option behaves as YES + * PluginConfigParams::HYBRID_AWARE (lets the runtime do the pinning to core types, e.g. prefer the "big" cores for latency tasks) + * on hybrid CPUs this option is the default + * + * Also, the settings are ignored if OpenVINO was compiled with OpenMP and any affinity-related OpenMP + * environment variable is set (as affinity is configured explicitly) */ DECLARE_CONFIG_KEY(CPU_BIND_THREAD); diff --git a/inference-engine/samples/benchmark_app/README.md b/inference-engine/samples/benchmark_app/README.md index 49154897462..432e8b28ec0 100644 --- a/inference-engine/samples/benchmark_app/README.md +++ b/inference-engine/samples/benchmark_app/README.md @@ -104,14 +104,16 @@ Options: estimations the number of streams should be set to 1. -nthreads "" Optional. Number of threads to use for inference on the CPU (including HETERO and MULTI cases). -enforcebf16="" Optional. By default floating point operations execution in bfloat16 precision are enforced if supported by platform. - 'true' - enable bfloat16 regardless of platform support - 'false' - disable bfloat16 regardless of platform support. - -pin "YES"/"NO"/"NUMA" Optional. Enable threads->cores ("YES", default), threads->(NUMA)nodes ("NUMA") or completely disable ("NO") CPU threads pinning for CPU-involved inference. + -pin "YES"/"HYBRID_AWARE"/"NUMA"/"NO" + Optional. Explicit inference threads binding options (leave empty to let OpenVINO make the choice): + enabling threads->cores pinning ("YES", which is already the default on a conventional CPU), + letting the runtime decide on the threads->different core types ("HYBRID_AWARE", which is the default on hybrid CPUs), + threads->(NUMA)nodes ("NUMA"), or + completely disabling ("NO") CPU inference threads pinning. -ip "U8"/"FP16"/"FP32" Optional. Specifies precision for all input layers of the network. -op "U8"/"FP16"/"FP32" Optional. Specifies precision for all output layers of the network. -iop Optional. Specifies precision for input and output layers by name. Example: -iop "input:FP16, output:FP16". Notice that quotes are required. Overwrites precision from ip and op options for specified layers. - Statistics dumping options: -report_type "" Optional. Enable collecting statistics report.
"no_counters" report contains configuration options specified, resulting FPS and latency. "average_counters" report extends "no_counters" report and additionally includes average PM counters values for each layer from the network. "detailed_counters" report extends "average_counters" report and additionally includes per-layer PM counters and latency for each executed infer request. -report_folder Optional. Path to a folder where statistics report is stored. diff --git a/inference-engine/samples/benchmark_app/benchmark_app.hpp b/inference-engine/samples/benchmark_app/benchmark_app.hpp index 4abad5ee24f..8cc5d19c781 100644 --- a/inference-engine/samples/benchmark_app/benchmark_app.hpp +++ b/inference-engine/samples/benchmark_app/benchmark_app.hpp @@ -73,10 +73,12 @@ static const char batch_size_message[] = "Optional. Batch size value. If not spe "Intermediate Representation."; // @brief message for CPU threads pinning option -static const char infer_threads_pinning_message[] = "Optional. Enable threads->cores (\"YES\", default), threads->(NUMA)nodes (\"NUMA\") " - "or completely disable (\"NO\") " - "CPU threads pinning for CPU-involved inference."; - +static const char infer_threads_pinning_message[] = + "Optional. Explicit inference threads binding options (leave empty to let the OpenVINO to make a choice):\n" + "\t\t\t\tenabling threads->cores pinning(\"YES\", which is already default for any conventional CPU), \n" + "\t\t\t\tletting the runtime to decide on the threads->different core types(\"HYBRID_AWARE\", which is default on the hybrid CPUs) \n" + "\t\t\t\tthreads->(NUMA)nodes(\"NUMA\") or \n" + "\t\t\t\tcompletely disable(\"NO\") CPU inference threads pinning"; // @brief message for stream_output option static const char stream_output_message[] = "Optional. Print progress as a plain text. When specified, an interactive progress bar is " "replaced with a " @@ -187,7 +189,7 @@ DEFINE_bool(enforcebf16, false, enforce_bf16_message); DEFINE_uint32(b, 0, batch_size_message); // @brief Enable plugin messages -DEFINE_string(pin, "YES", infer_threads_pinning_message); +DEFINE_string(pin, "", infer_threads_pinning_message); /// @brief Enables multiline text output instead of progress bar DEFINE_bool(stream_output, false, stream_output_message); @@ -264,7 +266,7 @@ static void showUsage() { std::cout << " -nstreams \"\" " << infer_num_streams_message << std::endl; std::cout << " -nthreads \"\" " << infer_num_threads_message << std::endl; std::cout << " -enforcebf16= " << enforce_bf16_message << std::endl; - std::cout << " -pin \"YES\"/\"NO\"/\"NUMA\" " << infer_threads_pinning_message << std::endl; + std::cout << " -pin \"YES\"/\"HYBRID_AWARE\"/\"NO\"/\"NUMA\" " << infer_threads_pinning_message << std::endl; std::cout << std::endl << " Statistics dumping options:" << std::endl; std::cout << " -report_type \"\" " << report_type_message << std::endl; std::cout << " -report_folder " << report_folder_message << std::endl; diff --git a/inference-engine/samples/benchmark_app/main.cpp b/inference-engine/samples/benchmark_app/main.cpp index 7b540bb60f3..5a34e558cd3 100644 --- a/inference-engine/samples/benchmark_app/main.cpp +++ b/inference-engine/samples/benchmark_app/main.cpp @@ -267,9 +267,6 @@ int main(int argc, char* argv[]) { if ((device_name.find("MULTI") != std::string::npos) && (device_name.find("GPU") != std::string::npos)) { slog::warn << "Turn off threads pinning for " << device << " device since multi-scenario with GPU device is used." 
<< slog::endl; device_config[CONFIG_KEY(CPU_BIND_THREAD)] = CONFIG_VALUE(NO); - } else { - // set to default value - device_config[CONFIG_KEY(CPU_BIND_THREAD)] = FLAGS_pin; } } diff --git a/inference-engine/src/inference_engine/ie_system_conf.cpp b/inference-engine/src/inference_engine/ie_system_conf.cpp index 8b5bbbb0e79..c4fbe597aa3 100644 --- a/inference-engine/src/inference_engine/ie_system_conf.cpp +++ b/inference-engine/src/inference_engine/ie_system_conf.cpp @@ -90,9 +90,9 @@ bool checkOpenMpEnvVars(bool includeOMPNumThreads) { #if defined(__APPLE__) // for Linux and Windows the getNumberOfCPUCores (that accounts only for physical cores) implementation is OS-specific // (see cpp files in corresponding folders), for __APPLE__ it is default : -int getNumberOfCPUCores() { return parallel_get_max_threads();} +int getNumberOfCPUCores(bool) { return parallel_get_max_threads();} #if !((IE_THREAD == IE_THREAD_TBB) || (IE_THREAD == IE_THREAD_TBB_AUTO)) -std::vector getAvailableNUMANodes() { return {0}; } +std::vector getAvailableNUMANodes() { return {-1}; } #endif #endif @@ -100,6 +100,15 @@ std::vector getAvailableNUMANodes() { return {0}; } std::vector getAvailableNUMANodes() { return custom::info::numa_nodes(); } +// this is impl only with the TBB +std::vector getAvailableCoresTypes() { + return custom::info::core_types(); +} +#else +// as the core types support exists only with the TBB, the fallback is same for any other threading API +std::vector getAvailableCoresTypes() { + return {-1}; +} #endif std::exception_ptr& CurrentException() { diff --git a/inference-engine/src/inference_engine/os/lin/lin_system_conf.cpp b/inference-engine/src/inference_engine/os/lin/lin_system_conf.cpp index c90fe681903..fd33bcf2862 100644 --- a/inference-engine/src/inference_engine/os/lin/lin_system_conf.cpp +++ b/inference-engine/src/inference_engine/os/lin/lin_system_conf.cpp @@ -7,11 +7,12 @@ #include #include #include -#include -#include "ie_system_conf.h" -#include "ie_parallel.hpp" -#include "ie_common.h" #include +#include + +#include "ie_common.h" +#include "ie_system_conf.h" +#include "threading/ie_parallel_custom_arena.hpp" namespace InferenceEngine { @@ -61,7 +62,7 @@ std::vector getAvailableNUMANodes() { return nodes; } #endif -int getNumberOfCPUCores() { +int getNumberOfCPUCores(bool bigCoresOnly) { unsigned numberOfProcessors = cpu._processors; unsigned totalNumberOfCpuCores = cpu._cores; IE_ASSERT(totalNumberOfCpuCores != 0); @@ -81,7 +82,16 @@ int getNumberOfCPUCores() { } } } - return CPU_COUNT(¤tCoreSet); + int phys_cores = CPU_COUNT(¤tCoreSet); + #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) + auto core_types = custom::info::core_types(); + if (bigCoresOnly && core_types.size() > 1) /*Hybrid CPU*/ { + phys_cores = custom::info::default_concurrency(custom::task_arena::constraints{} + .set_core_type(core_types.back()) + .set_max_threads_per_core(1)); + } + #endif + return phys_cores; } } // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/os/win/win_system_conf.cpp b/inference-engine/src/inference_engine/os/win/win_system_conf.cpp index 97860492c0a..0ba0c9636d8 100644 --- a/inference-engine/src/inference_engine/os/win/win_system_conf.cpp +++ b/inference-engine/src/inference_engine/os/win/win_system_conf.cpp @@ -10,10 +10,10 @@ #include #include #include "ie_system_conf.h" -#include "ie_parallel.hpp" +#include "threading/ie_parallel_custom_arena.hpp" namespace InferenceEngine { -int getNumberOfCPUCores() { +int getNumberOfCPUCores(bool 
bigCoresOnly) { const int fallback_val = parallel_get_max_threads(); DWORD sz = 0; // querying the size of the resulting structure, passing the nullptr for the buffer @@ -32,12 +32,21 @@ int getNumberOfCPUCores() { offset += reinterpret_cast(ptr.get() + offset)->Size; phys_cores++; } while (offset < sz); + + #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) + auto core_types = custom::info::core_types(); + if (bigCoresOnly && core_types.size() > 1) /*Hybrid CPU*/ { + phys_cores = custom::info::default_concurrency(custom::task_arena::constraints{} + .set_core_type(core_types.back()) + .set_max_threads_per_core(1)); + } + #endif return phys_cores; } #if !(IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) // OMP/SEQ threading on the Windows doesn't support NUMA -std::vector getAvailableNUMANodes() { return std::vector(1, 0); } +std::vector getAvailableNUMANodes() { return {-1}; } #endif } // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/threading/ie_cpu_streams_executor.cpp b/inference-engine/src/inference_engine/threading/ie_cpu_streams_executor.cpp index 5c9bdde9b61..ddd4f9cb2c4 100644 --- a/inference-engine/src/inference_engine/threading/ie_cpu_streams_executor.cpp +++ b/inference-engine/src/inference_engine/threading/ie_cpu_streams_executor.cpp @@ -71,19 +71,30 @@ struct CPUStreamsExecutor::Impl { ((_impl->_config._streams + _impl->_usedNumaNodes.size() - 1)/_impl->_usedNumaNodes.size())) : _impl->_usedNumaNodes.at(_streamId % _impl->_usedNumaNodes.size()); #if IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO - auto concurrency = (0 == _impl->_config._threadsPerStream) ? custom::task_arena::automatic : _impl->_config._threadsPerStream; + const auto concurrency = (0 == _impl->_config._threadsPerStream) ? custom::task_arena::automatic : _impl->_config._threadsPerStream; if (ThreadBindingType::HYBRID_AWARE == _impl->_config._threadBindingType) { - _taskArena.reset(new custom::task_arena{ - custom::task_arena::constraints{} - .set_core_type(custom::info::core_types().back()) - .set_max_concurrency(concurrency) - }); + if (Config::PreferredCoreType::ROUND_ROBIN != _impl->_config._threadPreferredCoreType) { + if (Config::PreferredCoreType::ANY == _impl->_config._threadPreferredCoreType) { + _taskArena.reset(new custom::task_arena{concurrency}); + } else { + const auto selected_core_type = Config::PreferredCoreType::BIG == _impl->_config._threadPreferredCoreType + ? custom::info::core_types().back() // running on Big cores only + : custom::info::core_types().front(); // running on Little cores only + _taskArena.reset(new custom::task_arena{ + custom::task_arena::constraints{}.set_core_type(selected_core_type).set_max_concurrency(concurrency)}); + } + } else { + // assigning the stream to the core type in the round-robin fashion + // wrapping around total_streams (i.e. 
how many streams all different core types can handle together) + const auto total_streams = _impl->total_streams_on_core_types.back().second; + const auto streamId_wrapped = _streamId % total_streams; + const auto& selected_core_type = std::find_if(_impl->total_streams_on_core_types.cbegin(), _impl->total_streams_on_core_types.cend(), + [streamId_wrapped](const decltype(_impl->total_streams_on_core_types)::value_type & p) { return p.second > streamId_wrapped; })->first; + _taskArena.reset(new custom::task_arena{ + custom::task_arena::constraints{}.set_core_type(selected_core_type).set_max_concurrency(concurrency)}); + } } else if (ThreadBindingType::NUMA == _impl->_config._threadBindingType) { - _taskArena.reset(new custom::task_arena{ - custom::task_arena::constraints{} - .set_numa_id(_numaNodeId) - .set_max_concurrency(concurrency) - }); + _taskArena.reset(new custom::task_arena{custom::task_arena::constraints{_numaNodeId, concurrency}}); } else if ((0 != _impl->_config._threadsPerStream) || (ThreadBindingType::CORES == _impl->_config._threadBindingType)) { _taskArena.reset(new custom::task_arena{concurrency}); if (ThreadBindingType::CORES == _impl->_config._threadBindingType) { @@ -164,6 +175,25 @@ struct CPUStreamsExecutor::Impl { } else { _usedNumaNodes = numaNodes; } + #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) + if (ThreadBindingType::HYBRID_AWARE == config._threadBindingType) { + const auto core_types = custom::info::core_types(); + const int threadsPerStream = (0 == config._threadsPerStream) ? std::thread::hardware_concurrency() : config._threadsPerStream; + int sum = 0; + // reversed order, so BIG cores are first + for (auto iter = core_types.rbegin(); iter < core_types.rend(); iter++) { + const auto& type = *iter; + // calculating the #streams per core type + const int num_streams_for_core_type = std::max(1, + custom::info::default_concurrency( + custom::task_arena::constraints{}.set_core_type(type)) / threadsPerStream); + sum += num_streams_for_core_type; + // prefix sum, so the core type for a given stream id will be deduced just as a upper_bound + // (notice that the map keeps the elements in the descending order, so the big cores are populated first) + total_streams_on_core_types.push_back({type, sum}); + } + } + #endif for (auto streamId = 0; streamId < _config._streams; ++streamId) { _threads.emplace_back([this, streamId] { openvino::itt::threadName(_config._name + "_" + std::to_string(streamId)); @@ -232,6 +262,14 @@ struct CPUStreamsExecutor::Impl { bool _isStopped = false; std::vector _usedNumaNodes; ThreadLocal> _streams; + #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) + // stream id mapping to the core type + // stored in the reversed order (so the big cores, with the highest core_type_id value, are populated first) + // every entry is the core type and #streams that this AND ALL EARLIER entries can handle (prefix sum) + // (so mapping is actually just an upper_bound: core type is deduced from the entry for which the id < #streams) + using StreamIdToCoreTypes = std::vector>; + StreamIdToCoreTypes total_streams_on_core_types; + #endif }; diff --git a/inference-engine/src/inference_engine/threading/ie_executor_manager.cpp b/inference-engine/src/inference_engine/threading/ie_executor_manager.cpp index 98b4dab8ecc..0393b7732f5 100644 --- a/inference-engine/src/inference_engine/threading/ie_executor_manager.cpp +++ b/inference-engine/src/inference_engine/threading/ie_executor_manager.cpp @@ -36,6 +36,8 @@ IStreamsExecutor::Ptr 
ExecutorManagerImpl::getIdleCPUStreamsExecutor(const IStre executorConfig._threadBindingType == config._threadBindingType && executorConfig._threadBindingStep == config._threadBindingStep && executorConfig._threadBindingOffset == config._threadBindingOffset) + if (executorConfig._threadBindingType != IStreamsExecutor::ThreadBindingType::HYBRID_AWARE + || executorConfig._threadPreferredCoreType == config._threadPreferredCoreType) return executor; } auto newExec = std::make_shared(config); diff --git a/inference-engine/src/inference_engine/threading/ie_istreams_executor.cpp b/inference-engine/src/inference_engine/threading/ie_istreams_executor.cpp index 96d3860001d..1a2993f3365 100644 --- a/inference-engine/src/inference_engine/threading/ie_istreams_executor.cpp +++ b/inference-engine/src/inference_engine/threading/ie_istreams_executor.cpp @@ -6,6 +6,7 @@ #include "ie_plugin_config.hpp" #include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" #include "ie_parallel.hpp" +#include "ie_parallel_custom_arena.hpp" #include "ie_system_conf.h" #include "ie_parameter.hpp" #include @@ -29,32 +30,27 @@ std::vector IStreamsExecutor::Config::SupportedKeys() { void IStreamsExecutor::Config::SetConfig(const std::string& key, const std::string& value) { if (key == CONFIG_KEY(CPU_BIND_THREAD)) { if (value == CONFIG_VALUE(YES) || value == CONFIG_VALUE(NUMA)) { -#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) && (TBB_INTERFACE_VERSION < 11100) - if (value == CONFIG_VALUE(NUMA)) - IE_THROW() << CONFIG_KEY(CPU_BIND_THREAD) << " property value was set to NUMA. But IE was built with " - << "TBB version without NUMA-aware API. Current TBB API version is " << TBB_INTERFACE_VERSION - << ", required API version 11100 or greater."; -#endif - -#if (defined(__APPLE__) || defined(_WIN32)) - // on the Windows and Apple the CORES and NUMA pinning options are the same + #if (defined(__APPLE__) || defined(_WIN32)) _threadBindingType = IStreamsExecutor::ThreadBindingType::NUMA; -#else + #else _threadBindingType = (value == CONFIG_VALUE(YES)) ? IStreamsExecutor::ThreadBindingType::CORES : IStreamsExecutor::ThreadBindingType::NUMA; -#endif + #endif + } else if (value == CONFIG_VALUE(HYBRID_AWARE)) { + _threadBindingType = IStreamsExecutor::ThreadBindingType::HYBRID_AWARE; } else if (value == CONFIG_VALUE(NO)) { _threadBindingType = IStreamsExecutor::ThreadBindingType::NONE; } else { IE_THROW() << "Wrong value for property key " << CONFIG_KEY(CPU_BIND_THREAD) - << ". Expected only YES(binds to cores) / NO(no binding) / NUMA(binds to NUMA nodes)"; + << ". Expected only YES(binds to cores) / NO(no binding) / NUMA(binds to NUMA nodes) / " + "HYBRID_AWARE (let the runtime recognize and use the hybrid cores)"; } } else if (key == CONFIG_KEY(CPU_THROUGHPUT_STREAMS)) { if (value == CONFIG_VALUE(CPU_THROUGHPUT_NUMA)) { _streams = static_cast(getAvailableNUMANodes().size()); } else if (value == CONFIG_VALUE(CPU_THROUGHPUT_AUTO)) { const int sockets = static_cast(getAvailableNUMANodes().size()); - // bare minimum of streams (that evenly divides available number of core) + // bare minimum of streams (that evenly divides available number of cores) const int num_cores = sockets == 1 ? 
std::thread::hardware_concurrency() : getNumberOfCPUCores(); if (0 == num_cores % 4) _streams = std::max(4, num_cores / 4); @@ -138,12 +134,52 @@ Parameter IStreamsExecutor::Config::GetConfig(const std::string& key) { return {}; } -IStreamsExecutor::Config IStreamsExecutor::Config::MakeDefaultMultiThreaded(const IStreamsExecutor::Config& initial) { +IStreamsExecutor::Config IStreamsExecutor::Config::MakeDefaultMultiThreaded(const IStreamsExecutor::Config& initial, const bool fp_intesive) { const auto envThreads = parallel_get_env_threads(); const auto& numaNodes = getAvailableNUMANodes(); - const auto numaNodesNum = numaNodes.size(); + const int numaNodesNum = numaNodes.size(); auto streamExecutorConfig = initial; - const auto hwCores = streamExecutorConfig._streams > 1 && numaNodesNum == 1 ? parallel_get_max_threads() : getNumberOfCPUCores(); + const bool bLatencyCase = streamExecutorConfig._streams <= numaNodesNum; + + // by default, do not use the hyper-threading (to minimize threads synch overheads) + int num_cores_default = getNumberOfCPUCores(); + #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) + //additional latency-case logic for hybrid processors: + if (ThreadBindingType::HYBRID_AWARE == streamExecutorConfig._threadBindingType) { + const auto core_types = custom::info::core_types(); + const auto num_little_cores = custom::info::default_concurrency(custom::task_arena::constraints{}.set_core_type(core_types.front())); + const auto num_big_cores_phys = getNumberOfCPUCores(true); + const int int8_threshold = 4; // ~relative efficiency of the VNNI-intensive code for Big vs Little cores; + const int fp32_threshold = 2; // ~relative efficiency of the AVX2 fp32 code for Big vs Little cores; + // by default the latency case uses (faster) Big cores only, depending on the compute ratio + const bool bLatencyCaseBigOnly = num_big_cores_phys > (num_little_cores / (fp_intesive ? fp32_threshold : int8_threshold)); + // selecting the preferred core type + streamExecutorConfig._threadPreferredCoreType = + bLatencyCase + ? (bLatencyCaseBigOnly + ? IStreamsExecutor::Config::PreferredCoreType::BIG + : IStreamsExecutor::Config::PreferredCoreType::ANY) + : IStreamsExecutor::Config::PreferredCoreType::ROUND_ROBIN; + // additionally selecting the #cores to use in the "Big-only" case + if (bLatencyCaseBigOnly) { + const int hyper_threading_threshold = 2; // min #cores, for which the hyper-threading becomes useful for the latency case + const auto num_big_cores = custom::info::default_concurrency(custom::task_arena::constraints{}.set_core_type(core_types.back())); + num_cores_default = (num_big_cores_phys <= hyper_threading_threshold) ? num_big_cores : num_big_cores_phys; + } + } + #endif + const auto hwCores = !bLatencyCase && numaNodesNum == 1 + // throughput case on a single-NUMA node machine uses all available cores + ? parallel_get_max_threads() + // in the rest of cases: + // multi-node machine + // or + // latency case, single-node yet hybrid case that uses + // all core types + // or + // big-cores only, but the #cores is "enough" (pls see the logic above) + // it is usually beneficial not to use the hyper-threading (which is default) + : num_cores_default; const auto threads = streamExecutorConfig._threads ? streamExecutorConfig._threads : (envThreads ? envThreads : hwCores); streamExecutorConfig._threadsPerStream = streamExecutorConfig._streams ? 
std::max(1, threads/streamExecutorConfig._streams) diff --git a/inference-engine/src/mkldnn_plugin/config.cpp b/inference-engine/src/mkldnn_plugin/config.cpp index 920a5ff2788..16eb4c9e130 100644 --- a/inference-engine/src/mkldnn_plugin/config.cpp +++ b/inference-engine/src/mkldnn_plugin/config.cpp @@ -13,7 +13,6 @@ #include "ie_parallel.hpp" #include "ie_system_conf.h" -#include #include namespace MKLDNNPlugin { @@ -21,16 +20,20 @@ namespace MKLDNNPlugin { using namespace InferenceEngine; Config::Config() { -#if (defined(__APPLE__) || defined(_WIN32)) -#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) && (TBB_INTERFACE_VERSION >= 11100) - // If we sure that TBB has NUMA aware API part. - streamExecutorConfig._threadBindingType = InferenceEngine::IStreamsExecutor::NUMA; -#else - streamExecutorConfig._threadBindingType = InferenceEngine::IStreamsExecutor::NONE; -#endif -#else + // this is default mode streamExecutorConfig._threadBindingType = InferenceEngine::IStreamsExecutor::CORES; -#endif + + // for the TBB code-path, additional configuration depending on the OS and CPU types + #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) + #if defined(__APPLE__) || defined(_WIN32) + // 'CORES' is not implemented for Win/MacOS; so the 'NUMA' is default + streamExecutorConfig._threadBindingType = InferenceEngine::IStreamsExecutor::NUMA; + #endif + + if (getAvailableCoresTypes().size() > 1 /*Hybrid CPU*/) { + streamExecutorConfig._threadBindingType = InferenceEngine::IStreamsExecutor::HYBRID_AWARE; + } + #endif if (!with_cpu_x86_bfloat16()) enforceBF16 = false; @@ -128,7 +131,7 @@ void Config::updateProperties() { _config.insert({ PluginConfigParams::KEY_CPU_BIND_THREAD, PluginConfigParams::NUMA }); break; case IStreamsExecutor::ThreadBindingType::HYBRID_AWARE: - _config.insert({ PluginConfigParams::KEY_CPU_BIND_THREAD, PluginConfigParams::HYBRID_AWARE}); + _config.insert({ PluginConfigParams::KEY_CPU_BIND_THREAD, PluginConfigParams::HYBRID_AWARE }); break; } if (collectPerfCounters == true) diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_exec_network.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_exec_network.cpp index f7f02a57615..156317c1d73 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_exec_network.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_exec_network.cpp @@ -49,11 +49,11 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::CNNNetwork &network, // we are cloning network if we have statistics and we can transform network. _clonedNetwork = cloneNetwork(network); + bool isFloatModel = true; if (_cfg.lpTransformsMode == Config::LPTransformsMode::On) { // Check if network is INT8 or Binary. // BF16 transformations were disabled since CPU plug-in doesn't support mixed precision execution: // BF16 + INT8 or BF16 + BIN. 
- bool isFloatModel = true; CNNNetworkIterator iter(network); while (iter != CNNNetworkIterator()) { if (CaselessEq()((*iter)->type, "FakeQuantize")) { @@ -229,7 +229,7 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::CNNNetwork &network, // special case when all InferRequests are muxed into a single queue _taskExecutor = InferenceEngine::ExecutorManager::getInstance()->getExecutor("CPU"); } else { - auto streamsExecutorConfig = InferenceEngine::IStreamsExecutor::Config::MakeDefaultMultiThreaded(_cfg.streamExecutorConfig); + auto streamsExecutorConfig = InferenceEngine::IStreamsExecutor::Config::MakeDefaultMultiThreaded(_cfg.streamExecutorConfig, isFloatModel); streamsExecutorConfig._name = "CPUStreamsExecutor"; _taskExecutor = InferenceEngine::ExecutorManager::getInstance()->getIdleCPUStreamsExecutor(streamsExecutorConfig); } diff --git a/inference-engine/src/plugin_api/ie_system_conf.h b/inference-engine/src/plugin_api/ie_system_conf.h index 981b4dda961..93d633c35f9 100644 --- a/inference-engine/src/plugin_api/ie_system_conf.h +++ b/inference-engine/src/plugin_api/ie_system_conf.h @@ -37,12 +37,23 @@ INFERENCE_ENGINE_API_CPP(bool) checkOpenMpEnvVars(bool includeOMPNumThreads = tr INFERENCE_ENGINE_API_CPP(std::vector) getAvailableNUMANodes(); /** - * @brief Returns number of CPU physical cores on Linux/Windows (which is considered to be more performance friendly for servers) - * (on other OSes it simply relies on the original parallel API of choice, which usually uses the logical cores ) + * @brief Returns the available CPU core types (on Linux and Windows, and ONLY with TBB); a single core type is assumed otherwise * @ingroup ie_dev_api_system_conf + * @return Vector of core types + */ +INFERENCE_ENGINE_API_CPP(std::vector) getAvailableCoresTypes(); + +/** + * @brief Returns number of CPU physical cores on Linux/Windows (which is considered to be more performance friendly for servers) + * (on other OSes it simply relies on the original parallel API of choice, which usually uses the logical cores). + * call the function with 'false' to get the #phys cores of all types, + * call it with 'true' to get the #phys 'Big' cores only, + * number of 'Little' = 'all' - 'Big' + * @ingroup ie_dev_api_system_conf + * @param[in] bigCoresOnly Additionally limits the number of reported cores to the 'Big' cores only. * @return Number of physical CPU cores.
*/ -INFERENCE_ENGINE_API_CPP(int) getNumberOfCPUCores(); +INFERENCE_ENGINE_API_CPP(int) getNumberOfCPUCores(bool bigCoresOnly = false); /** * @brief Checks whether CPU supports SSE 4.2 capability diff --git a/inference-engine/src/plugin_api/threading/ie_istreams_executor.hpp b/inference-engine/src/plugin_api/threading/ie_istreams_executor.hpp index 67a601bb772..4dd80f411bc 100644 --- a/inference-engine/src/plugin_api/threading/ie_istreams_executor.hpp +++ b/inference-engine/src/plugin_api/threading/ie_istreams_executor.hpp @@ -36,7 +36,7 @@ public: using Ptr = std::shared_ptr; /** - * @brief Defines thread binding type + * @brief Defines inference thread binding type */ enum ThreadBindingType : std::uint8_t { NONE, //!< Don't bind the inference threads @@ -74,9 +74,11 @@ public: * @brief Create appropriate multithreaded configuration * filing unconfigured values from initial configuration using hardware properties * @param initial Inital configuration + * @param fp_intesive additional hint for the (Hybrid) core-types selection logic: + * whether the executor should be configured for floating-point-intensive work (as opposed to int8-intensive) * @return configured values */ - static Config MakeDefaultMultiThreaded(const Config& initial); + static Config MakeDefaultMultiThreaded(const Config& initial, const bool fp_intesive = true); std::string _name; //!< Used by `ITT` to name executor threads int _streams = 1; //!< Number of streams. @@ -85,6 +87,12 @@ public: int _threadBindingStep = 1; //!< In case of @ref CORES binding offset type thread binded to cores with defined step int _threadBindingOffset = 0; //!< In case of @ref CORES binding offset type thread binded to cores starting from offset int _threads = 0; //!< Number of threads distributed between streams. Reserved. Should not be used.
+ enum PreferredCoreType { + ANY, + LITTLE, + BIG, + ROUND_ROBIN // used w/multiple streams to populate the Big cores first, then the Little, then wrap around (for large #streams) + } _threadPreferredCoreType = PreferredCoreType::ANY; //!< In case of @ref HYBRID_AWARE hints the TBB runtime about the preferred core type /** * @brief A constructor with arguments * @param[in] name @copybrief Config::_name * @param[in] streams @copybrief Config::_streams * @param[in] threadsPerStream @copybrief Config::_threadsPerStream * @param[in] threadBindingType @copybrief Config::_threadBindingType * @param[in] threadBindingStep @copybrief Config::_threadBindingStep * @param[in] threadBindingOffset @copybrief Config::_threadBindingOffset * @param[in] threads @copybrief Config::_threads + * @param[in] threadPreferredCoreType @copybrief Config::_threadPreferredCoreType */ Config( std::string name = "StreamsExecutor", int streams = 1, int threadsPerStream = 0, ThreadBindingType threadBindingType = ThreadBindingType::NONE, int threadBindingStep = 1, int threadBindingOffset = 0, - int threads = 0) : + int threads = 0, + PreferredCoreType threadPreferredCoreType = PreferredCoreType::ANY) : _name{name}, _streams{streams}, _threadsPerStream{threadsPerStream}, _threadBindingType{threadBindingType}, _threadBindingStep{threadBindingStep}, _threadBindingOffset{threadBindingOffset}, - _threads{threads} { + _threads{threads}, _threadPreferredCoreType(threadPreferredCoreType) { } }; diff --git a/tools/benchmark/main.py b/tools/benchmark/main.py index bdfe296e331..998e95638fb 100644 --- a/tools/benchmark/main.py +++ b/tools/benchmark/main.py @@ -142,9 +142,6 @@ def run(args): logger.warning(f"Turn off threads pinning for {device} " + "device since multi-scenario with GPU device is used.") config[device]['CPU_BIND_THREAD'] = 'NO' - else: - ## set to default value - config[device]['CPU_BIND_THREAD'] = args.infer_threads_pinning ## for CPU execution, more throughput-oriented execution via streams set_throughput_streams() diff --git a/tools/benchmark/parameters.py b/tools/benchmark/parameters.py index 1a4d8b84d6a..ca29fb5158c 100644 --- a/tools/benchmark/parameters.py +++ b/tools/benchmark/parameters.py @@ -91,8 +91,11 @@ def parse_args(): args.add_argument('-nthreads', '--number_threads', type=int, required=False, default=None, help='Number of threads to use for inference on the CPU, GNA ' '(including HETERO and MULTI cases).') - args.add_argument('-pin', '--infer_threads_pinning', type=str, required=False, default='YES', choices=['YES', 'NO', 'NUMA'], - help='Optional. Enable threads->cores (\'YES\' is default value), threads->(NUMA)nodes (\'NUMA\') or completely disable (\'NO\')' + args.add_argument('-pin', '--infer_threads_pinning', type=str, required=False, choices=['YES', 'NO', 'NUMA', 'HYBRID_AWARE'], + help='Optional. Enable threads->cores (\'YES\', which is OpenVINO runtime\'s default for conventional CPUs), ' + 'threads->(NUMA)nodes (\'NUMA\'), ' + 'threads->appropriate core types (\'HYBRID_AWARE\', which is OpenVINO runtime\'s default for Hybrid CPUs), ' + 'or completely disable (\'NO\') ' 'CPU threads pinning for CPU-involved inference.') args.add_argument('-exec_graph_path', '--exec_graph_path', type=str, required=False, help='Optional. Path to a file where to store executable graph information serialized.')
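
Reviewer note (not part of the patch): the two snippets below are illustrative sketches only; they are not taken from any of the files above, and names such as "model.xml" are placeholders.

First, a minimal sketch of how the new HYBRID_AWARE value of the CPU_BIND_THREAD key introduced in ie_plugin_config.hpp could be requested explicitly through the public Core API (leaving the key unset keeps the automatic choice made in config.cpp: HYBRID_AWARE on hybrid CPUs, CORES/NUMA otherwise):

#include <inference_engine.hpp>

int main() {
    InferenceEngine::Core core;
    // Explicitly request hybrid-aware pinning for the CPU plugin;
    // CONFIG_KEY(CPU_BIND_THREAD) and CONFIG_VALUE(HYBRID_AWARE) come from ie_plugin_config.hpp.
    core.SetConfig({{CONFIG_KEY(CPU_BIND_THREAD), CONFIG_VALUE(HYBRID_AWARE)}}, "CPU");
    auto network = core.ReadNetwork("model.xml");           // placeholder model path
    auto exec_network = core.LoadNetwork(network, "CPU");
    auto request = exec_network.CreateInferRequest();
    request.Infer();
    return 0;
}

Second, a standalone sketch of the round-robin stream-to-core-type mapping used in ie_cpu_streams_executor.cpp: the table keeps {core type, prefix sum of streams} with the Big cores first, and the core type for a stream is the first entry whose running total exceeds the wrapped stream id. The core-type ids and per-type stream counts below are made up for illustration:

#include <algorithm>
#include <iostream>
#include <utility>
#include <vector>

int main() {
    // Hypothetical hybrid CPU: the Big cores (type id 1) fit 4 streams, the Little cores (type id 0) fit 2 more.
    std::vector<std::pair<int, int>> total_streams_on_core_types{{1, 4}, {0, 4 + 2}};
    const int total_streams = total_streams_on_core_types.back().second;
    for (int streamId = 0; streamId < 8; ++streamId) {
        const int wrapped = streamId % total_streams;  // wrap around for large numbers of streams
        const auto it = std::find_if(total_streams_on_core_types.cbegin(), total_streams_on_core_types.cend(),
                                     [wrapped](const std::pair<int, int>& p) { return p.second > wrapped; });
        std::cout << "stream " << streamId << " -> core type " << it->first << "\n";  // streams 0-3 -> Big, 4-5 -> Little, then wraps
    }
    return 0;
}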