From acecf31642ab453bf52d938f46c9a7449fcaa177 Mon Sep 17 00:00:00 2001
From: Wanglei Shen <wanglei.shen@intel.com>
Date: Tue, 28 Nov 2023 14:41:56 +0800
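Subject: [PATCH] [CPU][ARM] Enable multi-stream execution (#21009)

Remove the ARM/ARM64-only override in Config::readProperties that forced
streamExecutorConfig._streams to 1, so the configured number of streams is
no longer overwritten on ARM targets.

In get_model_prefer_threads, add a heuristic compiled only for ARM/ARM64
builds on Apple platforms that picks modelPreferThreads (1, 2 or 4) from
the model's memory-bandwidth-pressure metrics. The existing generic
heuristic is kept, but now runs only when modelPreferThreads is still -1
after that step.

Drop the ARM-specific expectation (streams = 1) from the
smoke_CpuExecNetworkCheckModelZeroStreams functional test.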

---
 src/plugins/intel_cpu/src/config.cpp          |  5 ---
 .../intel_cpu/src/cpu_streams_calculation.cpp | 35 ++++++++++++++++---
 .../ov_executable_network/properties.cpp      |  5 ---
 3 files changed, 31 insertions(+), 14 deletions(-)

diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp
index dcaf9e09c84..df2304c50e8 100644
--- a/src/plugins/intel_cpu/src/config.cpp
+++ b/src/plugins/intel_cpu/src/config.cpp
@@ -343,11 +343,6 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
         streamExecutorConfig._streams_changed = true;
     }
 
-#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
-    // TODO: multi-stream execution has functional issues on ARM target
-    streamExecutorConfig._streams = 1;
-    streamExecutorConfig._streams_changed = true;
-#endif
     this->modelType = modelType;
 
     CPU_DEBUG_CAP_ENABLE(applyDebugCapsProperties());
diff --git a/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp b/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp
index 3430abe5c4d..0d22db15461 100644
--- a/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp
+++ b/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp
@@ -444,22 +444,49 @@ int get_model_prefer_threads(const int num_streams,
         const float L2_cache_size = dnnl::utils::get_cache_size(2 /*level*/, true /*per core */);
         ov::MemBandwidthPressure networkToleranceForLowCache =
             ov::MemBandwidthPressureTolerance(model, L2_cache_size, memThresholdAssumeLimitedForISA);
-        config.modelPreferThreads = ov::threading::IStreamsExecutor::Config::StreamMode::DEFAULT;
+
+#if (defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)) && defined(__APPLE__)
+        config.modelPreferThreads = 1;
         if (networkToleranceForLowCache.max_mem_tolerance == ov::MemBandwidthPressure::UNKNOWN) {
             if ((networkToleranceForLowCache.ratio_compute_convs == ov::MemBandwidthPressure::ALL) ||
                 (networkToleranceForLowCache.ratio_compute_deconvs == ov::MemBandwidthPressure::ALL)) {
                 // all relevant layers (convs, etc) are compute-limited, the most aggressive val for #streams
-                config.modelPreferThreads = 1;
+                config.modelPreferThreads = 4;
             }  // otherwise (no recognized layers) falling back to the default value
         } else if (networkToleranceForLowCache.max_mem_tolerance > memThresholdAssumeLimitedForISA) {
             // network is below the ISA-specific threshold
             config.modelPreferThreads = 1;
         } else if (networkToleranceForLowCache.max_mem_tolerance > ov::MemBandwidthPressure::LIMITED) {
             // network is below general threshold
+            config.modelPreferThreads = 1;
+        } else if (networkToleranceForLowCache.ratio_mem_limited_deconvs > ov::MemBandwidthPressure::LIMITED &&
+                   networkToleranceForLowCache.ratio_compute_convs < ov::MemBandwidthPressure::ALL) {
+            config.modelPreferThreads = 4;
+        } else if (networkToleranceForLowCache.ratio_mem_limited_deconvs <= ov::MemBandwidthPressure::LIMITED &&
+                   networkToleranceForLowCache.ratio_mem_limited_convs <= ov::MemBandwidthPressure::LIMITED &&
+                   networkToleranceForLowCache.ratio_compute_convs > ov::MemBandwidthPressure::LIMITED) {
             config.modelPreferThreads = 2;
         }
-        if (config.modelPreferThreads == 1 && proc_type_table[0][EFFICIENT_CORE_PROC] == 0 && sockets == 1) {
-            config.modelPreferThreads = 2;
+#endif
+
+        if (-1 == config.modelPreferThreads) {
+            config.modelPreferThreads = ov::threading::IStreamsExecutor::Config::StreamMode::DEFAULT;
+            if (networkToleranceForLowCache.max_mem_tolerance == ov::MemBandwidthPressure::UNKNOWN) {
+                if ((networkToleranceForLowCache.ratio_compute_convs == ov::MemBandwidthPressure::ALL) ||
+                    (networkToleranceForLowCache.ratio_compute_deconvs == ov::MemBandwidthPressure::ALL)) {
+                    // all relevant layers (convs, etc) are compute-limited, the most aggressive val for #streams
+                    config.modelPreferThreads = 1;
+                }  // otherwise (no recognized layers) falling back to the default value
+            } else if (networkToleranceForLowCache.max_mem_tolerance > memThresholdAssumeLimitedForISA) {
+                // network is below the ISA-specific threshold
+                config.modelPreferThreads = 1;
+            } else if (networkToleranceForLowCache.max_mem_tolerance > ov::MemBandwidthPressure::LIMITED) {
+                // network is below general threshold
+                config.modelPreferThreads = 2;
+            }
+            if (config.modelPreferThreads == 1 && proc_type_table[0][EFFICIENT_CORE_PROC] == 0 && sockets == 1) {
+                config.modelPreferThreads = 2;
+            }
         }
     }
 
diff --git a/src/plugins/intel_cpu/tests/functional/behavior/ov_executable_network/properties.cpp b/src/plugins/intel_cpu/tests/functional/behavior/ov_executable_network/properties.cpp
index 6099648bca5..df70d78c11a 100644
--- a/src/plugins/intel_cpu/tests/functional/behavior/ov_executable_network/properties.cpp
+++ b/src/plugins/intel_cpu/tests/functional/behavior/ov_executable_network/properties.cpp
@@ -147,11 +147,6 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckModelZeroStreams) {
 
     ASSERT_NO_THROW(value = compiledModel.get_property(ov::num_streams));
 
-#if defined(OPENVINO_ARCH_ARM) || \
-    defined(OPENVINO_ARCH_ARM64)  // Will be removed after multiple streams is supported on ARM
-    streams = 1;
-#endif
-
     ASSERT_EQ(streams, value);
 }