From acecf31642ab453bf52d938f46c9a7449fcaa177 Mon Sep 17 00:00:00 2001 From: Wanglei Shen Date: Tue, 28 Nov 2023 14:41:56 +0800 Subject: [PATCH] [CPU][ARM] Enable multi-stream execution (#21009) --- src/plugins/intel_cpu/src/config.cpp | 5 --- .../intel_cpu/src/cpu_streams_calculation.cpp | 35 ++++++++++++++++--- .../ov_executable_network/properties.cpp | 5 --- 3 files changed, 31 insertions(+), 14 deletions(-) diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp index dcaf9e09c84..df2304c50e8 100644 --- a/src/plugins/intel_cpu/src/config.cpp +++ b/src/plugins/intel_cpu/src/config.cpp @@ -343,11 +343,6 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { streamExecutorConfig._streams_changed = true; } -#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) - // TODO: multi-stream execution has functional issues on ARM target - streamExecutorConfig._streams = 1; - streamExecutorConfig._streams_changed = true; -#endif this->modelType = modelType; CPU_DEBUG_CAP_ENABLE(applyDebugCapsProperties()); diff --git a/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp b/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp index 3430abe5c4d..0d22db15461 100644 --- a/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp +++ b/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp @@ -444,22 +444,49 @@ int get_model_prefer_threads(const int num_streams, const float L2_cache_size = dnnl::utils::get_cache_size(2 /*level*/, true /*per core */); ov::MemBandwidthPressure networkToleranceForLowCache = ov::MemBandwidthPressureTolerance(model, L2_cache_size, memThresholdAssumeLimitedForISA); - config.modelPreferThreads = ov::threading::IStreamsExecutor::Config::StreamMode::DEFAULT; + +#if (defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)) && defined(__APPLE__) + config.modelPreferThreads = 1; if (networkToleranceForLowCache.max_mem_tolerance == ov::MemBandwidthPressure::UNKNOWN) { if ((networkToleranceForLowCache.ratio_compute_convs == ov::MemBandwidthPressure::ALL) || (networkToleranceForLowCache.ratio_compute_deconvs == ov::MemBandwidthPressure::ALL)) { // all relevant layers (convs, etc) are compute-limited, the most aggressive val for #streams - config.modelPreferThreads = 1; + config.modelPreferThreads = 4; } // otherwise (no recognized layers) falling back to the default value } else if (networkToleranceForLowCache.max_mem_tolerance > memThresholdAssumeLimitedForISA) { // network is below the ISA-specific threshold config.modelPreferThreads = 1; } else if (networkToleranceForLowCache.max_mem_tolerance > ov::MemBandwidthPressure::LIMITED) { // network is below general threshold + config.modelPreferThreads = 1; + } else if (networkToleranceForLowCache.ratio_mem_limited_deconvs > ov::MemBandwidthPressure::LIMITED && + networkToleranceForLowCache.ratio_compute_convs < ov::MemBandwidthPressure::ALL) { + config.modelPreferThreads = 4; + } else if (networkToleranceForLowCache.ratio_mem_limited_deconvs <= ov::MemBandwidthPressure::LIMITED && + networkToleranceForLowCache.ratio_mem_limited_convs <= ov::MemBandwidthPressure::LIMITED && + networkToleranceForLowCache.ratio_compute_convs > ov::MemBandwidthPressure::LIMITED) { config.modelPreferThreads = 2; } - if (config.modelPreferThreads == 1 && proc_type_table[0][EFFICIENT_CORE_PROC] == 0 && sockets == 1) { - config.modelPreferThreads = 2; +#endif + + if (-1 == config.modelPreferThreads) { + config.modelPreferThreads = ov::threading::IStreamsExecutor::Config::StreamMode::DEFAULT; + if (networkToleranceForLowCache.max_mem_tolerance == ov::MemBandwidthPressure::UNKNOWN) { + if ((networkToleranceForLowCache.ratio_compute_convs == ov::MemBandwidthPressure::ALL) || + (networkToleranceForLowCache.ratio_compute_deconvs == ov::MemBandwidthPressure::ALL)) { + // all relevant layers (convs, etc) are compute-limited, the most aggressive val for #streams + config.modelPreferThreads = 1; + } // otherwise (no recognized layers) falling back to the default value + } else if (networkToleranceForLowCache.max_mem_tolerance > memThresholdAssumeLimitedForISA) { + // network is below the ISA-specific threshold + config.modelPreferThreads = 1; + } else if (networkToleranceForLowCache.max_mem_tolerance > ov::MemBandwidthPressure::LIMITED) { + // network is below general threshold + config.modelPreferThreads = 2; + } + if (config.modelPreferThreads == 1 && proc_type_table[0][EFFICIENT_CORE_PROC] == 0 && sockets == 1) { + config.modelPreferThreads = 2; + } } } diff --git a/src/plugins/intel_cpu/tests/functional/behavior/ov_executable_network/properties.cpp b/src/plugins/intel_cpu/tests/functional/behavior/ov_executable_network/properties.cpp index 6099648bca5..df70d78c11a 100644 --- a/src/plugins/intel_cpu/tests/functional/behavior/ov_executable_network/properties.cpp +++ b/src/plugins/intel_cpu/tests/functional/behavior/ov_executable_network/properties.cpp @@ -147,11 +147,6 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckModelZeroStreams) { ASSERT_NO_THROW(value = compiledModel.get_property(ov::num_streams)); -#if defined(OPENVINO_ARCH_ARM) || \ - defined(OPENVINO_ARCH_ARM64) // Will be removed after multiple streams is supported on ARM - streams = 1; -#endif - ASSERT_EQ(streams, value); }