[CPU] Fix performance hint property handling (#10351)
This commit is contained in:
parent 2d3bd40c3d
commit 3fcff15166
@@ -313,10 +313,10 @@ InferenceEngine::Parameter MKLDNNExecNetwork::GetMetric(const std::string &name)
         return graph.dump()->get_friendly_name();
     } else if (name == ov::optimal_number_of_infer_requests) {
         const auto streams = config.streamExecutorConfig._streams;
-        return static_cast<uint32_t>(streams); // ov::optimal_number_of_infer_requests has no negative values
+        return decltype(ov::optimal_number_of_infer_requests)::value_type(streams); // ov::optimal_number_of_infer_requests has no negative values
     } else if (name == ov::num_streams) {
         const auto streams = config.streamExecutorConfig._streams;
-        return static_cast<int32_t>(streams); // ov::num_streams has special negative values (AUTO = -1, NUMA = -2)
+        return decltype(ov::num_streams)::value_type(streams); // ov::num_streams has special negative values (AUTO = -1, NUMA = -2)
     } else if (name == ov::affinity) {
         const auto affinity = config.streamExecutorConfig._threadBindingType;
         switch (affinity) {
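The decltype(property)::value_type pattern above ties the returned value's type to the property declaration itself, so a later change to the property's declared type cannot silently diverge from the cast. A minimal, self-contained sketch of the idea; the Property template below is a simplified stand-in assumed only for illustration, not the real ov::Property (which also encodes mutability and more):

#include <cstdint>
#include <iostream>

// Simplified stand-in for ov::Property<T>; value_type is the canonical
// C++ type of the property's value.
template <typename T>
struct Property {
    using value_type = T;
    const char* name;
};

inline constexpr Property<uint32_t> optimal_number_of_infer_requests{"OPTIMAL_NUMBER_OF_INFER_REQUESTS"};
inline constexpr Property<int32_t> num_streams{"NUM_STREAMS"};

int main() {
    int streams = -1;  // e.g. a special negative value such as AUTO
    // If the property's declared type ever changes, this conversion follows it
    // automatically; a hard-coded static_cast<int32_t> would not.
    auto n = decltype(num_streams)::value_type(streams);
    std::cout << num_streams.name << " = " << n << "\n";
    return 0;
}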
@@ -539,6 +539,90 @@ static void Transformation(CNNNetwork& clonedNetwork, const bool _enableLPT, con
     ConvertToCPUSpecificOpset(nGraphFunc);
 }
 
+static bool streamsSet(const std::map<std::string, std::string>& config) {
+    return config.count(PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS) ||
+           config.count(ov::num_streams.name());
+}
+
+void Engine::ApplyPerformanceHints(std::map<std::string, std::string> &config, const std::shared_ptr<ngraph::Function>& ngraphFunc) const {
+    const bool streamsExplicitlySetForModel = streamsSet(config);
+    // checking streams (to avoid overriding what the user might have explicitly set in the incoming config or previously via SetConfig)
+    if (streamsExplicitlySetForModel ||
+        streamsExplicitlySetForEngine)
+        return;
+
+    const auto& mode = config.find(CONFIG_KEY(PERFORMANCE_HINT));
+    // the mode may have just arrived with the LoadNetwork call, or was set earlier via the plugin's SetConfig
+    if (mode == config.end() && engConfig.perfHintsConfig.ovPerfHint.empty())
+        return;
+    /* performance hints set for the network have higher priority than the engine ones.
+     * This applies to all the configuration parameters */
+    const auto mode_name = (mode != config.end()) ?
+        PerfHintsConfig::CheckPerformanceHintValue(mode->second) :
+        engConfig.perfHintsConfig.ovPerfHint;
+
+    if (mode_name == CONFIG_VALUE(LATENCY)) {
+        config[CONFIG_KEY(CPU_THROUGHPUT_STREAMS)] = CONFIG_VALUE(CPU_THROUGHPUT_NUMA);
+    } else if (mode_name == CONFIG_VALUE(THROUGHPUT)) {
+        const auto isa = dnnl::get_effective_cpu_isa();
+        float isaSpecificThreshold = 1.0f;
+        switch (isa) {
+        case dnnl::cpu_isa::sse41:
+            isaSpecificThreshold = 0.5f;
+            break;
+        case dnnl::cpu_isa::avx2:
+        case dnnl::cpu_isa::avx512_core:
+            isaSpecificThreshold = 1.0f;
+            break;
+        case dnnl::cpu_isa::avx512_core_vnni:
+        case dnnl::cpu_isa::avx2_vnni:
+            isaSpecificThreshold = 2.0f;
+            break;
+        case dnnl::cpu_isa::avx512_core_amx:
+            isaSpecificThreshold = 4.0f;
+            break;
+        default:
+            isaSpecificThreshold = 1.0f;
+        }
+        // the more "capable" the CPU is in general, the more streams we may want to keep it utilized
+        const float memThresholdAssumeLimitedForISA = ov::MemBandwidthPressure::LIMITED / isaSpecificThreshold;
+        const float L2_cache_size = mkldnn::utils::get_cache_size(2 /*level*/, true /*per core*/);
+        ov::MemBandwidthPressure networkToleranceForLowCache = ov::MemBandwidthPressureTolerance(
+            ngraphFunc,
+            L2_cache_size, memThresholdAssumeLimitedForISA);
+        // number of physical CPU cores (the most aggressive value for #streams)
+        const auto num_cores = getNumberOfCPUCores();
+        // less aggressive
+        const auto num_streams_less_aggressive = num_cores / 2;
+        // default #streams value (the most conservative)
+        const auto default_num_streams = IStreamsExecutor::Config::GetDefaultNumStreams();
+        int num_streams = default_num_streams;
+        if (networkToleranceForLowCache.max_mem_tolerance == ov::MemBandwidthPressure::UNKNOWN) {
+            if ((networkToleranceForLowCache.ratio_compute_convs == ov::MemBandwidthPressure::ALL)
+                || (networkToleranceForLowCache.ratio_compute_deconvs == ov::MemBandwidthPressure::ALL)) {
+                // all relevant layers (convs, etc.) are compute-limited: the most aggressive value for #streams
+                num_streams = num_cores;
+            } // otherwise (no recognized layers) fall back to the default value
+        } else if (networkToleranceForLowCache.max_mem_tolerance > memThresholdAssumeLimitedForISA) {
+            // the network is below the ISA-specific threshold
+            num_streams = num_cores;
+        } else if (networkToleranceForLowCache.max_mem_tolerance > ov::MemBandwidthPressure::LIMITED) {
+            // the network is below the general threshold
+            num_streams = std::max(default_num_streams, num_streams_less_aggressive);
+        }
+        auto num_requests = config.find(CONFIG_KEY(PERFORMANCE_HINT_NUM_REQUESTS));
+        if (num_requests != config.end()) { // arrived with the config to LoadNetwork (and thus has higher priority)
+            auto val = PerfHintsConfig::CheckPerformanceHintRequestValue(num_requests->second);
+            if (val > 0)
+                num_streams = std::min(num_streams, val);
+        } else if (engConfig.perfHintsConfig.ovPerfHintNumRequests) { // set through SetConfig to the plugin, 2nd priority
+            num_streams = std::min(num_streams,
+                                   engConfig.perfHintsConfig.ovPerfHintNumRequests);
+        }
+        config[CONFIG_KEY(CPU_THROUGHPUT_STREAMS)] = std::to_string(num_streams);
+    }
+}
+
 InferenceEngine::IExecutableNetworkInternal::Ptr
 Engine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network, const std::map<std::string, std::string> &orig_config) {
     OV_ITT_SCOPED_TASK(itt::domains::MKLDNNPlugin, "Engine::LoadExeNetworkImpl");
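ApplyPerformanceHints translates a performance hint into a concrete stream count only when streams were not set explicitly. A hedged usage sketch of that path, built solely from public calls that also appear in the tests below; model is assumed to be an already-created network (e.g. from ov::Core::read_model), and this snippet is illustrative rather than part of the commit:

#include "openvino/runtime/core.hpp"
#include "openvino/runtime/properties.hpp"

// Assumes 'model' was obtained elsewhere.
ov::Core core;
// No streams set explicitly anywhere, so the plugin is free to translate the
// hint into a concrete CPU_THROUGHPUT_STREAMS value, scaled by the
// ISA-specific memory-bandwidth threshold computed in the helper above.
core.set_property("CPU", ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT));
ov::CompiledModel compiled = core.compile_model(model, "CPU");
int32_t streams = compiled.get_property(ov::num_streams);
uint32_t nireq = compiled.get_property(ov::optimal_number_of_infer_requests);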
@@ -583,77 +667,8 @@ Engine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network, const std
     auto nGraphFunc = clonedNetwork.getFunction();
     TransformationUpToCPUSpecificOpSet(nGraphFunc, enableLPT, enableSnippets, isLegacyAPI());
 
-    // Here the OV perf modes are turned into specific settings (as we need the network for better params selection)
-    const auto& mode = config.find(PluginConfigParams::KEY_PERFORMANCE_HINT);
-    // the mode may have just arrived to the LoadNetwork, or was set with the plugins' SetConfig
-    if (mode != config.end() || !engConfig.perfHintsConfig.ovPerfHint.empty()) {
-        const auto mode_name = (mode != config.end())
-            ? PerfHintsConfig::CheckPerformanceHintValue(mode->second) : engConfig.perfHintsConfig.ovPerfHint;
-        //checking streams (to avoid overriding what user might explicitly set in the incoming config or previously via SetConfig)
-        const auto streams = config.find(PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS);
-        if (streams == config.end() && !streamsSet) {
-            if (mode_name == CONFIG_VALUE(LATENCY)) {
-                config[PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS] = CONFIG_VALUE(CPU_THROUGHPUT_NUMA);
-            } else if (mode_name == CONFIG_VALUE(THROUGHPUT)) {
-                const auto isa = dnnl::get_effective_cpu_isa();
-                float isaSpecificThreshold = 1.0f;
-                switch (isa) {
-                case dnnl::cpu_isa::sse41:
-                    isaSpecificThreshold = 0.5f;
-                    break;
-                case dnnl::cpu_isa::avx2:
-                case dnnl::cpu_isa::avx512_core:
-                    isaSpecificThreshold = 1.0f;
-                    break;
-                case dnnl::cpu_isa::avx512_core_vnni:
-                case dnnl::cpu_isa::avx2_vnni:
-                    isaSpecificThreshold = 2.0f;
-                    break;
-                case dnnl::cpu_isa::avx512_core_amx:
-                    isaSpecificThreshold = 4.0f;
-                    break;
-                default:
-                    isaSpecificThreshold = 1.0f;
-                }
-                // the more "capable" the CPU in general, the more streams we may want to keep to keep it utilized
-                const float memThresholdAssumeLimitedForISA = ov::MemBandwidthPressure::LIMITED / isaSpecificThreshold;
-                const float L2_cache_size = mkldnn::utils::get_cache_size(2 /*level*/, true /*per core*/);
-                ov::MemBandwidthPressure networkToleranceForLowCache = ov::MemBandwidthPressureTolerance(
-                    clonedNetwork.getFunction(),
-                    L2_cache_size, memThresholdAssumeLimitedForISA);
-                // num of phys CPU cores (most aggressive value for #streams)
-                const auto num_cores = getNumberOfCPUCores();
-                // less aggressive
-                const auto num_streams_less_aggressive = num_cores / 2;
-                // default #streams value (most conservative)
-                const auto default_num_streams = IStreamsExecutor::Config::GetDefaultNumStreams();
-                int num_streams = default_num_streams;
-                if (networkToleranceForLowCache.max_mem_tolerance == ov::MemBandwidthPressure::UNKNOWN) {
-                    if ((networkToleranceForLowCache.ratio_compute_convs == ov::MemBandwidthPressure::ALL)
-                        || (networkToleranceForLowCache.ratio_compute_deconvs == ov::MemBandwidthPressure::ALL)) {
-                        // all relevant layers (convs, etc) are compute-limited, the most aggressive val for #streams
-                        num_streams = num_cores;
-                    } // otherwise (no recognized layers) falling back to the default value
-                } else if (networkToleranceForLowCache.max_mem_tolerance > memThresholdAssumeLimitedForISA) {
-                    // network is below the ISA-specific threshold
-                    num_streams = num_cores;
-                } else if (networkToleranceForLowCache.max_mem_tolerance > ov::MemBandwidthPressure::LIMITED) {
-                    // network is below general threshold
-                    num_streams = std::max(default_num_streams, num_streams_less_aggressive);
-                }
-                auto num_requests = config.find(PluginConfigParams::KEY_PERFORMANCE_HINT_NUM_REQUESTS);
-                if (num_requests != config.end()) { // arrived with config to the LoadNetwork (and thus higher pri)
-                    auto val = PerfHintsConfig::CheckPerformanceHintRequestValue(num_requests->second);
-                    if (val > 0)
-                        num_streams = std::min(num_streams, val);
-                } else if (engConfig.perfHintsConfig.ovPerfHintNumRequests) { //set thru SetConfig to the plugin, 2nd priority
-                    num_streams = std::min(num_streams,
-                                           engConfig.perfHintsConfig.ovPerfHintNumRequests);
-                }
-                config[PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS] = std::to_string(num_streams);
-            }
-        }
-    }
+    ApplyPerformanceHints(config, nGraphFunc);
 
     ConvertToCPUSpecificOpset(nGraphFunc);
 
     // update the props after the perf mode translated to configs
@@ -669,7 +684,8 @@ Engine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network, const std
 }
 
 void Engine::SetConfig(const std::map<std::string, std::string> &config) {
-    streamsSet = (config.find(PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS) != config.end());
+    streamsExplicitlySetForEngine = streamsSet(config);
 
     engConfig.readProperties(config);
 }
@@ -47,10 +47,14 @@ private:
     InferenceEngine::Parameter GetConfigLegacy(const std::string& name, const std::map<std::string, InferenceEngine::Parameter>& options) const;
 
+    void ApplyPerformanceHints(std::map<std::string, std::string> &config, const std::shared_ptr<ngraph::Function>& ngraphFunc) const;
+
     Config engConfig;
     NumaNodesWeights weightsSharing;
     MKLDNNExtensionManager::Ptr extensionManager = std::make_shared<MKLDNNExtensionManager>();
-    bool streamsSet = false;
+    /* Explicitly configured streams have higher priority, even over performance hints.
+       So track whether streams were set explicitly (not auto-configured) */
+    bool streamsExplicitlySetForEngine = false;
     const std::string deviceFullName;
 };
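The streamsExplicitlySetForEngine flag is what lets an explicit stream count outrank any hint. A hedged end-to-end sketch of that precedence, using the same assumed model and the public API that the tests below exercise:

ov::Core core;
core.set_property("CPU", ov::num_streams(4));  // SetConfig() flips streamsExplicitlySetForEngine
core.set_property("CPU", ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY));

// ApplyPerformanceHints() returns early: the explicitly requested 4 streams win
// over the LATENCY hint, which would otherwise pick the NUMA-node stream count.
ov::CompiledModel compiled = core.compile_model(model, "CPU");
int32_t streams = compiled.get_property(ov::num_streams);  // expected: 4, per the priority tests below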
@@ -5,6 +5,7 @@
 #include "functional_test_utils/skip_tests_config.hpp"
 #include <base/ov_behavior_test_utils.hpp>
 
+#include "openvino/core/any.hpp"
 #include "openvino/runtime/core.hpp"
 #include "openvino/runtime/compiled_model.hpp"
 #include "openvino/runtime/properties.hpp"
@@ -17,8 +18,7 @@ namespace {
 //
 // Executable Network GetMetric
 //
-class OVClassConfigTestCPU : public ::testing::Test,
-                             public ::testing::WithParamInterface<std::tuple<std::string, std::pair<std::string, ov::Any>>> {
+class OVClassConfigTestCPU : public ::testing::Test {
 public:
     std::shared_ptr<ngraph::Function> model;
     const std::string deviceName = "CPU";
@@ -57,6 +57,62 @@ TEST_F(OVClassConfigTestCPU, smoke_SetROPropertiesThrow) {
     }
 }
 
+TEST_F(OVClassConfigTestCPU, smoke_CheckCoreStreamsHasHigherPriorityThanThroughputHint) {
+    ov::Core ie;
+    int32_t streams = 1; // the THROUGHPUT hint alone would pick a higher number of streams
+    int32_t value;
+
+    OV_ASSERT_NO_THROW(ie.set_property(deviceName, ov::num_streams(streams)));
+    OV_ASSERT_NO_THROW(ie.set_property(deviceName, ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT)));
+
+    ov::CompiledModel compiledModel = ie.compile_model(model, deviceName);
+    ASSERT_NO_THROW(value = compiledModel.get_property(ov::num_streams));
+    ASSERT_EQ(streams, value);
+}
+
+TEST_F(OVClassConfigTestCPU, smoke_CheckCoreStreamsHasHigherPriorityThanLatencyHint) {
+    ov::Core ie;
+    int32_t streams = 4; // the LATENCY hint alone would pick a lower number of streams
+    int32_t value;
+
+    OV_ASSERT_NO_THROW(ie.set_property(deviceName, ov::num_streams(streams)));
+    OV_ASSERT_NO_THROW(ie.set_property(deviceName, ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY)));
+
+    ov::CompiledModel compiledModel = ie.compile_model(model, deviceName);
+    ASSERT_NO_THROW(value = compiledModel.get_property(ov::num_streams));
+    ASSERT_EQ(streams, value);
+}
+
+TEST_F(OVClassConfigTestCPU, smoke_CheckModelStreamsHasHigherPriorityThanLatencyHints) {
+    ov::Core ie;
+    int32_t streams = 4; // the LATENCY hint alone would pick a lower number of streams
+    int32_t value;
+
+    OV_ASSERT_NO_THROW(ie.set_property(deviceName, ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY)));
+
+    ov::AnyMap config;
+    config[ov::num_streams.name()] = streams;
+    ov::CompiledModel compiledModel = ie.compile_model(model, deviceName, config);
+
+    ASSERT_NO_THROW(value = compiledModel.get_property(ov::num_streams));
+    ASSERT_EQ(streams, value);
+}
+
+TEST_F(OVClassConfigTestCPU, smoke_CheckModelStreamsHasHigherPriorityThanThroughputHint) {
+    ov::Core ie;
+    int32_t streams = 1; // the THROUGHPUT hint alone would pick a higher number of streams
+    int32_t value;
+
+    ov::AnyMap config;
+    config[ov::hint::performance_mode.name()] = ov::hint::PerformanceMode::THROUGHPUT;
+    config[ov::num_streams.name()] = streams;
+
+    ov::CompiledModel compiledModel = ie.compile_model(model, deviceName, config);
+
+    ASSERT_NO_THROW(value = compiledModel.get_property(ov::num_streams));
+    ASSERT_EQ(streams, value);
+}
+
 const std::vector<ov::AnyMap> multiDevicePriorityConfigs = {
     {ov::device::priorities(CommonTestUtils::DEVICE_CPU)}};