experimenting with DG1 on the batch-size selection; also collecting the memory footprint

myshevts 2021-10-07 11:43:15 +03:00
parent e7b743ac33
commit ac21d71321
4 changed files with 8 additions and 12 deletions


@@ -681,13 +681,11 @@ Parameter clDNNEngine::GetMetric(const std::string& name, const std::map<std::st
     } else if (name == METRIC_KEY(OPTIMAL_BATCH)) {
         auto network = options.find("MODEL_ADDRESS")->second.as<InferenceEngine::CNNNetwork const*>();
         auto networkCloned = CloneAndTransformNetwork(*network, _impl->m_config);
-        // i7_1185G7
-        const float L2_cache_size = 6*1024*1024;
+        // DG1
         const float L3_cache_size = 12*1024*1024;
         unsigned int batch = 1;
         ov::MemBandwidthPressure memPressure = ov::MemBandwidthPressureTolerance(
-            networkCloned.getFunction(),
-            L2_cache_size, L3_cache_size);
+            networkCloned.getFunction(), L3_cache_size);
         if (memPressure.max_mem_tolerance > 8*ov::MemBandwidthPressure::LIMITED) {
             batch = 32;
         } else if (memPressure.max_mem_tolerance > 4*ov::MemBandwidthPressure::LIMITED) {
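
The GPU metric now sizes the batch against a single cache budget (the DG1 12 MiB L3 above) and walks a threshold ladder over `max_mem_tolerance`: the more compute-bound the network, the larger the suggested batch. A minimal standalone sketch of that ladder follows; `LIMITED` is a stand-in constant, and the `batch = 16` rung is an assumption, since the hunk is cut off right after the `4*LIMITED` branch.

```cpp
#include <cstdio>

// Stand-in for ov::MemBandwidthPressure::LIMITED; illustrative value only.
constexpr float LIMITED = 1.0f;

// Hypothetical helper mirroring the ladder above: the more compute-bound
// the network (the higher its memory tolerance), the larger the batch.
unsigned int chooseOptimalBatch(float max_mem_tolerance) {
    unsigned int batch = 1;
    if (max_mem_tolerance > 8 * LIMITED) {
        batch = 32;
    } else if (max_mem_tolerance > 4 * LIMITED) {
        batch = 16;  // assumption: the diff truncates before this value
    }
    return batch;
}

int main() {
    std::printf("tolerance 9.0 -> batch %u\n", chooseOptimalBatch(9.0f));
    std::printf("tolerance 0.5 -> batch %u\n", chooseOptimalBatch(0.5f));
    return 0;
}
```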


@@ -1150,11 +1150,12 @@ ExecutableNetwork Core::LoadNetwork(const CNNNetwork& network,
                                     const std::string& deviceNameOrig,
                                     const std::map<std::string, std::string>& config) {
     auto deviceName = deviceNameOrig;
-    if (deviceNameOrig == "GPU") {
+    if (deviceNameOrig.find("GPU") != std::string::npos) {
         std::map<std::string, Parameter> options;
         options["MODEL_ADDRESS"] = &network;
         auto optimalBatchSize =
-            _impl->GetCPPPluginByName(deviceNameOrig).get_metric(METRIC_KEY(OPTIMAL_BATCH), options).as<unsigned int>();
+            _impl->GetCPPPluginByName(DeviceIDParser(deviceName).getDeviceName()).
+                get_metric(METRIC_KEY(OPTIMAL_BATCH), options).as<unsigned int>();
         auto function = network.getFunction();
         bool bDetectionOutput = false;
         for (auto&& node : function->get_ops()) {
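
With the substring check, names like `GPU.1` also take this path, so the plugin lookup has to strip the device ID before querying the registry. The sketch below illustrates the split that `DeviceIDParser(deviceName).getDeviceName()` performs; it is a hypothetical stand-in for illustration, not the real class.

```cpp
#include <iostream>
#include <string>

// "GPU.1" names the GPU plugin plus device index 1, but the plugin
// registry is keyed by the bare plugin name, so the ID must be stripped.
struct ParsedDevice {
    std::string deviceName;  // e.g. "GPU"
    std::string deviceID;    // e.g. "1" (empty if none)
};

ParsedDevice parseDeviceName(const std::string& name) {
    const auto pos = name.find('.');
    if (pos == std::string::npos)
        return {name, ""};
    return {name.substr(0, pos), name.substr(pos + 1)};
}

int main() {
    for (const std::string s : {"GPU", "GPU.1", "CPU"}) {
        const auto parsed = parseDeviceName(s);
        std::cout << s << " -> plugin '" << parsed.deviceName
                  << "', id '" << parsed.deviceID << "'\n";
    }
    return 0;
}
```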


@@ -500,11 +500,9 @@ Engine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network, const std
     // the more "capable" the CPU in general, the more streams we may want to keep to keep it utilized
     const float memThresholdAssumeLimitedForISA = ov::MemBandwidthPressure::LIMITED/isaSpecificThreshold;
     const float L2_cache_size = mkldnn::utils::get_cache_size(2 /*level*/, true /*per core */);
-    const float L3_cache_size = mkldnn::utils::get_cache_size(3, false);
     ov::MemBandwidthPressure networkToleranceForLowCache = ov::MemBandwidthPressureTolerance(
         clonedNetwork.getFunction(),
-        L2_cache_size, L3_cache_size,
-        memThresholdAssumeLimitedForISA);
+        L2_cache_size, memThresholdAssumeLimitedForISA);
     // num of phys CPU cores (most aggressive value for #streams)
     const auto num_cores = getNumberOfCPUCores();
     // less aggressive
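
On the CPU side the heuristic now passes only the per-core L2 size as the cache budget; the shared L3 figure is dropped along with the extra parameter (see the header change in the next hunk). A small sketch of the remaining inputs, with hypothetical numbers in place of the real `mkldnn::utils::get_cache_size` and ISA-detection results:

```cpp
#include <cstdio>

// Inputs to the CPU-side call above; the numbers are assumptions for
// illustration, not values queried from real hardware.
int main() {
    const float LIMITED = 1.0f;               // stand-in for ov::MemBandwidthPressure::LIMITED
    const float isaSpecificThreshold = 4.0f;  // hypothetical: a wider ISA lowers the threshold
    const float memThresholdAssumeLimitedForISA = LIMITED / isaSpecificThreshold;
    const float L2_cache_size = 1.25f * 1024 * 1024;  // hypothetical per-core L2 size

    std::printf("threshold = %g, per-core L2 = %g bytes\n",
                memThresholdAssumeLimitedForISA, L2_cache_size);
    return 0;
}
```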


@@ -23,13 +23,12 @@ struct MemBandwidthPressure {
 MemBandwidthPressure MemBandwidthPressureTolerance(
         const std::shared_ptr<ngraph::Function> nGraphFunc,
-        const float L2_cache_size,
-        const float L3_cache_size,
+        const float cache_size,
         const float memThresholdAssumeLimited = MemBandwidthPressure::LIMITED) {
     int total_convs = 0, mem_limited_convs = 0, compute_convs = 0, total_gemms = 0, mem_limited_gemms = 0,
         total_deconvs = 0, compute_deconvs = 0, mem_limited_deconvs = 0;
     auto memLimitedFactor = [&](int size_data_moved, int datatype_size = 4) -> float {
-        return (L2_cache_size * 1.0f /*util factor, tbd */
+        return (cache_size * 1.0f /*util factor, tbd */
                 / (size_data_moved * datatype_size));
     };
     auto isLowPrecision = [&](ngraph::element::Type type) -> bool {
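
After the signature change the helper measures every layer against a single `cache_size` budget: `factor = cache_size / (size_data_moved * datatype_size)`. A factor well above the threshold means the data a layer moves fits in the chosen cache (compute-bound); a small factor marks the layer memory-bound. A worked example with illustrative numbers:

```cpp
#include <cstdio>

// Same arithmetic as the memLimitedFactor lambda above, lifted out so it
// can be exercised standalone; the inputs below are illustrative.
float memLimitedFactor(float cache_size, int size_data_moved, int datatype_size = 4) {
    return cache_size * 1.0f / (size_data_moved * datatype_size);
}

int main() {
    const float cache_size = 12.0f * 1024 * 1024;  // e.g. the DG1 value from the first hunk
    // A layer moving 256K fp32 elements (1 MiB) -> factor 12: compute-bound.
    std::printf("factor = %g\n", memLimitedFactor(cache_size, 256 * 1024, 4));
    // A layer moving 16M fp32 elements (64 MiB) -> factor ~0.19: memory-bound.
    std::printf("factor = %g\n", memLimitedFactor(cache_size, 16 * 1024 * 1024, 4));
    return 0;
}
```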