experimenting with DG1 on the batch-size selection; also collecting the memory footprint

myshevts 2021-10-07 11:43:15 +03:00
parent e7b743ac33
commit ac21d71321
4 changed files with 8 additions and 12 deletions


@@ -681,13 +681,11 @@ Parameter clDNNEngine::GetMetric(const std::string& name, const std::map<std::st
     } else if (name == METRIC_KEY(OPTIMAL_BATCH)) {
         auto network = options.find("MODEL_ADDRESS")->second.as<InferenceEngine::CNNNetwork const*>();
         auto networkCloned = CloneAndTransformNetwork(*network, _impl->m_config);
-        // i7_1185G7
-        const float L2_cache_size = 6*1024*1024;
+        // DG1
         const float L3_cache_size = 12*1024*1024;
         unsigned int batch = 1;
         ov::MemBandwidthPressure memPressure = ov::MemBandwidthPressureTolerance(
-            networkCloned.getFunction(),
-            L2_cache_size, L3_cache_size);
+            networkCloned.getFunction(), L3_cache_size);
         if (memPressure.max_mem_tolerance > 8*ov::MemBandwidthPressure::LIMITED) {
             batch = 32;
         } else if (memPressure.max_mem_tolerance > 4*ov::MemBandwidthPressure::LIMITED) {
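
The GPU metric now sizes the batch against a single cache budget (the DG1 12 MiB L3 above) and walks a threshold ladder over `max_mem_tolerance`: the more compute-bound the network, the larger the suggested batch. A minimal standalone sketch of that ladder follows; `LIMITED` is a stand-in constant, and the `batch = 16` rung is an assumption, since the hunk is cut off right after the `4*LIMITED` branch.

```cpp
#include <cstdio>

// Stand-in for ov::MemBandwidthPressure::LIMITED; illustrative value only.
constexpr float LIMITED = 1.0f;

// Hypothetical helper mirroring the ladder above: the more compute-bound
// the network (the higher its memory tolerance), the larger the batch.
unsigned int chooseOptimalBatch(float max_mem_tolerance) {
    unsigned int batch = 1;
    if (max_mem_tolerance > 8 * LIMITED) {
        batch = 32;
    } else if (max_mem_tolerance > 4 * LIMITED) {
        batch = 16;  // assumption: the diff truncates before this value
    }
    return batch;
}

int main() {
    std::printf("tolerance 9.0 -> batch %u\n", chooseOptimalBatch(9.0f));
    std::printf("tolerance 0.5 -> batch %u\n", chooseOptimalBatch(0.5f));
    return 0;
}
```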


@@ -1150,11 +1150,12 @@ ExecutableNetwork Core::LoadNetwork(const CNNNetwork& network,
                                     const std::string& deviceNameOrig,
                                     const std::map<std::string, std::string>& config) {
     auto deviceName = deviceNameOrig;
-    if (deviceNameOrig == "GPU") {
+    if (deviceNameOrig.find("GPU") != std::string::npos) {
         std::map<std::string, Parameter> options;
         options["MODEL_ADDRESS"] = &network;
         auto optimalBatchSize =
-            _impl->GetCPPPluginByName(deviceNameOrig).get_metric(METRIC_KEY(OPTIMAL_BATCH), options).as<unsigned int>();
+            _impl->GetCPPPluginByName(DeviceIDParser(deviceName).getDeviceName()).
+                get_metric(METRIC_KEY(OPTIMAL_BATCH), options).as<unsigned int>();
         auto function = network.getFunction();
         bool bDetectionOutput = false;
         for (auto&& node : function->get_ops()) {
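
With the substring check, names like `GPU.1` also take this path, so the plugin lookup has to strip the device ID before querying the registry. The sketch below illustrates the split that `DeviceIDParser(deviceName).getDeviceName()` performs; it is a hypothetical stand-in for illustration, not the real class.

```cpp
#include <iostream>
#include <string>

// "GPU.1" names the GPU plugin plus device index 1, but the plugin
// registry is keyed by the bare plugin name, so the ID must be stripped.
struct ParsedDevice {
    std::string deviceName;  // e.g. "GPU"
    std::string deviceID;    // e.g. "1" (empty if none)
};

ParsedDevice parseDeviceName(const std::string& name) {
    const auto pos = name.find('.');
    if (pos == std::string::npos)
        return {name, ""};
    return {name.substr(0, pos), name.substr(pos + 1)};
}

int main() {
    for (const std::string s : {"GPU", "GPU.1", "CPU"}) {
        const auto parsed = parseDeviceName(s);
        std::cout << s << " -> plugin '" << parsed.deviceName
                  << "', id '" << parsed.deviceID << "'\n";
    }
    return 0;
}
```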


@@ -500,11 +500,9 @@ Engine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network, const std
     // the more "capable" the CPU in general, the more streams we may want to keep to keep it utilized
     const float memThresholdAssumeLimitedForISA = ov::MemBandwidthPressure::LIMITED/isaSpecificThreshold;
     const float L2_cache_size = mkldnn::utils::get_cache_size(2 /*level*/, true /*per core */);
-    const float L3_cache_size = mkldnn::utils::get_cache_size(3, false);
     ov::MemBandwidthPressure networkToleranceForLowCache = ov::MemBandwidthPressureTolerance(
         clonedNetwork.getFunction(),
-        L2_cache_size, L3_cache_size,
-        memThresholdAssumeLimitedForISA);
+        L2_cache_size, memThresholdAssumeLimitedForISA);
     // num of phys CPU cores (most aggressive value for #streams)
     const auto num_cores = getNumberOfCPUCores();
     // less aggressive
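
On the CPU side the heuristic now passes only the per-core L2 size as the cache budget; the shared L3 figure is dropped along with the extra parameter (see the header change in the next hunk). A small sketch of the remaining inputs, with hypothetical numbers in place of the real `mkldnn::utils::get_cache_size` and ISA-detection results:

```cpp
#include <cstdio>

// Inputs to the CPU-side call above; the numbers are assumptions for
// illustration, not values queried from real hardware.
int main() {
    const float LIMITED = 1.0f;               // stand-in for ov::MemBandwidthPressure::LIMITED
    const float isaSpecificThreshold = 4.0f;  // hypothetical: a wider ISA lowers the threshold
    const float memThresholdAssumeLimitedForISA = LIMITED / isaSpecificThreshold;
    const float L2_cache_size = 1.25f * 1024 * 1024;  // hypothetical per-core L2 size

    std::printf("threshold = %g, per-core L2 = %g bytes\n",
                memThresholdAssumeLimitedForISA, L2_cache_size);
    return 0;
}
```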


@@ -23,13 +23,12 @@ struct MemBandwidthPressure {
 MemBandwidthPressure MemBandwidthPressureTolerance(
         const std::shared_ptr<ngraph::Function> nGraphFunc,
-        const float L2_cache_size,
-        const float L3_cache_size,
+        const float cache_size,
         const float memThresholdAssumeLimited = MemBandwidthPressure::LIMITED) {
     int total_convs = 0, mem_limited_convs = 0, compute_convs = 0, total_gemms = 0, mem_limited_gemms = 0,
         total_deconvs = 0, compute_deconvs = 0, mem_limited_deconvs = 0;
     auto memLimitedFactor = [&](int size_data_moved, int datatype_size = 4) -> float {
-        return (L2_cache_size * 1.0f /*util factor, tbd */
+        return (cache_size * 1.0f /*util factor, tbd */
                 / (size_data_moved * datatype_size));
     };
     auto isLowPrecision = [&](ngraph::element::Type type) -> bool {
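
After the signature change the helper measures every layer against a single `cache_size` budget: `factor = cache_size / (size_data_moved * datatype_size)`. A factor well above the threshold means the data a layer moves fits in the chosen cache (compute-bound); a small factor marks the layer memory-bound. A worked example with illustrative numbers:

```cpp
#include <cstdio>

// Same arithmetic as the memLimitedFactor lambda above, lifted out so it
// can be exercised standalone; the inputs below are illustrative.
float memLimitedFactor(float cache_size, int size_data_moved, int datatype_size = 4) {
    return cache_size * 1.0f / (size_data_moved * datatype_size);
}

int main() {
    const float cache_size = 12.0f * 1024 * 1024;  // e.g. the DG1 value from the first hunk
    // A layer moving 256K fp32 elements (1 MiB) -> factor 12: compute-bound.
    std::printf("factor = %g\n", memLimitedFactor(cache_size, 256 * 1024, 4));
    // A layer moving 16M fp32 elements (64 MiB) -> factor ~0.19: memory-bound.
    std::printf("factor = %g\n", memLimitedFactor(cache_size, 16 * 1024 * 1024, 4));
    return 0;
}
```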