Debugging DG1 perf drop (presumably caused by the model not fitting into device memory)

This commit is contained in:
myshevts 2021-12-01 18:13:01 +03:00
parent 5834de7f67
commit b52768c2cc
4 changed files with 27 additions and 3 deletions

View File

@ -303,7 +303,7 @@ InferenceEngine::IInferRequestInternal::Ptr AutoBatchExecutableNetwork::CreateIn
t.first->_inferRequest->CopyInputsIfNeeded();
}
workerRequestPtr->_inferRequest->StartAsync();
std::cout << "BATCH" << std::endl;
// std::cout << "BATCH" << std::endl;
} else if ((status == std::cv_status::timeout) && sz) {
// timeout to collect the batch is over, have to execute the requests in the batch1 mode
auto start = std::chrono::high_resolution_clock::now();
@ -549,15 +549,27 @@ InferenceEngine::IExecutableNetworkInternal::Ptr AutoBatchInferencePlugin::LoadN
const auto perfConfig = fullConfig.find(PluginConfigParams::KEY_PERF_COUNT);
const bool enablePerfCounters = (fullConfig.end() != perfConfig) && (perfConfig->second == PluginConfigParams::YES);
auto report_footprint = [] (std::shared_ptr<ICore> pCore, std::string device, std::string message) -> size_t {
size_t footprint = 0;
const auto stats = pCore->GetMetric(device, GPU_METRIC_KEY(MEMORY_STATISTICS)).as<std::map<std::string, uint64_t>>();
for (auto s : stats)
footprint += s.second;
std::cout << "!!!!!!!!!!!!!! (FOOTPRINT) " << message << " : " << footprint/1024 << " MB" << std::endl;
return footprint;
};
if (deviceName.find("GPU") != std::string::npos)
report_footprint(GetCore(), deviceName, "Before Batch1");
auto executableNetworkWithoutBatch = ctx
? GetCore()->LoadNetwork(network, ctx, deviceConfig)
: GetCore()->LoadNetwork(network, deviceName, deviceConfig);
if (deviceName.find("GPU") != std::string::npos)
report_footprint(GetCore(), deviceName, "After Batch1");
// device settings + auto-batch settings
std::unordered_map<std::string, InferenceEngine::Parameter> networkConfig;
networkConfig.insert(*device_batch);
networkConfig.insert(deviceConfig.begin(), deviceConfig.end());
// TODO: remove this experimental code that does loop rather than use the batch1 footprint only
InferenceEngine::SoExecutableNetworkInternal executableNetworkWithBatch;
do {
try {
@ -582,6 +594,14 @@ InferenceEngine::IExecutableNetworkInternal::Ptr AutoBatchInferencePlugin::LoadN
executableNetworkWithBatch = ctx
? GetCore()->LoadNetwork(CNNNetwork{clonedNetwork}, ctx, deviceConfig)
: GetCore()->LoadNetwork(CNNNetwork{clonedNetwork}, deviceName, deviceConfig);
if (deviceName.find("GPU") != std::string::npos) {
const uint64_t total_mem = GetCore()->GetMetric(deviceName, GPU_METRIC_KEY(DEVICE_TOTAL_MEM_SIZE));
const size_t footprint = report_footprint(GetCore(), deviceName, "After BATCHED");
if (footprint > total_mem) { // WA for inaccurate footprint estimations
std::cout << "!!!! Total on-device mem is " << total_mem << " less than :" << footprint << std::endl;
throw NETWORK_NOT_LOADED;
}
}
} catch (...) {
// reload the network with smaller batch
executableNetworkWithBatch = {nullptr, nullptr};

View File

@ -250,7 +250,7 @@ void Config::UpdateFromMap(const std::map<std::string, std::string>& configMap)
}
} else if (key.compare(PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS) == 0) {
if (val.compare(PluginConfigParams::GPU_THROUGHPUT_AUTO) == 0) {
throughput_streams = 2;
throughput_streams = default_num_streams_for_tput;
} else {
int val_i;
try {

View File

@ -14,6 +14,9 @@
namespace CLDNNPlugin {
// fixme: this value should be deduced from the #command-streamers, and presumably queried from the plugin
const uint32_t default_num_streams_for_tput = 2;
struct Config {
Config(std::string device_id = "0") : device_id(device_id),
throughput_streams(1),

View File

@ -726,6 +726,7 @@ Parameter clDNNEngine::GetMetric(const std::string& name, const std::map<std::st
std::cout << "SELECTED BATCH: " << batch << std::endl;
std::map<std::string, InferenceEngine::Parameter> options_for_max_batch;
options_for_max_batch["MODEL_PTR"] = std::const_pointer_cast<ngraph::Function>(network->getFunction());
options_for_max_batch["GPU_THROUGHPUT_STREAMS"] = static_cast<uint32_t>(default_num_streams_for_tput);
auto max_batch_size = GetMetric(GPU_METRIC_KEY(MAX_BATCH_SIZE), options_for_max_batch).as<unsigned int>();
std::cout << "MAX_BATCH: " << max_batch_size << std::endl;
unsigned int closest = pow(2, floor(log(max_batch_size)/log(2)));