Enabling auto batching for the GPU when tput hint is set (#9724)
* Moving the HETERO logic to the Auto-Batch plugin (WIP); reverting to ALLOW_AUTO_BATCHING and using that in the GPU remote tests
* Shortened the variable names in the ie_core and prevented recursive auto-batching calls by checking for exclusive requests and disabling further auto-batching in the plugin when HETERO is involved
* Checking for the batch-dim presence (this is still a WA until https://github.com/openvinotoolkit/openvino/pull/9559 is merged) - please see CVS-75317; plus clang-format for the ie_core.cpp
* Moving the HETERO logic back to the ie_core.cpp; storing the _so internally for the no-batch code-path
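For context, a minimal usage sketch of the behavior this commit enables (assuming the pre-2.0 InferenceEngine C++ API; the model path is a placeholder and this snippet is not part of the commit): with the THROUGHPUT performance hint the GPU path can now engage auto-batching implicitly, while the virtual BATCH device remains available for explicit control.

#include <ie_core.hpp>
#include <ie_plugin_config.hpp>

int main() {
    InferenceEngine::Core ie;
    // "model.xml" is a placeholder path for any IR model
    InferenceEngine::CNNNetwork network = ie.ReadNetwork("model.xml");

    // Implicit: the throughput hint lets the runtime choose auto-batching for GPU
    auto exeNetHinted = ie.LoadNetwork(network, "GPU",
                                       {{CONFIG_KEY(PERFORMANCE_HINT), CONFIG_VALUE(THROUGHPUT)}});

    // Explicit: the BATCH virtual device; BATCH:GPU(4) fixes the batch to 4,
    // plain BATCH:GPU defers to the OPTIMAL_BATCH_SIZE query added in this commit
    auto exeNetExplicit = ie.LoadNetwork(network, "BATCH:GPU(4)");
    return 0;
}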
@@ -536,7 +536,7 @@ DeviceInformation AutoBatchInferencePlugin::ParseBatchDevice(const std::string&
     auto closingBracket = d.find_first_of(')', openingBracket);
     auto deviceName = d.substr(0, openingBracket);
 
-    int batch = 1;
+    int batch = 0;
     if (closingBracket != std::string::npos && openingBracket < closingBracket) {
         batch = std::stol(d.substr(openingBracket + 1, closingBracket - 1));
 
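An illustrative re-statement of what this parsing boils down to after the default change (a sketch, not the plugin's exact code): a bracketed batch such as GPU(4) is honored, while a bare device name now yields 0, which the LoadNetwork changes below treat as "not specified" and resolve via the OPTIMAL_BATCH_SIZE query.

#include <string>
#include <utility>

// Illustrative sketch only: returns {deviceName, batch};
// batch == 0 now means "no explicit batch requested".
static std::pair<std::string, int> parseBatchDevice(const std::string& d) {
    auto openingBracket = d.find_first_of('(');
    auto closingBracket = d.find_first_of(')', openingBracket);
    auto deviceName = d.substr(0, openingBracket);
    int batch = 0;
    if (closingBracket != std::string::npos && openingBracket < closingBracket)
        batch = std::stoi(d.substr(openingBracket + 1, closingBracket - openingBracket - 1));
    return {deviceName, batch};  // "GPU(4)" -> {"GPU", 4}, "GPU" -> {"GPU", 0}
}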
@@ -681,6 +681,72 @@ InferenceEngine::IExecutableNetworkInternal::Ptr AutoBatchInferencePlugin::LoadN
     auto metaDevice = ParseMetaDevice(device_batch->second, fullConfig);
     const auto& deviceName = metaDevice.deviceName;
     const auto& deviceConfig = metaDevice.config;
+    auto config_without_autobatch = config, deviceConfigNoAutoBatch = deviceConfig;
+    // avoid recursive auto-batching
+    config_without_autobatch[CONFIG_KEY(ALLOW_AUTO_BATCHING)] = CONFIG_VALUE(NO);
+    deviceConfigNoAutoBatch[CONFIG_KEY(ALLOW_AUTO_BATCHING)] = CONFIG_VALUE(NO);
+
+    auto function = network.getFunction();
+    // check that the auto-batching is applicable in general
+    try {
+        // do not reshape/re-batch originally batched networks and when there are no inputs with the N* layouts
+        // the below code is a placeholder for the WIP (22.1) functionality
+        // that will check the reshaping by the batch is robust (CVS-51744)
+        const InputsDataMap inputInfo = network.getInputsInfo();
+        bool atLeastOneInputIsBatched = false;
+        for (const InputsDataMap::value_type& item : inputInfo) {
+            auto layout = item.second->getTensorDesc().getLayout();
+            if (layout == InferenceEngine::Layout::NC || layout == InferenceEngine::Layout::NCDHW ||
+                layout == InferenceEngine::Layout::NCHW || layout == InferenceEngine::Layout::NHWC ||
+                layout == InferenceEngine::Layout::NDHWC) {
+                if (1 != item.second->getTensorDesc().getDims()[0])  // do not reshape/re-batch batched networks
+                    IE_THROW(NotImplemented) << "Auto-batching does not reshape/re-batch originally batched networks!";
+                else
+                    atLeastOneInputIsBatched = true;
+            }
+        }
+        bool atLeastOneOutputIsBatched = false;
+        const OutputsDataMap outputInfo = network.getOutputsInfo();
+        for (const OutputsDataMap::value_type& item : outputInfo) {
+            auto layout = item.second->getTensorDesc().getLayout();
+            if (layout == InferenceEngine::Layout::NC || layout == InferenceEngine::Layout::NCDHW ||
+                layout == InferenceEngine::Layout::NCHW || layout == InferenceEngine::Layout::NHWC ||
+                layout == InferenceEngine::Layout::NDHWC) {
+                if (1 != item.second->getTensorDesc().getDims()[0])  // do not reshape/re-batch batched networks
+                    IE_THROW(NotImplemented) << "Auto-batching does not reshape/re-batch originally batched networks!";
+                else
+                    atLeastOneOutputIsBatched = true;
+            }
+        }
+        if (!atLeastOneInputIsBatched || !atLeastOneOutputIsBatched)
+            IE_THROW(NotImplemented)
+                << "Auto-batching supports only networks featuring inputs/outputs with the batched layouts !";
+    } catch (...) {
+        // fallback to loading as if no Auto-Batching was involved
+        auto res = GetCore()->LoadNetwork(network, deviceName, deviceConfigNoAutoBatch);
+        _additionalSOPtrs.push_back(res._so);
+        return res._ptr;
+    }
+
+    if (!metaDevice.batchForDevice) {
+        unsigned int requests = 0;
+        unsigned int optimalBatchSize = 0;
+        // batch size is not set explicitly via device name e.g. BATCH:GPU(4)
+        // let's query the optimal batch size
+        std::map<std::string, InferenceEngine::Parameter> options;
+        options["MODEL_PTR"] = std::const_pointer_cast<ngraph::Function>(network.getFunction());
+        auto optBatchSize =
+            GetCore()->GetMetric(deviceName, METRIC_KEY(OPTIMAL_BATCH_SIZE), options).as<unsigned int>();
+        auto res = GetCore()->GetConfig(deviceName, CONFIG_KEY(PERFORMANCE_HINT_NUM_REQUESTS)).as<std::string>();
+        requests = PerfHintsConfig::CheckPerformanceHintRequestValue(res);
+        const auto& reqs = config.find(CONFIG_KEY(PERFORMANCE_HINT_NUM_REQUESTS));
+        if (reqs != config.end())
+            requests = static_cast<unsigned int>(PerfHintsConfig::CheckPerformanceHintRequestValue(reqs->second));
+        if (requests)
+            optBatchSize = std::max(1u, std::min(requests, optimalBatchSize));
+        metaDevice.batchForDevice = optBatchSize;
+    }
+
     const auto perfConfig = fullConfig.find(PluginConfigParams::KEY_PERF_COUNT);
     const auto perfConfigInTargetPlugin =
         GetCore()->GetConfig(deviceName, PluginConfigParams::KEY_PERF_COUNT).as<std::string>() ==
@@ -700,8 +766,8 @@ InferenceEngine::IExecutableNetworkInternal::Ptr AutoBatchInferencePlugin::LoadN
     size_t batch1_footprint = 0;
     if (deviceName.find("GPU") != std::string::npos)
         batch1_footprint = report_footprint(GetCore(), deviceName);
-    auto executableNetworkWithoutBatch = ctx ? GetCore()->LoadNetwork(network, ctx, deviceConfig)
-                                             : GetCore()->LoadNetwork(network, deviceName, deviceConfig);
+    auto executableNetworkWithoutBatch = ctx ? GetCore()->LoadNetwork(network, ctx, deviceConfigNoAutoBatch)
+                                             : GetCore()->LoadNetwork(network, deviceName, deviceConfigNoAutoBatch);
     if (deviceName.find("GPU") != std::string::npos) {
         batch1_footprint = report_footprint(GetCore(), deviceName) - batch1_footprint;
         if (batch1_footprint) {
@@ -738,8 +804,8 @@ InferenceEngine::IExecutableNetworkInternal::Ptr AutoBatchInferencePlugin::LoadN
             }
             clonedNetwork.reshape(shapes);
             executableNetworkWithBatch =
-                ctx ? GetCore()->LoadNetwork(CNNNetwork{clonedNetwork}, ctx, deviceConfig)
-                    : GetCore()->LoadNetwork(CNNNetwork{clonedNetwork}, deviceName, deviceConfig);
+                ctx ? GetCore()->LoadNetwork(CNNNetwork{clonedNetwork}, ctx, deviceConfigNoAutoBatch)
+                    : GetCore()->LoadNetwork(CNNNetwork{clonedNetwork}, deviceName, deviceConfigNoAutoBatch);
         } catch (...) {
             executableNetworkWithBatch = {nullptr, nullptr};
         }
@@ -168,6 +168,7 @@ protected:
                      const InferenceEngine::CNNNetwork& network,
                      const std::shared_ptr<InferenceEngine::RemoteContext> context,
                      const std::map<std::string, std::string>& config);
+    std::vector<std::shared_ptr<void>> _additionalSOPtrs;
 };
 
 }  // namespace AutoBatchPlugin