From 81685c8d212135dd9980da86a18db41fbf6d250a Mon Sep 17 00:00:00 2001
From: Maxim Shevtsov <maxim.y.shevtsov@intel.com>
Date: Wed, 19 Jan 2022 14:05:13 +0300
Subject: [PATCH] Enabling auto batching for the GPU when tput hint is set
 (#9724)

* moving the HETERO logic to the Auto-Batch (WIP), reverting to the ALLOW_AUTO_BATCHING and using that in the GPU remote tests

* shortened the variable names in the ie_core and prevented recursive auto-batching calls by checking for exclusive requests and disabling further auto-batching in the plugin, when HETERO is involved

* checking for the batch-dim presence (this is still WA until the https://github.com/openvinotoolkit/openvino/pull/9559 is merged) - pls see CVS-75317
+clang for the ie_core.cpp

* moving the HETERO logic back to the ie_core.cpp, storing the _so internally for no-batch code-path
---
 src/inference/include/ie/ie_plugin_config.hpp |  4 +
 src/inference/src/ie_core.cpp                 | 99 ++++++++++++-------
 src/plugins/auto_batch/auto_batch.cpp         | 76 +++++++++++++-
 src/plugins/auto_batch/auto_batch.hpp         |  1 +
 .../cldnn_remote_blob_tests.cpp               |  7 +-
 .../gpu_remote_tensor_tests.cpp               |  7 +-
 6 files changed, 152 insertions(+), 42 deletions(-)

diff --git a/src/inference/include/ie/ie_plugin_config.hpp b/src/inference/include/ie/ie_plugin_config.hpp
index 6e9c9afd1c3..8e1dcea702f 100644
--- a/src/inference/include/ie/ie_plugin_config.hpp
+++ b/src/inference/include/ie/ie_plugin_config.hpp
@@ -255,6 +255,10 @@ DECLARE_CONFIG_VALUE(THROUGHPUT);
  * usually this value comes from the actual use-case (e.g. number of video-cameras, or other sources of inputs)
  */
 DECLARE_CONFIG_KEY(PERFORMANCE_HINT_NUM_REQUESTS);
+/**
+ * @brief (Optional) config key that governs Auto-Batching (with YES/NO values, below)
+ */
+DECLARE_CONFIG_KEY(ALLOW_AUTO_BATCHING);
 
 /**
  * @brief generic boolean values
diff --git a/src/inference/src/ie_core.cpp b/src/inference/src/ie_core.cpp
index 8cfa02385f1..d0db8e2b0be 100644
--- a/src/inference/src/ie_core.cpp
+++ b/src/inference/src/ie_core.cpp
@@ -522,43 +522,74 @@ public:
 
     void ApplyAutoBatching(const ie::CNNNetwork& network,
                            std::string& deviceName,
-                           std::map<std::string, std::string>& config_with_batch) {
+                           std::map<std::string, std::string>& config) {
+        std::string deviceNameWithBatchSize, deviceNameWithoutBatch;
         if (deviceName.find("BATCH") != std::string::npos) {
-            // explicitly enabled Auto-Batching e.g. in the tests
+            // Auto-Batching is enabled explicitly, via the device name (e.g. BATCH:GPU(4))
             auto pos = deviceName.find_first_of(":");
-            if (pos != std::string::npos) {
-                auto deviceNameWithBatchSize = deviceName.substr(pos + 1);
-                auto deviceNameWithoutBatch = DeviceIDParser::getBatchDevice(deviceNameWithBatchSize);
-                auto function = network.getFunction();
-                // have to execute the DetectionOutput separately (without batching)
-                // as this layer mix-in the values from the different inputs (batch id)
-                bool bDetectionOutput = false;
-                const std::string detectionOutputOpName = ngraph::op::DetectionOutput::get_type_info_static().name;
-                const std::string resultOpName = ngraph::op::Result::get_type_info_static().name;
-                for (auto&& node : function->get_ops()) {
-                    auto isDetectionOutputParent = [&detectionOutputOpName](decltype(node)& nd) {
-                        for (size_t n = 0; n < nd->get_input_size(); n++) {
-                            if (detectionOutputOpName == nd->get_input_node_ptr(n)->get_type_info().name)
-                                return true;
-                        }
-                        return false;
-                    };
-
-                    if ((detectionOutputOpName == node->get_type_info().name) ||
-                        ((resultOpName == node->get_type_info().name) && isDetectionOutputParent(node))) {
-                        node->get_rt_info()["affinity"] = deviceNameWithoutBatch;
-                        bDetectionOutput = true;
-                    } else {
-                        node->get_rt_info()["affinity"] = "BATCH";
-                    }
-                }
-                if (bDetectionOutput) {
-                    deviceName = "HETERO:BATCH," + deviceNameWithoutBatch;
-                    config_with_batch[CONFIG_KEY(AUTO_BATCH_DEVICE_CONFIG)] = deviceNameWithBatchSize;
-                } else {
-                    deviceName = "BATCH:" + deviceNameWithBatchSize;
-                }
+            if (pos == std::string::npos)
+                return;  // BATCH device is already configured via the config
+            deviceNameWithBatchSize = deviceName.substr(pos + 1);
+            deviceNameWithoutBatch = DeviceIDParser::getBatchDevice(deviceNameWithBatchSize);
+        } else {
+            // check whether the Auto-Batching is disabled explicitly
+            const auto& batch_mode = config.find(CONFIG_KEY(ALLOW_AUTO_BATCHING));
+            if (batch_mode != config.end()) {
+                const auto disabled = batch_mode->second == CONFIG_VALUE(NO);
+                // no need for this config key in the rest of loading
+                config.erase(batch_mode);
+                if (disabled)
+                    return;
             }
+            // check whether the Auto-Batching is applicable: the plugin must list the OPTIMAL_BATCH_SIZE metric
+            auto device = ov::runtime::parseDeviceNameIntoConfig(deviceName);
+            deviceNameWithoutBatch = deviceName;
+            auto d = device._deviceName;
+            std::vector<std::string> metrics = GetCPPPluginByName(d).get_metric(METRIC_KEY(SUPPORTED_METRICS), {});
+            auto it = std::find(metrics.begin(), metrics.end(), METRIC_KEY(OPTIMAL_BATCH_SIZE));
+            if (metrics.end() == it)
+                return;  // OPTIMAL_BATCH_SIZE is not among the supported metrics: deviceName stays untouched
+            // if applicable, the Auto-Batching is implicitly enabled via the performance hints
+            bool bTputInPlg = GetConfig(d, CONFIG_KEY(PERFORMANCE_HINT)).as<std::string>() == CONFIG_VALUE(THROUGHPUT);
+            const auto& mode = config.find(CONFIG_KEY(PERFORMANCE_HINT));
+            bool bTputInLoadCfg = (mode != config.end() && mode->second == CONFIG_VALUE(THROUGHPUT));
+            const auto& excl = config.find(CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS));
+            bool bExclReqsEnabled = (excl != config.end() && excl->second == CONFIG_VALUE(YES));
+            if (bExclReqsEnabled || (!bTputInPlg && !bTputInLoadCfg))
+                return;  // exclusive requests, or no THROUGHPUT hint (neither in the plugin nor in this config)
+        }
+        auto function = network.getFunction();
+        // have to execute the DetectionOutput separately (without batching)
+        // as this layer mixes in the values from the different inputs (batch id)
+        bool bDetectionOutput = false;
+        const std::string detectionOutputOpName = ngraph::op::DetectionOutput::get_type_info_static().name;
+        const std::string resultOpName = ngraph::op::Result::get_type_info_static().name;
+        for (auto&& node : function->get_ops()) {
+            auto isDetectionOutputParent = [&detectionOutputOpName](decltype(node)& nd) {
+                for (size_t n = 0; n < nd->get_input_size(); n++) {
+                    // the code below doesn't need to separate the versions (opsets) of the DetectionOutput
+                    // so type_info name check is enough
+                    // (if a future version doesn't mix the batch, it is expected to be a new op)
+                    if (detectionOutputOpName == nd->get_input_node_ptr(n)->get_type_info().name)
+                        return true;
+                }
+                return false;
+            };
+
+            if ((detectionOutputOpName == node->get_type_info().name) ||
+                ((resultOpName == node->get_type_info().name) && isDetectionOutputParent(node))) {
+                node->get_rt_info()["affinity"] = deviceNameWithoutBatch;
+                bDetectionOutput = true;
+            } else {
+                node->get_rt_info()["affinity"] = "BATCH";
+            }
+        }
+        auto batchConfig = deviceNameWithBatchSize.empty() ? deviceNameWithoutBatch : deviceNameWithBatchSize;
+        if (bDetectionOutput) {
+            deviceName = "HETERO:BATCH," + deviceNameWithoutBatch;
+            config[CONFIG_KEY(AUTO_BATCH_DEVICE_CONFIG)] = batchConfig;
+        } else {
+            deviceName = "BATCH:" + batchConfig;
         }
     }
 
diff --git a/src/plugins/auto_batch/auto_batch.cpp b/src/plugins/auto_batch/auto_batch.cpp
index 20a0b5ee694..a1809984a8a 100644
--- a/src/plugins/auto_batch/auto_batch.cpp
+++ b/src/plugins/auto_batch/auto_batch.cpp
@@ -536,7 +536,7 @@ DeviceInformation AutoBatchInferencePlugin::ParseBatchDevice(const std::string&
     auto closingBracket = d.find_first_of(')', openingBracket);
     auto deviceName = d.substr(0, openingBracket);
 
-    int batch = 1;
+    int batch = 0;
     if (closingBracket != std::string::npos && openingBracket < closingBracket) {
         batch = std::stol(d.substr(openingBracket + 1, closingBracket - 1));
 
@@ -681,6 +681,72 @@ InferenceEngine::IExecutableNetworkInternal::Ptr AutoBatchInferencePlugin::LoadN
     auto metaDevice = ParseMetaDevice(device_batch->second, fullConfig);
     const auto& deviceName = metaDevice.deviceName;
     const auto& deviceConfig = metaDevice.config;
+    auto config_without_autobatch = config, deviceConfigNoAutoBatch = deviceConfig;
+    // avoid recursive auto-batching
+    config_without_autobatch[CONFIG_KEY(ALLOW_AUTO_BATCHING)] = CONFIG_VALUE(NO);
+    deviceConfigNoAutoBatch[CONFIG_KEY(ALLOW_AUTO_BATCHING)] = CONFIG_VALUE(NO);
+
+    auto function = network.getFunction();
+    // check that the auto-batching is applicable in general
+    try {
+        // do not reshape/re-batch originally batched networks and when there are no inputs with the N* layouts
+        // the below code is a placeholder for the WIP (22.1) functionality
+        // that will check the reshaping by the batch is robust (CVS-51744)
+        const InputsDataMap inputInfo = network.getInputsInfo();
+        bool atLeastOneInputIsBatched = false;
+        for (const InputsDataMap::value_type& item : inputInfo) {
+            auto layout = item.second->getTensorDesc().getLayout();
+            if (layout == InferenceEngine::Layout::NC || layout == InferenceEngine::Layout::NCDHW ||
+                layout == InferenceEngine::Layout::NCHW || layout == InferenceEngine::Layout::NHWC ||
+                layout == InferenceEngine::Layout::NDHWC) {
+                if (1 != item.second->getTensorDesc().getDims()[0])  // do not reshape/re-batch batched networks
+                    IE_THROW(NotImplemented) << "Auto-batching does not reshape/re-batch originally batched networks!";
+                else
+                    atLeastOneInputIsBatched = true;
+            }
+        }
+        bool atLeastOneOutputIsBatched = false;
+        const OutputsDataMap outputInfo = network.getOutputsInfo();
+        for (const OutputsDataMap::value_type& item : outputInfo) {
+            auto layout = item.second->getTensorDesc().getLayout();
+            if (layout == InferenceEngine::Layout::NC || layout == InferenceEngine::Layout::NCDHW ||
+                layout == InferenceEngine::Layout::NCHW || layout == InferenceEngine::Layout::NHWC ||
+                layout == InferenceEngine::Layout::NDHWC) {
+                if (1 != item.second->getTensorDesc().getDims()[0])  // do not reshape/re-batch batched networks
+                    IE_THROW(NotImplemented) << "Auto-batching does not reshape/re-batch originally batched networks!";
+                else
+                    atLeastOneOutputIsBatched = true;
+            }
+        }
+        if (!atLeastOneInputIsBatched || !atLeastOneOutputIsBatched)
+            IE_THROW(NotImplemented)
+                << "Auto-batching supports only networks featuring inputs/outputs with the batched layouts !";
+    } catch (...) {
+        // fallback to loading as if no Auto-Batching was involved
+        auto res = GetCore()->LoadNetwork(network, deviceName, deviceConfigNoAutoBatch);
+        _additionalSOPtrs.push_back(res._so);
+        return res._ptr;
+    }
+
+    if (!metaDevice.batchForDevice) {
+        // batch size is not set explicitly via device name e.g. BATCH:GPU(4)
+        // let's query the optimal batch size
+        std::map<std::string, InferenceEngine::Parameter> options;
+        options["MODEL_PTR"] = std::const_pointer_cast<ngraph::Function>(network.getFunction());
+        auto optBatchSize =
+            GetCore()->GetMetric(deviceName, METRIC_KEY(OPTIMAL_BATCH_SIZE), options).as<unsigned int>();
+        // requests hint: the value set for the device, overridden by the one in `config` (if present)
+        auto res = GetCore()->GetConfig(deviceName, CONFIG_KEY(PERFORMANCE_HINT_NUM_REQUESTS)).as<std::string>();
+        unsigned int requests = PerfHintsConfig::CheckPerformanceHintRequestValue(res);
+        const auto& reqs = config.find(CONFIG_KEY(PERFORMANCE_HINT_NUM_REQUESTS));
+        if (reqs != config.end())
+            requests = static_cast<unsigned int>(PerfHintsConfig::CheckPerformanceHintRequestValue(reqs->second));
+        // a non-zero hint caps the queried batch size (but never below 1); zero leaves the queried value as is
+        if (requests)
+            optBatchSize = std::max(1u, std::min(requests, optBatchSize));
+        metaDevice.batchForDevice = optBatchSize;
+    }
+
     const auto perfConfig = fullConfig.find(PluginConfigParams::KEY_PERF_COUNT);
     const auto perfConfigInTargetPlugin =
         GetCore()->GetConfig(deviceName, PluginConfigParams::KEY_PERF_COUNT).as<std::string>() ==
@@ -700,8 +766,8 @@ InferenceEngine::IExecutableNetworkInternal::Ptr AutoBatchInferencePlugin::LoadN
     size_t batch1_footprint = 0;
     if (deviceName.find("GPU") != std::string::npos)
         batch1_footprint = report_footprint(GetCore(), deviceName);
-    auto executableNetworkWithoutBatch = ctx ? GetCore()->LoadNetwork(network, ctx, deviceConfig)
-                                             : GetCore()->LoadNetwork(network, deviceName, deviceConfig);
+    auto executableNetworkWithoutBatch = ctx ? GetCore()->LoadNetwork(network, ctx, deviceConfigNoAutoBatch)
+                                             : GetCore()->LoadNetwork(network, deviceName, deviceConfigNoAutoBatch);
     if (deviceName.find("GPU") != std::string::npos) {
         batch1_footprint = report_footprint(GetCore(), deviceName) - batch1_footprint;
         if (batch1_footprint) {
@@ -738,8 +804,8 @@ InferenceEngine::IExecutableNetworkInternal::Ptr AutoBatchInferencePlugin::LoadN
             }
             clonedNetwork.reshape(shapes);
             executableNetworkWithBatch =
-                ctx ? GetCore()->LoadNetwork(CNNNetwork{clonedNetwork}, ctx, deviceConfig)
-                    : GetCore()->LoadNetwork(CNNNetwork{clonedNetwork}, deviceName, deviceConfig);
+                ctx ? GetCore()->LoadNetwork(CNNNetwork{clonedNetwork}, ctx, deviceConfigNoAutoBatch)
+                    : GetCore()->LoadNetwork(CNNNetwork{clonedNetwork}, deviceName, deviceConfigNoAutoBatch);
         } catch (...) {
             executableNetworkWithBatch = {nullptr, nullptr};
         }
diff --git a/src/plugins/auto_batch/auto_batch.hpp b/src/plugins/auto_batch/auto_batch.hpp
index 808522fd447..a1d4aec347e 100644
--- a/src/plugins/auto_batch/auto_batch.hpp
+++ b/src/plugins/auto_batch/auto_batch.hpp
@@ -168,6 +168,7 @@ protected:
         const InferenceEngine::CNNNetwork& network,
         const std::shared_ptr<InferenceEngine::RemoteContext> context,
         const std::map<std::string, std::string>& config);
+    std::vector<std::shared_ptr<void>> _additionalSOPtrs;
 };
 
 }  // namespace AutoBatchPlugin
diff --git a/src/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp b/src/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp
index 0647f1e7fab..c3f30667002 100644
--- a/src/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp
+++ b/src/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp
@@ -25,6 +25,7 @@ class RemoteBlob_Test : public CommonTestUtils::TestsCommon, public testing::Wit
 protected:
     std::shared_ptr<ngraph::Function> fn_ptr;
     std::string deviceName;
+    std::map<std::string, std::string> config;
 
 public:
     void SetUp() override {
@@ -33,6 +34,7 @@ public:
         auto with_auto_batching = this->GetParam();
         if (with_auto_batching) { // BATCH:GPU
             deviceName = std::string(CommonTestUtils::DEVICE_BATCH) + ":" + deviceName;
+            config = {{CONFIG_KEY(ALLOW_AUTO_BATCHING), CONFIG_VALUE(YES)}};
         }
     }
     static std::string getTestCaseName(const testing::TestParamInfo<bool>& obj) {
@@ -174,7 +176,10 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserContext) {
     // inference using remote blob
     auto ocl_instance = std::make_shared<OpenCL>();
     auto remote_context = make_shared_context(*ie, deviceName, ocl_instance->_context.get());
-    auto exec_net_shared = ie->LoadNetwork(net, remote_context);
+    // since there is no way to enable the Auto-Batching thru the device name when loading with the RemoteContext
+    // (as the device name is deduced from the context, which is the "GPU")
+    // the only-way to test the auto-batching is explicit config with ALLOW_AUTO_BATCHING set to YES
+    auto exec_net_shared = ie->LoadNetwork(net, remote_context, config);
     auto inf_req_shared = exec_net_shared.CreateInferRequest();
     inf_req_shared.SetBlob(net.getInputsInfo().begin()->first, fakeImageData);
 
diff --git a/src/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp b/src/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp
index 2f9ba471094..fb474aae651 100644
--- a/src/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp
+++ b/src/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp
@@ -336,6 +336,8 @@ class OVRemoteTensor_TestsWithContext : public OVRemoteTensor_Test, public testi
 protected:
     std::shared_ptr<ngraph::Function> fn_ptr;
     std::string deviceName;
+    std::map<std::string, std::string> config;
+
 public:
     void SetUp() override {
         fn_ptr = ngraph::builder::subgraph::makeSplitMultiConvConcat();
@@ -343,6 +345,7 @@ public:
         auto with_auto_batching = this->GetParam();
         if (with_auto_batching) { // BATCH:GPU
             deviceName = std::string(CommonTestUtils::DEVICE_BATCH) + ":" + deviceName;
+            config = {{CONFIG_KEY(ALLOW_AUTO_BATCHING), CONFIG_VALUE(YES)}};
         }
     }
     static std::string getTestCaseName(const testing::TestParamInfo<bool>& obj) {
@@ -376,7 +379,7 @@ TEST_P(OVRemoteTensor_TestsWithContext, smoke_canInferOnUserContext) {
     auto ocl_instance = std::make_shared<OpenCL>();
 
     auto remote_context = ov::runtime::intel_gpu::ocl::ClContext(ie, ocl_instance->_context.get());
-    auto exec_net_shared = ie.compile_model(function, remote_context);
+    auto exec_net_shared = ie.compile_model(function, remote_context, config);
     auto inf_req_shared = exec_net_shared.create_infer_request();
     inf_req_shared.set_tensor(input, fakeImageData);
 
@@ -424,7 +427,7 @@ TEST_P(OVRemoteTensor_TestsWithContext, smoke_canInferOnUserContextWithMultipleD
     auto remote_context = ov::runtime::intel_gpu::ocl::ClContext(ie, ocl_instance->_context.get(), 1);
 
     ASSERT_EQ(remote_context.get_device_name(), "GPU.0");
-    auto exec_net_shared = ie.compile_model(function, remote_context);
+    auto exec_net_shared = ie.compile_model(function, remote_context, config);
     auto inf_req_shared = exec_net_shared.create_infer_request();
     inf_req_shared.set_tensor(input, fakeImageData);