From 81685c8d212135dd9980da86a18db41fbf6d250a Mon Sep 17 00:00:00 2001 From: Maxim Shevtsov Date: Wed, 19 Jan 2022 14:05:13 +0300 Subject: [PATCH] Enabling auto batching for the GPU when tput hint is set (#9724) * moving the HETERO logic to the Auto-Batch (WIP), reverting to the ALLOW_AUTO_BATCHING and using that in the GPU remote tests * shortened the variable names in the ie_core and prevented recursive auto-batching calls by checking for exclusive requests and disabling further auto-batching in the plugin, when HETERO is involved * checking for the batch-dim presence (this is still WA until the https://github.com/openvinotoolkit/openvino/pull/9559 is merged) - pls see CVS-75317 +clang for the ie_core.cpp * moving the HETERO logic back to the ie_core.cpp, storing the _so internally for no-batch code-path --- src/inference/include/ie/ie_plugin_config.hpp | 4 + src/inference/src/ie_core.cpp | 99 ++++++++++++------- src/plugins/auto_batch/auto_batch.cpp | 76 +++++++++++++- src/plugins/auto_batch/auto_batch.hpp | 1 + .../cldnn_remote_blob_tests.cpp | 7 +- .../gpu_remote_tensor_tests.cpp | 7 +- 6 files changed, 152 insertions(+), 42 deletions(-) diff --git a/src/inference/include/ie/ie_plugin_config.hpp b/src/inference/include/ie/ie_plugin_config.hpp index 6e9c9afd1c3..8e1dcea702f 100644 --- a/src/inference/include/ie/ie_plugin_config.hpp +++ b/src/inference/include/ie/ie_plugin_config.hpp @@ -255,6 +255,10 @@ DECLARE_CONFIG_VALUE(THROUGHPUT); * usually this value comes from the actual use-case (e.g. 
number of video-cameras, or other sources of inputs) */ DECLARE_CONFIG_KEY(PERFORMANCE_HINT_NUM_REQUESTS); +/** + * @brief (Optional) config key that governs Auto-Batching (with YES/NO values, below) + */ +DECLARE_CONFIG_KEY(ALLOW_AUTO_BATCHING); /** * @brief generic boolean values diff --git a/src/inference/src/ie_core.cpp b/src/inference/src/ie_core.cpp index 8cfa02385f1..d0db8e2b0be 100644 --- a/src/inference/src/ie_core.cpp +++ b/src/inference/src/ie_core.cpp @@ -522,43 +522,74 @@ public: void ApplyAutoBatching(const ie::CNNNetwork& network, std::string& deviceName, - std::map& config_with_batch) { + std::map& config) { + std::string deviceNameWithBatchSize, deviceNameWithoutBatch; if (deviceName.find("BATCH") != std::string::npos) { - // explicitly enabled Auto-Batching e.g. in the tests + // explicitly enabled Auto-Batching auto pos = deviceName.find_first_of(":"); - if (pos != std::string::npos) { - auto deviceNameWithBatchSize = deviceName.substr(pos + 1); - auto deviceNameWithoutBatch = DeviceIDParser::getBatchDevice(deviceNameWithBatchSize); - auto function = network.getFunction(); - // have to execute the DetectionOutput separately (without batching) - // as this layer mix-in the values from the different inputs (batch id) - bool bDetectionOutput = false; - const std::string detectionOutputOpName = ngraph::op::DetectionOutput::get_type_info_static().name; - const std::string resultOpName = ngraph::op::Result::get_type_info_static().name; - for (auto&& node : function->get_ops()) { - auto isDetectionOutputParent = [&detectionOutputOpName](decltype(node)& nd) { - for (size_t n = 0; n < nd->get_input_size(); n++) { - if (detectionOutputOpName == nd->get_input_node_ptr(n)->get_type_info().name) - return true; - } - return false; - }; - - if ((detectionOutputOpName == node->get_type_info().name) || - ((resultOpName == node->get_type_info().name) && isDetectionOutputParent(node))) { - node->get_rt_info()["affinity"] = deviceNameWithoutBatch; - bDetectionOutput 
= true; - } else { - node->get_rt_info()["affinity"] = "BATCH"; - } - } - if (bDetectionOutput) { - deviceName = "HETERO:BATCH," + deviceNameWithoutBatch; - config_with_batch[CONFIG_KEY(AUTO_BATCH_DEVICE_CONFIG)] = deviceNameWithBatchSize; - } else { - deviceName = "BATCH:" + deviceNameWithBatchSize; - } + if (pos == std::string::npos) + return; // BATCH device is already configured via the config + deviceNameWithBatchSize = deviceName.substr(pos + 1); + deviceNameWithoutBatch = DeviceIDParser::getBatchDevice(deviceNameWithBatchSize); + } else { + // check whether the Auto-Batching is disabled explicitly + const auto& batch_mode = config.find(CONFIG_KEY(ALLOW_AUTO_BATCHING)); + if (batch_mode != config.end()) { + const auto disabled = batch_mode->second == CONFIG_VALUE(NO); + // no need for this config key in the rest of loading + config.erase(batch_mode); + if (disabled) + return; } + // check whether if the Auto-Batching is applicable to the device + auto device = ov::runtime::parseDeviceNameIntoConfig(deviceName); + deviceNameWithoutBatch = deviceName; + auto d = device._deviceName; + std::vector metrics = GetCPPPluginByName(d).get_metric(METRIC_KEY(SUPPORTED_METRICS), {}); + auto it = std::find(metrics.begin(), metrics.end(), METRIC_KEY(OPTIMAL_BATCH_SIZE)); + if (metrics.end() == it) + return; + // if applicable, the Auto-Batching is implicitly enabled via the performance hints + bool bTputInPlg = GetConfig(d, CONFIG_KEY(PERFORMANCE_HINT)).as() == CONFIG_VALUE(THROUGHPUT); + const auto& mode = config.find(CONFIG_KEY(PERFORMANCE_HINT)); + bool bTputInLoadCfg = (mode != config.end() && mode->second == CONFIG_VALUE(THROUGHPUT)); + const auto& excl = config.find(CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS)); + bool bExclReqsEnabled = (excl != config.end() && excl->second == CONFIG_VALUE(YES)); + if (bExclReqsEnabled || (!bTputInPlg && !bTputInLoadCfg)) + return; + } + auto function = network.getFunction(); + // have to execute the DetectionOutput separately (without 
batching) + // as this layer mix-in the values from the different inputs (batch id) + bool bDetectionOutput = false; + const std::string detectionOutputOpName = ngraph::op::DetectionOutput::get_type_info_static().name; + const std::string resultOpName = ngraph::op::Result::get_type_info_static().name; + for (auto&& node : function->get_ops()) { + auto isDetectionOutputParent = [&detectionOutputOpName](decltype(node)& nd) { + for (size_t n = 0; n < nd->get_input_size(); n++) { + // the code below doesn't need to separate the versions (opsets) of the DetectionOutput + // so type_info name check is enough + // (if in a future there will be a new ver that doesn't mix the batch, this will be new op) + if (detectionOutputOpName == nd->get_input_node_ptr(n)->get_type_info().name) + return true; + } + return false; + }; + + if ((detectionOutputOpName == node->get_type_info().name) || + ((resultOpName == node->get_type_info().name) && isDetectionOutputParent(node))) { + node->get_rt_info()["affinity"] = deviceNameWithoutBatch; + bDetectionOutput = true; + } else { + node->get_rt_info()["affinity"] = "BATCH"; + } + } + auto batchConfig = deviceNameWithBatchSize.empty() ? 
deviceNameWithoutBatch : deviceNameWithBatchSize; + if (bDetectionOutput) { + deviceName = "HETERO:BATCH," + deviceNameWithoutBatch; + config[CONFIG_KEY(AUTO_BATCH_DEVICE_CONFIG)] = batchConfig; + } else { + deviceName = "BATCH:" + batchConfig; } } diff --git a/src/plugins/auto_batch/auto_batch.cpp b/src/plugins/auto_batch/auto_batch.cpp index 20a0b5ee694..a1809984a8a 100644 --- a/src/plugins/auto_batch/auto_batch.cpp +++ b/src/plugins/auto_batch/auto_batch.cpp @@ -536,7 +536,7 @@ DeviceInformation AutoBatchInferencePlugin::ParseBatchDevice(const std::string& auto closingBracket = d.find_first_of(')', openingBracket); auto deviceName = d.substr(0, openingBracket); - int batch = 1; + int batch = 0; if (closingBracket != std::string::npos && openingBracket < closingBracket) { batch = std::stol(d.substr(openingBracket + 1, closingBracket - 1)); @@ -681,6 +681,72 @@ InferenceEngine::IExecutableNetworkInternal::Ptr AutoBatchInferencePlugin::LoadN auto metaDevice = ParseMetaDevice(device_batch->second, fullConfig); const auto& deviceName = metaDevice.deviceName; const auto& deviceConfig = metaDevice.config; + auto config_without_autobatch = config, deviceConfigNoAutoBatch = deviceConfig; + // avoid recursive auto-batching + config_without_autobatch[CONFIG_KEY(ALLOW_AUTO_BATCHING)] = CONFIG_VALUE(NO); + deviceConfigNoAutoBatch[CONFIG_KEY(ALLOW_AUTO_BATCHING)] = CONFIG_VALUE(NO); + + auto function = network.getFunction(); + // check that the auto-batching is applicable in general + try { + // do not reshape/re-batch originally batched networks and when there are no inputs with the N* layouts + // the below code is a placeholder for the WIP (22.1) functionality + // that will check the reshaping by the batch is robust (CVS-51744) + const InputsDataMap inputInfo = network.getInputsInfo(); + bool atLeastOneInputIsBatched = false; + for (const InputsDataMap::value_type& item : inputInfo) { + auto layout = item.second->getTensorDesc().getLayout(); + if (layout == 
InferenceEngine::Layout::NC || layout == InferenceEngine::Layout::NCDHW || + layout == InferenceEngine::Layout::NCHW || layout == InferenceEngine::Layout::NHWC || + layout == InferenceEngine::Layout::NDHWC) { + if (1 != item.second->getTensorDesc().getDims()[0]) // do not reshape/re-batch batched networks + IE_THROW(NotImplemented) << "Auto-batching does not reshape/re-batch originally batched networks!"; + else + atLeastOneInputIsBatched = true; + } + } + bool atLeastOneOutputIsBatched = false; + const OutputsDataMap outputInfo = network.getOutputsInfo(); + for (const OutputsDataMap::value_type& item : outputInfo) { + auto layout = item.second->getTensorDesc().getLayout(); + if (layout == InferenceEngine::Layout::NC || layout == InferenceEngine::Layout::NCDHW || + layout == InferenceEngine::Layout::NCHW || layout == InferenceEngine::Layout::NHWC || + layout == InferenceEngine::Layout::NDHWC) { + if (1 != item.second->getTensorDesc().getDims()[0]) // do not reshape/re-batch batched networks + IE_THROW(NotImplemented) << "Auto-batching does not reshape/re-batch originally batched networks!"; + else + atLeastOneOutputIsBatched = true; + } + } + if (!atLeastOneInputIsBatched || !atLeastOneOutputIsBatched) + IE_THROW(NotImplemented) + << "Auto-batching supports only networks featuring inputs/outputs with the batched layouts !"; + } catch (...) { + // fallback to loading as if no Auto-Batching was involved + auto res = GetCore()->LoadNetwork(network, deviceName, deviceConfigNoAutoBatch); + _additionalSOPtrs.push_back(res._so); + return res._ptr; + } + + if (!metaDevice.batchForDevice) { + unsigned int requests = 0; + unsigned int optimalBatchSize = 0; + // batch size is not set explicitly via device name e.g. 
BATCH:GPU(4) + // let's query the optimal batch size + std::map options; + options["MODEL_PTR"] = std::const_pointer_cast(network.getFunction()); + auto optBatchSize = + GetCore()->GetMetric(deviceName, METRIC_KEY(OPTIMAL_BATCH_SIZE), options).as(); + auto res = GetCore()->GetConfig(deviceName, CONFIG_KEY(PERFORMANCE_HINT_NUM_REQUESTS)).as(); + requests = PerfHintsConfig::CheckPerformanceHintRequestValue(res); + const auto& reqs = config.find(CONFIG_KEY(PERFORMANCE_HINT_NUM_REQUESTS)); + if (reqs != config.end()) + requests = static_cast(PerfHintsConfig::CheckPerformanceHintRequestValue(reqs->second)); + if (requests) + optBatchSize = std::max(1u, std::min(requests, optimalBatchSize)); + metaDevice.batchForDevice = optBatchSize; + } + const auto perfConfig = fullConfig.find(PluginConfigParams::KEY_PERF_COUNT); const auto perfConfigInTargetPlugin = GetCore()->GetConfig(deviceName, PluginConfigParams::KEY_PERF_COUNT).as() == @@ -700,8 +766,8 @@ InferenceEngine::IExecutableNetworkInternal::Ptr AutoBatchInferencePlugin::LoadN size_t batch1_footprint = 0; if (deviceName.find("GPU") != std::string::npos) batch1_footprint = report_footprint(GetCore(), deviceName); - auto executableNetworkWithoutBatch = ctx ? GetCore()->LoadNetwork(network, ctx, deviceConfig) - : GetCore()->LoadNetwork(network, deviceName, deviceConfig); + auto executableNetworkWithoutBatch = ctx ? GetCore()->LoadNetwork(network, ctx, deviceConfigNoAutoBatch) + : GetCore()->LoadNetwork(network, deviceName, deviceConfigNoAutoBatch); if (deviceName.find("GPU") != std::string::npos) { batch1_footprint = report_footprint(GetCore(), deviceName) - batch1_footprint; if (batch1_footprint) { @@ -738,8 +804,8 @@ InferenceEngine::IExecutableNetworkInternal::Ptr AutoBatchInferencePlugin::LoadN } clonedNetwork.reshape(shapes); executableNetworkWithBatch = - ctx ? GetCore()->LoadNetwork(CNNNetwork{clonedNetwork}, ctx, deviceConfig) - : GetCore()->LoadNetwork(CNNNetwork{clonedNetwork}, deviceName, deviceConfig); + ctx ? 
GetCore()->LoadNetwork(CNNNetwork{clonedNetwork}, ctx, deviceConfigNoAutoBatch) + : GetCore()->LoadNetwork(CNNNetwork{clonedNetwork}, deviceName, deviceConfigNoAutoBatch); } catch (...) { executableNetworkWithBatch = {nullptr, nullptr}; } diff --git a/src/plugins/auto_batch/auto_batch.hpp b/src/plugins/auto_batch/auto_batch.hpp index 808522fd447..a1d4aec347e 100644 --- a/src/plugins/auto_batch/auto_batch.hpp +++ b/src/plugins/auto_batch/auto_batch.hpp @@ -168,6 +168,7 @@ protected: const InferenceEngine::CNNNetwork& network, const std::shared_ptr context, const std::map& config); + std::vector> _additionalSOPtrs; }; } // namespace AutoBatchPlugin diff --git a/src/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp b/src/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp index 0647f1e7fab..c3f30667002 100644 --- a/src/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp +++ b/src/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp @@ -25,6 +25,7 @@ class RemoteBlob_Test : public CommonTestUtils::TestsCommon, public testing::Wit protected: std::shared_ptr fn_ptr; std::string deviceName; + std::map config; public: void SetUp() override { @@ -33,6 +34,7 @@ public: auto with_auto_batching = this->GetParam(); if (with_auto_batching) { // BATCH:GPU deviceName = std::string(CommonTestUtils::DEVICE_BATCH) + ":" + deviceName; + config = {{CONFIG_KEY(ALLOW_AUTO_BATCHING), CONFIG_VALUE(YES)}}; } } static std::string getTestCaseName(const testing::TestParamInfo& obj) { @@ -174,7 +176,10 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserContext) { // inference using remote blob auto ocl_instance = std::make_shared(); auto remote_context = make_shared_context(*ie, deviceName, ocl_instance->_context.get()); - auto exec_net_shared = ie->LoadNetwork(net, remote_context); + // since there is no way to enable the Auto-Batching thru the device name when loading with the RemoteContext + // (as the 
device name is deduced from the context, which is the "GPU") + // the only-way to test the auto-batching is explicit config with ALLOW_AUTO_BATCHING set to YES + auto exec_net_shared = ie->LoadNetwork(net, remote_context, config); auto inf_req_shared = exec_net_shared.CreateInferRequest(); inf_req_shared.SetBlob(net.getInputsInfo().begin()->first, fakeImageData); diff --git a/src/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp b/src/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp index 2f9ba471094..fb474aae651 100644 --- a/src/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp +++ b/src/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp @@ -336,6 +336,8 @@ class OVRemoteTensor_TestsWithContext : public OVRemoteTensor_Test, public testi protected: std::shared_ptr fn_ptr; std::string deviceName; + std::map config; + public: void SetUp() override { fn_ptr = ngraph::builder::subgraph::makeSplitMultiConvConcat(); @@ -343,6 +345,7 @@ public: auto with_auto_batching = this->GetParam(); if (with_auto_batching) { // BATCH:GPU deviceName = std::string(CommonTestUtils::DEVICE_BATCH) + ":" + deviceName; + config = {{CONFIG_KEY(ALLOW_AUTO_BATCHING), CONFIG_VALUE(YES)}}; } } static std::string getTestCaseName(const testing::TestParamInfo& obj) { @@ -376,7 +379,7 @@ TEST_P(OVRemoteTensor_TestsWithContext, smoke_canInferOnUserContext) { auto ocl_instance = std::make_shared(); auto remote_context = ov::runtime::intel_gpu::ocl::ClContext(ie, ocl_instance->_context.get()); - auto exec_net_shared = ie.compile_model(function, remote_context); + auto exec_net_shared = ie.compile_model(function, remote_context, config); auto inf_req_shared = exec_net_shared.create_infer_request(); inf_req_shared.set_tensor(input, fakeImageData); @@ -424,7 +427,7 @@ TEST_P(OVRemoteTensor_TestsWithContext, smoke_canInferOnUserContextWithMultipleD auto remote_context = ov::runtime::intel_gpu::ocl::ClContext(ie, 
ocl_instance->_context.get(), 1); ASSERT_EQ(remote_context.get_device_name(), "GPU.0"); - auto exec_net_shared = ie.compile_model(function, remote_context); + auto exec_net_shared = ie.compile_model(function, remote_context, config); auto inf_req_shared = exec_net_shared.create_infer_request(); inf_req_shared.set_tensor(input, fakeImageData);