// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

///////////////////////////////////////////////////////////////////////////////////////////////////
#include "auto_batch.hpp"

#include <atomic>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>

#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp"
#include "dimension_tracker.hpp"
#include "ie_icore.hpp"
#include "ie_ngraph_utils.hpp"
#include "ie_performance_hints.hpp"
#include "openvino/pass/manager.hpp"
#include "openvino/runtime/intel_gpu/properties.hpp"
#include "transformations/common_optimizations/dimension_tracking.hpp"
#include "transformations/init_node_info.hpp"
#include "transformations/utils/utils.hpp"

namespace AutoBatchPlugin {
using namespace InferenceEngine;

std::vector<std::string> supported_configKeys = {CONFIG_KEY(AUTO_BATCH_DEVICE_CONFIG),
                                                 CONFIG_KEY(AUTO_BATCH_TIMEOUT),
                                                 CONFIG_KEY(CACHE_DIR)};

template <Precision::ePrecision precision>
Blob::Ptr create_shared_blob_on_top_of_batched_blob(Blob::Ptr batched_blob,
                                                    std::string name,
                                                    const std::set<std::string>& batched_names,
                                                    size_t batch_id,
                                                    size_t batch_num) {
    typedef typename PrecisionTrait<precision>::value_type TYPE;
    typedef typename std::add_pointer<TYPE>::type TYPEPTR;
    auto ptr = batched_blob->buffer().as<TYPEPTR>();
    auto sizePerBatch = batched_blob->size() / batch_num;
    SizeVector dims = batched_blob->getTensorDesc().getDims();
    // for performance reasons (copy avoidance) the current impl of the auto-batching supports only batching by 0th dim
    if (batched_names.count(name)) {
        dims[0] = 1;
        return make_shared_blob<TYPE>({precision, dims, batched_blob->getTensorDesc().getLayout()},
                                      ptr + sizePerBatch * batch_id,
                                      sizePerBatch);
    } else {
        // same blob for all requests (e.g. constants)
        return make_shared_blob<TYPE>({precision, dims, batched_blob->getTensorDesc().getLayout()}, ptr);
    }
}

// ------------------------------AutoBatchInferRequest----------------------------
AutoBatchInferRequest::AutoBatchInferRequest(const std::vector<std::shared_ptr<const ov::Node>>& inputs,
                                             const std::vector<std::shared_ptr<const ov::Node>>& outputs,
                                             AutoBatchExecutableNetwork::WorkerInferRequest& workerRequest,
                                             int batch_id,
                                             int num_batch,
                                             const std::set<std::string>& batchedInputs,
                                             const std::set<std::string>& batchedOutputs)
    : IInferRequestInternal(inputs, outputs),
      _myBatchedRequestWrapper(workerRequest),
      _batchId(batch_id),
      _batchSize(num_batch) {
    ShareBlobsWithBatchRequest(batchedInputs, batchedOutputs);
}

AutoBatchInferRequest::AutoBatchInferRequest(const InputsDataMap& networkInputs,
                                             const OutputsDataMap& networkOutputs,
                                             AutoBatchExecutableNetwork::WorkerInferRequest& workerRequest,
                                             int batch_id,
                                             int num_batch,
                                             const std::set<std::string>& batchedInputs,
                                             const std::set<std::string>& batchedOutputs)
    : IInferRequestInternal(networkInputs, networkOutputs),
      _myBatchedRequestWrapper(workerRequest),
      _batchId(batch_id),
      _batchSize(num_batch) {
    ShareBlobsWithBatchRequest(batchedInputs, batchedOutputs);
}
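// Creates this request's input/output blobs as zero-copy views into the worker's batched blobs
// (one slice per batch_id), so that a single execution of the batched request consumes and
// produces the data of every individual request in place, without extra copies.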
void AutoBatchInferRequest::ShareBlobsWithBatchRequest(const std::set<std::string>& batchedInputs,
                                                       const std::set<std::string>& batchedOutputs) {
    // Allocate all input blobs
    for (const auto& it : _networkInputs) {
        auto blob = _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first);
        Blob::Ptr res;
        switch (it.second->getTensorDesc().getPrecision()) {
        case InferenceEngine::Precision::FP32:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::FP32>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedInputs, _batchId, _batchSize);
            break;
        case InferenceEngine::Precision::I32:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I32>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedInputs, _batchId, _batchSize);
            break;
        case InferenceEngine::Precision::I8:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I8>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedInputs, _batchId, _batchSize);
            break;
        case InferenceEngine::Precision::I16:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I16>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedInputs, _batchId, _batchSize);
            break;
        case InferenceEngine::Precision::U16:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::U16>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedInputs, _batchId, _batchSize);
            break;
        case InferenceEngine::Precision::U32:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::U32>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedInputs, _batchId, _batchSize);
            break;
        case InferenceEngine::Precision::FP64:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::FP64>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedInputs, _batchId, _batchSize);
            break;
        case InferenceEngine::Precision::FP16:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::FP16>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedInputs, _batchId, _batchSize);
            break;
        case InferenceEngine::Precision::BF16:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::BF16>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedInputs, _batchId, _batchSize);
            break;
        case InferenceEngine::Precision::U64:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::U64>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedInputs, _batchId, _batchSize);
            break;
        case InferenceEngine::Precision::I64:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I64>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedInputs, _batchId, _batchSize);
            break;
        case InferenceEngine::Precision::U8:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::U8>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedInputs, _batchId, _batchSize);
            break;
        case InferenceEngine::Precision::BOOL:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::BOOL>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedInputs, _batchId, _batchSize);
            break;
        default:
            IE_THROW() << "Unsupported input precision " << it.second->getTensorDesc().getPrecision();
        }
        _inputs[it.first] = res;
    }
    // Allocate all output blobs
    for (const auto& it : _networkOutputs) {
        auto blob = _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first);
        Blob::Ptr res;
        switch (it.second->getTensorDesc().getPrecision()) {
        case InferenceEngine::Precision::FP32:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::FP32>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedOutputs, _batchId, _batchSize);
            break;
        case InferenceEngine::Precision::I32:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I32>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedOutputs, _batchId, _batchSize);
            break;
        case InferenceEngine::Precision::I8:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I8>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedOutputs, _batchId, _batchSize);
            break;
        case InferenceEngine::Precision::I16:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I16>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedOutputs, _batchId, _batchSize);
            break;
        case InferenceEngine::Precision::U16:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::U16>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedOutputs, _batchId, _batchSize);
            break;
        case InferenceEngine::Precision::U32:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::U32>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedOutputs, _batchId, _batchSize);
            break;
        case InferenceEngine::Precision::FP64:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::FP64>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedOutputs, _batchId, _batchSize);
            break;
        case InferenceEngine::Precision::FP16:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::FP16>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedOutputs, _batchId, _batchSize);
            break;
        case InferenceEngine::Precision::BF16:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::BF16>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedOutputs, _batchId, _batchSize);
            break;
        case InferenceEngine::Precision::U64:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::U64>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedOutputs, _batchId, _batchSize);
            break;
        case InferenceEngine::Precision::I64:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I64>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedOutputs, _batchId, _batchSize);
            break;
        case InferenceEngine::Precision::U8:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::U8>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedOutputs, _batchId, _batchSize);
            break;
        case InferenceEngine::Precision::BOOL:
            res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::BOOL>(
                _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
                it.first, batchedOutputs, _batchId, _batchSize);
            break;
        default:
            IE_THROW(NotImplemented) << "Unsupported output precision " << it.second->getTensorDesc().getPrecision();
        }
        _outputs[it.first] = res;
    }
}
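// Redirects this request's blobs to the independent (batch-1) request; used by the fallback
// path when the batch-collection timeout expires and the requests are executed individually.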
void AutoBatchInferRequest::SetBlobsToAnotherRequest(SoIInferRequestInternal& req) {
    for (const auto& it : _networkInputs) {
        auto& name = it.first;
        // this request is already in BUSY state, so using the internal functions safely
        auto blob = GetBlob(name);
        if (req->GetBlob(name) != blob)
            req->SetBlob(name, blob);
    }
    for (const auto& it : _networkOutputs) {
        auto& name = it.first;
        // this request is already in BUSY state, so using the internal functions safely
        auto blob = GetBlob(name);
        if (req->GetBlob(name) != blob)
            req->SetBlob(name, blob);
    }
}

void AutoBatchInferRequest::CopyInputsIfNeeded() {
    for (const auto& it : _networkInputs) {
        auto& name = it.first;
        // this request is already in BUSY state, so using the internal functions safely
        CopyBlobIfNeeded(GetBlob(name), _myBatchedRequestWrapper._inferRequestBatched->GetBlob(name), true);
    }
}
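// Copies data between a request's blob and the batched blob only when they do not already
// alias the same memory (e.g. when the user has set a custom blob); the offset selects this
// request's slice (_batchId) within the batched buffer.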
void AutoBatchInferRequest::CopyBlobIfNeeded(InferenceEngine::Blob::CPtr src,
                                             InferenceEngine::Blob::Ptr dst,
                                             bool bInput) {
    auto bufferDst = dst->buffer();
    auto ptrDst = bufferDst.as<char*>();
    auto bufferSrc = src->cbuffer();
    auto ptrSrc = bufferSrc.as<const char*>();
    ptrdiff_t szDst = dst->byteSize();
    ptrdiff_t szSrc = src->byteSize();
    if (bInput) {
        ptrdiff_t offset = szSrc != szDst ? _batchId * szDst / _batchSize : 0;
        if ((ptrDst + offset) == ptrSrc)
            return;
        else
            memcpy(ptrDst + offset, ptrSrc, szSrc);
    } else {
        ptrdiff_t offset = szSrc != szDst ? _batchId * szSrc / _batchSize : 0;
        if ((ptrSrc + offset) == ptrDst)
            return;
        else
            memcpy(ptrDst, ptrSrc + offset, szDst);
    }
}

void AutoBatchInferRequest::CopyOutputsIfNeeded() {
    for (const auto& it : _networkOutputs) {
        auto& name = it.first;
        // this request is already in BUSY state, so using the internal functions safely
        CopyBlobIfNeeded(_myBatchedRequestWrapper._inferRequestBatched->GetBlob(name), GetBlob(name), false);
    }
}
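// The async wrapper's first pipeline stage pushes the task to the worker's queue and wakes the
// worker once a full batch has been collected; the second stage re-throws any stored exception
// and copies the outputs back only if the batched (rather than the fallback) execution was used.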
AutoBatchAsyncInferRequest::AutoBatchAsyncInferRequest(
    const AutoBatchInferRequest::Ptr& inferRequest,
    InferenceEngine::SoIInferRequestInternal& inferRequestWithoutBatch,
    const ITaskExecutor::Ptr& callbackExecutor)
    : AsyncInferRequestThreadSafeDefault(inferRequest, nullptr, callbackExecutor),
      _inferRequestWithoutBatch(inferRequestWithoutBatch),
      _inferRequest{inferRequest} {
    // this executor starts the inference while the task (checking the result) is passed to the next stage
    struct ThisRequestExecutor : public ITaskExecutor {
        explicit ThisRequestExecutor(AutoBatchAsyncInferRequest* _this_) : _this{_this_} {}
        void run(Task task) override {
            auto& workerInferRequest = _this->_inferRequest->_myBatchedRequestWrapper;
            std::pair<AutoBatchAsyncInferRequest*, InferenceEngine::Task> t;
            t.first = _this;
            t.second = std::move(task);
            workerInferRequest._tasks.push(t);
            // it is ok to call size() here as the queue only grows (and the bulk removal happens under the mutex)
            const int sz = static_cast<int>(workerInferRequest._tasks.size());
            if (sz == workerInferRequest._batchSize) {
                workerInferRequest._cond.notify_one();
            }
        }
        AutoBatchAsyncInferRequest* _this = nullptr;
    };
    _pipeline = {{/*TaskExecutor*/ std::make_shared<ThisRequestExecutor>(this), /*task*/ [this] {
                      if (this->_inferRequest->_exceptionPtr)  // if the exception happened in the batch1 fallback
                          std::rethrow_exception(this->_inferRequest->_exceptionPtr);
                      auto& batchReq = this->_inferRequest->_myBatchedRequestWrapper;
                      if (batchReq._exceptionPtr)  // when the batchN execution failed
                          std::rethrow_exception(batchReq._exceptionPtr);
                      // in the case of non-batched execution the blobs were set explicitly
                      if (AutoBatchInferRequest::eExecutionFlavor::BATCH_EXECUTED ==
                          this->_inferRequest->_wasBatchedRequestUsed)
                          this->_inferRequest->CopyOutputsIfNeeded();
                  }}};
}

std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> AutoBatchAsyncInferRequest::GetPerformanceCounts()
    const {
    CheckState();
    if (AutoBatchInferRequest::eExecutionFlavor::BATCH_EXECUTED == _inferRequest->_wasBatchedRequestUsed)
        return _inferRequest->_myBatchedRequestWrapper._inferRequestBatched->GetPerformanceCounts();
    else
        return _inferRequestWithoutBatch->GetPerformanceCounts();
}

void AutoBatchAsyncInferRequest::Infer_ThreadUnsafe() {
    InferUsingAsync();
}

AutoBatchAsyncInferRequest::~AutoBatchAsyncInferRequest() {
    StopAndWait();
}

// ------------------------------AutoBatchExecutableNetwork----------------------------
AutoBatchExecutableNetwork::AutoBatchExecutableNetwork(
    const InferenceEngine::SoExecutableNetworkInternal& networkWithBatch,
    const InferenceEngine::SoExecutableNetworkInternal& networkWithoutBatch,
    const DeviceInformation& networkDevice,
    const std::unordered_map<std::string, InferenceEngine::Parameter>& config,
    const std::set<std::string>& batchedInputs,
    const std::set<std::string>& batchedOutputs)
    : InferenceEngine::ExecutableNetworkThreadSafeDefault(nullptr,
                                                          std::make_shared<InferenceEngine::ImmediateExecutor>()),
      _network{networkWithBatch},
      _networkWithoutBatch{networkWithoutBatch},
      _config{config},
      _batchedInputs(batchedInputs),
      _batchedOutputs(batchedOutputs) {
    // WA for gcc 4.8 (fails compilation with member init-list)
    _device = networkDevice;
    auto time_out = config.find(CONFIG_KEY(AUTO_BATCH_TIMEOUT));
    IE_ASSERT(time_out != config.end());
    _timeOut = ParseTimeoutValue(time_out->second.as<std::string>());
}

AutoBatchExecutableNetwork::~AutoBatchExecutableNetwork() {
    _terminate = true;
    for (auto w : _workerRequests) {
        w->_thread.join();
    }
    _workerRequests.clear();
}

unsigned int AutoBatchExecutableNetwork::ParseTimeoutValue(const std::string& s) {
    auto val = std::stoi(s);
    if (val < 0)
        IE_THROW(ParameterMismatch) << "Value for the " << CONFIG_KEY(AUTO_BATCH_TIMEOUT) << " should be unsigned int";
    return val;
}

std::shared_ptr<InferenceEngine::RemoteContext> AutoBatchExecutableNetwork::GetContext() const {
    return _networkWithoutBatch->GetContext();
}

InferenceEngine::IInferRequestInternal::Ptr AutoBatchExecutableNetwork::CreateInferRequestImpl(
    InferenceEngine::InputsDataMap networkInputs,
    InferenceEngine::OutputsDataMap networkOutputs) {
    auto workerRequestPtrAndId = GetWorkerInferRequest();
    return std::make_shared<AutoBatchInferRequest>(networkInputs,
                                                   networkOutputs,
                                                   workerRequestPtrAndId.first,
                                                   workerRequestPtrAndId.second,
                                                   _device.batchForDevice,
                                                   _batchedInputs,
                                                   _batchedOutputs);
}

InferenceEngine::IInferRequestInternal::Ptr AutoBatchExecutableNetwork::CreateInferRequestImpl(
    const std::vector<std::shared_ptr<const ov::Node>>& inputs,
    const std::vector<std::shared_ptr<const ov::Node>>& outputs) {
    if (!this->_plugin || !_plugin->IsNewAPI())
        return nullptr;
    auto workerRequestPtrAndId = GetWorkerInferRequest();
    return std::make_shared<AutoBatchInferRequest>(inputs,
                                                   outputs,
                                                   workerRequestPtrAndId.first,
                                                   workerRequestPtrAndId.second,
                                                   _device.batchForDevice,
                                                   _batchedInputs,
                                                   _batchedOutputs);
}
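// Returns the worker (the batched request plus its dedicated thread) that this logical request
// joins, together with its slot (batch_id) within the batch. A new worker is created for every
// _device.batchForDevice requests; the worker thread either starts the batched request once the
// batch is full, or falls back to per-request (batch-1) execution when the timeout expires first.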
std::pair<AutoBatchExecutableNetwork::WorkerInferRequest&, int> AutoBatchExecutableNetwork::GetWorkerInferRequest() {
    auto num = _numRequestsCreated++;
    std::lock_guard<std::mutex> lock(_workerRequestsMutex);
    auto batch_id = num % _device.batchForDevice;
    if (!batch_id) {  // need new request
        _workerRequests.push_back(std::make_shared<WorkerInferRequest>());
        auto workerRequestPtr = _workerRequests.back().get();
        workerRequestPtr->_inferRequestBatched = {_network->CreateInferRequest(), _network._so};
        workerRequestPtr->_batchSize = _device.batchForDevice;
        workerRequestPtr->_completionTasks.resize(workerRequestPtr->_batchSize);
        workerRequestPtr->_inferRequestBatched->SetCallback(
            [workerRequestPtr, this](std::exception_ptr exceptionPtr) mutable {
                if (exceptionPtr)
                    workerRequestPtr->_exceptionPtr = exceptionPtr;
                IE_ASSERT(workerRequestPtr->_completionTasks.size() == (size_t)workerRequestPtr->_batchSize);
                // notify the individual requests on the completion
                for (int c = 0; c < workerRequestPtr->_batchSize; c++) {
                    workerRequestPtr->_completionTasks[c]();
                }
                // reset the timeout
                workerRequestPtr->_cond.notify_one();
            });

        workerRequestPtr->_thread = std::thread([workerRequestPtr, this] {
            while (1) {
                std::cv_status status;
                {
                    std::unique_lock<std::mutex> lock(workerRequestPtr->_mutex);
                    status = workerRequestPtr->_cond.wait_for(lock, std::chrono::milliseconds(_timeOut));
                }
                if (_terminate) {
                    break;
                } else {
                    // as we pop the tasks from the queue only here
                    // it is ok to call size() (as the _tasks can only grow in parallel)
                    const int sz = static_cast<int>(workerRequestPtr->_tasks.size());
                    if (sz == workerRequestPtr->_batchSize) {
                        std::pair<AutoBatchAsyncInferRequest*, InferenceEngine::Task> t;
                        for (int n = 0; n < sz; n++) {
                            IE_ASSERT(workerRequestPtr->_tasks.try_pop(t));
                            workerRequestPtr->_completionTasks[n] = std::move(t.second);
                            t.first->_inferRequest->CopyInputsIfNeeded();
                            t.first->_inferRequest->_wasBatchedRequestUsed =
                                AutoBatchInferRequest::eExecutionFlavor::BATCH_EXECUTED;
                        }
                        workerRequestPtr->_inferRequestBatched->StartAsync();
                    } else if ((status == std::cv_status::timeout) && sz) {
                        // timeout to collect the batch is over, have to execute the requests in the batch1 mode
                        std::pair<AutoBatchAsyncInferRequest*, InferenceEngine::Task> t;
                        // popping all tasks collected by the moment of the time-out and execute each with batch1
                        std::atomic<int> arrived = {0};
                        std::promise<void> all_completed;
                        auto all_completed_future = all_completed.get_future();
                        for (int n = 0; n < sz; n++) {
                            IE_ASSERT(workerRequestPtr->_tasks.try_pop(t));
                            t.first->_inferRequestWithoutBatch->SetCallback(
                                [t, sz, &arrived, &all_completed](std::exception_ptr p) {
                                    if (p)
                                        t.first->_inferRequest->_exceptionPtr = p;
                                    t.second();
                                    if (sz == ++arrived)
                                        all_completed.set_value();
                                });
                            t.first->_inferRequest->_wasBatchedRequestUsed =
                                AutoBatchInferRequest::eExecutionFlavor::TIMEOUT_EXECUTED;
                            t.first->_inferRequest->SetBlobsToAnotherRequest(t.first->_inferRequestWithoutBatch);
                            t.first->_inferRequestWithoutBatch->StartAsync();
                        }
                        all_completed_future.get();
                        // now when all the tasks for this batch are completed, start waiting for the timeout again
                    }
                }
            }
        });
    }
    return {*_workerRequests.back(), static_cast<int>(batch_id)};
}

InferenceEngine::IInferRequestInternal::Ptr AutoBatchExecutableNetwork::CreateInferRequest() {
    if (!_network) {
        auto res = _networkWithoutBatch->CreateInferRequest();
        res->setPointerToExecutableNetworkInternal(shared_from_this());
        res->setPointerToSo(_networkWithoutBatch._so);
        _so = _networkWithoutBatch._so;
        return res;
    }
    // trying to create the new API request first
    IInferRequestInternal::Ptr syncRequestImpl = CreateInferRequestImpl(_parameters, _results);
    if (!syncRequestImpl)
        syncRequestImpl = CreateInferRequestImpl(_networkInputs, _networkOutputs);
    syncRequestImpl->setPointerToExecutableNetworkInternal(shared_from_this());
    InferenceEngine::SoIInferRequestInternal inferRequestWithoutBatch = {_networkWithoutBatch->CreateInferRequest(),
                                                                         _networkWithoutBatch._so};
    return std::make_shared<AutoBatchAsyncInferRequest>(
        std::static_pointer_cast<AutoBatchInferRequest>(syncRequestImpl),
        inferRequestWithoutBatch,
        _callbackExecutor);
}

std::shared_ptr<ngraph::Function> AutoBatchExecutableNetwork::GetExecGraphInfo() {
    return _network && _network->GetExecGraphInfo() ? _network->GetExecGraphInfo()
                                                    : _networkWithoutBatch->GetExecGraphInfo();
}
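// Of the network-level options, only AUTO_BATCH_TIMEOUT can be changed after the network is
// loaded (see also GetMetric(SUPPORTED_CONFIG_KEYS) below).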
void AutoBatchExecutableNetwork::SetConfig(const std::map<std::string, InferenceEngine::Parameter>& config) {
    auto timeout = config.find(CONFIG_KEY(AUTO_BATCH_TIMEOUT));
    if (timeout == config.end() || config.size() > 1) {
        IE_THROW() << "The only config option that can be changed on the fly for the AutoBatching is the "
                   << CONFIG_KEY(AUTO_BATCH_TIMEOUT);
    } else {
        _timeOut = ParseTimeoutValue(timeout->second.as<std::string>());
    }
}

InferenceEngine::Parameter AutoBatchExecutableNetwork::GetConfig(const std::string& name) const {
    auto it = _config.find(name);
    if (it != _config.end()) {
        return it->second;
    } else {
        // find config key among the network's config keys
        auto param = _networkWithoutBatch->GetMetric(METRIC_KEY(SUPPORTED_CONFIG_KEYS));
        for (auto&& configKey : param.as<std::vector<std::string>>()) {
            if (configKey == name) {
                return _networkWithoutBatch->GetConfig(configKey);
            }
        }
        IE_THROW(NotFound) << name << " not found in the ExecutableNetwork config";
    }
}

InferenceEngine::Parameter AutoBatchExecutableNetwork::GetMetric(const std::string& name) const {
    if (name == METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS)) {
        auto reqs = 0;
        try {
            auto hint = _networkWithoutBatch->GetConfig(CONFIG_KEY(PERFORMANCE_HINT_NUM_REQUESTS)).as<std::string>();
            reqs = InferenceEngine::PerfHintsConfig::CheckPerformanceHintRequestValue(hint);
            if (!reqs)  // no limitations from the user, let's deduce the full-blown #requests
                // (multiplied by the device's capability to run multiple requests for further perf)
                reqs = _device.batchForDevice *
                       _networkWithoutBatch->GetMetric(METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS))
                           .as<unsigned int>();
        } catch (const InferenceEngine::Exception&) {
        }
        reqs = std::max(reqs, _device.batchForDevice);  // round up to the possible user's value
        IE_SET_METRIC_RETURN(OPTIMAL_NUMBER_OF_INFER_REQUESTS, reqs);
    } else if (name == METRIC_KEY(NETWORK_NAME)) {
        IE_SET_METRIC_RETURN(NETWORK_NAME,
                             _networkWithoutBatch->GetMetric(METRIC_KEY(NETWORK_NAME)).as<std::string>());
    } else if (name == METRIC_KEY(SUPPORTED_METRICS)) {
        IE_SET_METRIC_RETURN(SUPPORTED_METRICS,
                             {METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS),
                              METRIC_KEY(SUPPORTED_METRICS),
                              METRIC_KEY(NETWORK_NAME),
                              METRIC_KEY(SUPPORTED_CONFIG_KEYS)});
    } else if (name == METRIC_KEY(SUPPORTED_CONFIG_KEYS)) {
        IE_SET_METRIC_RETURN(SUPPORTED_CONFIG_KEYS,
                             {CONFIG_KEY(AUTO_BATCH_TIMEOUT)});  // only timeout can be changed on the fly
    } else {
        IE_THROW() << "Unsupported Network metric: " << name;
    }
}

// ------------------------------AutoBatchInferencePlugin----------------------------
namespace {
std::map<std::string, std::string> mergeConfigs(std::map<std::string, std::string> config,
                                                const std::map<std::string, std::string>& local) {
    for (auto&& kvp : local) {
        config[kvp.first] = kvp.second;
    }
    return config;
}
}  // namespace
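// Parses a device string of the form "DEVICE" or "DEVICE(N)", e.g. "GPU(4)": the optional
// parenthesized value is the requested batch size; when it is omitted, the batch size is left
// as 0 and deduced later in LoadNetworkImpl.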
DeviceInformation AutoBatchInferencePlugin::ParseBatchDevice(const std::string& deviceWithBatch) {
    auto&& d = deviceWithBatch;
    auto openingBracket = d.find_first_of('(');
    auto closingBracket = d.find_first_of(')', openingBracket);
    auto deviceName = d.substr(0, openingBracket);

    int batch = 0;
    if (closingBracket != std::string::npos && openingBracket < closingBracket) {
        batch = std::stol(d.substr(openingBracket + 1, closingBracket - openingBracket - 1));

        if (batch <= 0) {
            IE_THROW() << "Batch value for '" << deviceName << "' must be > 0, while " << batch << " is passed";
        }
    }
    return {deviceName, {{}}, batch};
}

DeviceInformation AutoBatchInferencePlugin::ParseMetaDevice(const std::string& devicesBatchCfg,
                                                            const std::map<std::string, std::string>& config) const {
    auto getDeviceConfig = [&](const DeviceName& deviceWithID) {
        DeviceIDParser deviceParser(deviceWithID);
        std::string deviceName = deviceParser.getDeviceName();
        std::map<std::string, std::string> tconfig = mergeConfigs(_config, config);

        // set device ID if any
        std::string deviceIDLocal = deviceParser.getDeviceID();
        if (!deviceIDLocal.empty()) {
            tconfig[PluginConfigParams::KEY_DEVICE_ID] = deviceIDLocal;
        }
        // pass the cache dir through to core->LoadNetwork when the underlying device does not support it itself
        auto deviceConfig = GetCore()->GetSupportedConfig(deviceName, tconfig);
        if (tconfig.find(CONFIG_KEY(CACHE_DIR)) != tconfig.end() &&
            deviceConfig.find(CONFIG_KEY(CACHE_DIR)) == deviceConfig.end()) {
            auto tmpiter = tconfig.find(CONFIG_KEY(CACHE_DIR));
            if (tmpiter != tconfig.end())
                deviceConfig.insert({tmpiter->first, tmpiter->second});
        }
        return deviceConfig;
    };

    auto metaDevice = ParseBatchDevice(devicesBatchCfg);
    metaDevice.config = getDeviceConfig(metaDevice.deviceName);

    auto cfg = config;
    // check that no irrelevant config keys are left
    for (auto k : config) {
        const auto& name = k.first;
        auto found_in_supported_cfg = std::find(supported_configKeys.begin(), supported_configKeys.end(), k.first);
        auto found_in_device_cfg = metaDevice.config.find(k.first);
        if (found_in_device_cfg == metaDevice.config.end() && found_in_supported_cfg == supported_configKeys.end()) {
            IE_THROW() << "Unsupported config key: " << name;
        }
    }
    return metaDevice;
}

RemoteContext::Ptr AutoBatchInferencePlugin::CreateContext(const InferenceEngine::ParamMap& config) {
    auto cfg = config;
    auto it = cfg.find(CONFIG_KEY(AUTO_BATCH_DEVICE_CONFIG));
    if (it == cfg.end())
        IE_THROW() << "Value for KEY_AUTO_BATCH_DEVICE_CONFIG is not set";

    auto val = it->second.as<std::string>();
    auto core = GetCore();
    if (!core)
        return nullptr;
    auto metaDevice = ParseMetaDevice(val, std::map<std::string, std::string>());
    cfg.erase(it);
    return core->CreateContext(metaDevice.deviceName, cfg);
}

Parameter AutoBatchInferencePlugin::GetConfig(const std::string& name,
                                              const std::map<std::string, Parameter>& options) const {
    if (supported_configKeys.end() != std::find(supported_configKeys.begin(), supported_configKeys.end(), name)) {
        auto it = _config.find(name);
        if (it == _config.end()) {
            IE_THROW() << "Value for " << name << " is not set";
        } else {
            return {it->second};
        }
    } else {
        IE_THROW() << "Unsupported config key: " << name;
    }
}

void AutoBatchInferencePlugin::CheckConfig(const std::map<std::string, std::string>& config) {
    for (auto&& kvp : config) {
        const auto name = kvp.first;
        const auto val = kvp.second;
        if (supported_configKeys.end() == std::find(supported_configKeys.begin(), supported_configKeys.end(), name))
            IE_THROW() << "Unsupported config key: " << name;
        if (name == CONFIG_KEY(AUTO_BATCH_DEVICE_CONFIG)) {
            ParseBatchDevice(val);
        } else if (name == CONFIG_KEY(AUTO_BATCH_TIMEOUT)) {
            try {
                auto t = std::stoi(val);
                if (t < 0)
                    IE_THROW(ParameterMismatch);
            } catch (const std::exception&) {
                IE_THROW(ParameterMismatch)
                    << " Expecting unsigned int value for " << CONFIG_KEY(AUTO_BATCH_TIMEOUT) << " got " << val;
            }
        }
    }
}

void AutoBatchInferencePlugin::SetConfig(const std::map<std::string, std::string>& config) {
    CheckConfig(config);
    for (auto&& kvp : config) {
        _config[kvp.first] = kvp.second;
    }
}

static const Version version = {{2, 1}, CI_BUILD_NUMBER, "AutoBatchPlugin"};
IE_DEFINE_PLUGIN_CREATE_FUNCTION(AutoBatchInferencePlugin, version)

AutoBatchInferencePlugin::AutoBatchInferencePlugin() {
    _pluginName = "BATCH";
    _config[CONFIG_KEY(AUTO_BATCH_TIMEOUT)] = "1000";  // default value, in ms
}

InferenceEngine::Parameter AutoBatchInferencePlugin::GetMetric(
    const std::string& name,
    const std::map<std::string, InferenceEngine::Parameter>& options) const {
    if (name == METRIC_KEY(SUPPORTED_METRICS)) {
        std::vector<std::string> metrics;
        metrics.push_back(METRIC_KEY(SUPPORTED_METRICS));
        metrics.push_back(METRIC_KEY(FULL_DEVICE_NAME));
        metrics.push_back(METRIC_KEY(SUPPORTED_CONFIG_KEYS));
        IE_SET_METRIC_RETURN(SUPPORTED_METRICS, metrics);
    } else if (name == METRIC_KEY(FULL_DEVICE_NAME)) {
        IE_SET_METRIC_RETURN(FULL_DEVICE_NAME, _pluginName);
    } else if (name == METRIC_KEY(SUPPORTED_CONFIG_KEYS)) {
        IE_SET_METRIC_RETURN(SUPPORTED_CONFIG_KEYS, supported_configKeys);
    } else {
        IE_THROW(NotFound) << "Unsupported metric key " << name;
    }
}

IExecutableNetworkInternal::Ptr AutoBatchInferencePlugin::LoadExeNetworkImpl(
    const InferenceEngine::CNNNetwork& network,
    const std::map<std::string, std::string>& config) {
    return LoadNetworkImpl(network, nullptr, config);
}
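// Core of the plugin: parses the meta device, verifies that auto-batching is applicable
// (static shapes, batch detected on the 0th dimension of inputs/outputs via dimension
// tracking), deduces the batch size when it is not set explicitly, optionally shrinks it to
// fit GPU memory, and loads both the re-batched and the original (batch-1 fallback) networks.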
InferenceEngine::IExecutableNetworkInternal::Ptr AutoBatchInferencePlugin::LoadNetworkImpl(
    const InferenceEngine::CNNNetwork& network,
    const std::shared_ptr<InferenceEngine::RemoteContext> ctx,
    const std::map<std::string, std::string>& config) {
    auto core = GetCore();
    if (core == nullptr) {
        IE_THROW() << "Please, work with Auto-Batching device via InferenceEngine::Core object";
    }
    auto fullConfig = mergeConfigs(_config, config);
    auto device_batch = fullConfig.find(CONFIG_KEY(AUTO_BATCH_DEVICE_CONFIG));
    if (device_batch == fullConfig.end()) {
        IE_THROW() << "KEY_AUTO_BATCH key is not set for BATCH device";
    }
    auto metaDevice = ParseMetaDevice(device_batch->second, fullConfig);
    const auto& deviceName = metaDevice.deviceName;
    const auto& deviceConfig = metaDevice.config;
    auto deviceConfigNoAutoBatch = deviceConfig;
    // avoid recursive auto-batching
    deviceConfigNoAutoBatch[CONFIG_KEY(ALLOW_AUTO_BATCHING)] = CONFIG_VALUE(NO);

    std::set<std::string> batched_inputs;
    std::set<std::string> batched_outputs;
    // check that the auto-batching is applicable in general
    try {
        // if applicable, the Auto-Batching is implicitly enabled via the performance hints
        const auto tput = CONFIG_VALUE(THROUGHPUT);
        const bool bTputInPlg = core->GetConfig(deviceName, CONFIG_KEY(PERFORMANCE_HINT)).as<std::string>() == tput;
        const auto& mode = deviceConfig.find(CONFIG_KEY(PERFORMANCE_HINT));
        const bool bTputInLoadCfg = (mode != deviceConfig.end() && mode->second == tput);
        // if the auto-batching is enabled implicitly, check the dims carefully, to avoid outstanding failures
        const bool check_dims = (bTputInPlg || bTputInLoadCfg);
        CNNNetwork clonedNetwork(InferenceEngine::details::cloneNetwork(network));
        auto function = clonedNetwork.getFunction();
        // find the batch dim
        ov::pass::Manager m;
        m.register_pass<ov::pass::InitNodeInfo>();
        m.register_pass<ov::pass::FindBatch>(false, check_dims);
        m.run_passes(function);
        // do not reshape/re-batch originally batched networks and networks with no inputs in the N* layouts;
        // input(s) should have the batch dim as the first dim (current limitation of the auto-batching impl)
        const auto& params = function->get_parameters();
        for (size_t input_id = 0; input_id < params.size(); input_id++) {
            const auto& input = params[input_id];
            const auto& shape = input->get_partial_shape();
            // currently no plugin supports batched execution for dynamic networks
            if (shape.is_dynamic())
                IE_THROW(NotImplemented) << "Auto-batching does not support dynamic networks!";
            // check the batch dim: either 0th (and the original batch size of 1) or none
            if (shape.size() && ov::DimensionTracker::get_label(shape[0])) {
                const auto& static_shape = input->get_shape();
                if (static_shape[0] != 1)
                    IE_THROW(NotImplemented) << "Auto-batching does not reshape/re-batch originally batched networks!";
                batched_inputs.insert(
                    ngraph::op::util::get_ie_output_name(params[input_id]->output(0)));  // batched dim for the input
            } else {
                // if the 0th dim is not the batch, then we support only the case when NO dimension is the batch
                for (size_t s = 1; s < shape.size(); s++)
                    if (ov::DimensionTracker::get_label(shape[s]))
                        IE_THROW(NotImplemented)
                            << "Auto-batching operates only on networks with inputs/outputs batched by 0th dimension";
            }
        }
        const auto& results = function->get_results();
        for (size_t output_id = 0; output_id < results.size(); output_id++) {
            const auto& output = results[output_id];
            const auto& shape = output->get_output_partial_shape(0);
            if (shape.is_dynamic())
                IE_THROW(NotImplemented) << "Auto-batching does not support dynamic networks!";
            // check the batch dim: either 0th (and the original batch size of 1) or none
            if (shape.size() && ov::DimensionTracker::get_label(shape[0])) {
                if (shape[0] != 1)
                    IE_THROW(NotImplemented) << "Auto-batching does not reshape/re-batch originally batched networks!";
                const auto& node = output->input_value(0);
                batched_outputs.insert(ngraph::op::util::get_ie_output_name(
                    ov::Output<const ov::Node>(node.get_node(), node.get_index())));
            } else {
                // if the 0th dim is not the batch, then we support only the case when NO dimension is the batch
                for (size_t s = 1; s < shape.size(); s++)
                    if (ov::DimensionTracker::get_label(shape[s]))
                        IE_THROW(NotImplemented)
                            << "Auto-batching operates only on networks with outputs batched by 0th dimension";
            }
        }
        if (!batched_inputs.size() || !batched_outputs.size())
            IE_THROW(NotImplemented)
                << "Auto-batching supports only networks with inputs/outputs featuring batched dim!";
    } catch (...) {
        metaDevice.batchForDevice = 1;
    }
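    // No explicit batch size in the device name (e.g. plain "BATCH:GPU" rather than "BATCH:GPU(4)"):
    // query the device for OPTIMAL_BATCH_SIZE and cap it by the number of requests promised via the
    // performance hints, either globally or in this load's config.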
    if (!metaDevice.batchForDevice) {
        unsigned int requests = 0;
        // batch size is not set explicitly via the device name, so let's query the optimal batch size
        std::map<std::string, InferenceEngine::Parameter> options;
        options["MODEL_PTR"] = std::const_pointer_cast<ngraph::Function>(network.getFunction());
        auto optBatchSize = core->GetMetric(deviceName, METRIC_KEY(OPTIMAL_BATCH_SIZE), options).as<unsigned int>();
        auto res = core->GetConfig(deviceName, CONFIG_KEY(PERFORMANCE_HINT_NUM_REQUESTS)).as<std::string>();
        requests = PerfHintsConfig::CheckPerformanceHintRequestValue(res);
        const auto& reqs = config.find(CONFIG_KEY(PERFORMANCE_HINT_NUM_REQUESTS));
        if (reqs != config.end())
            requests = static_cast<unsigned int>(PerfHintsConfig::CheckPerformanceHintRequestValue(reqs->second));
        if (requests)
            optBatchSize = std::max(1u, std::min(requests, optBatchSize));
        if (optBatchSize > 2)  // batching is usually inefficient for batch<4 (as batch1 kernels are heavily optimized)
            metaDevice.batchForDevice = optBatchSize;
        else
            metaDevice.batchForDevice = 1;
    }

    auto report_footprint = [](std::shared_ptr<ICore> pCore, std::string device) -> size_t {
        size_t footprint = 0;
        // TODO: use the per-network metric (22.2) rather than plugin-level
        auto stats =
            pCore->GetMetric(device, ov::intel_gpu::memory_statistics.name()).as<std::map<std::string, uint64_t>>();
        for (auto s : stats)
            footprint += s.second;
        return footprint;
    };

    size_t batch1_footprint = 0;
    if (deviceName.find("GPU") != std::string::npos)
        batch1_footprint = report_footprint(core, deviceName);
    auto executableNetworkWithoutBatch = ctx ? core->LoadNetwork(network, ctx, deviceConfigNoAutoBatch)
                                             : core->LoadNetwork(network, deviceName, deviceConfigNoAutoBatch);
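    // With the batch-1 network loaded, estimate how many copies of it fit into the remaining GPU
    // memory and round the estimate down to a power of two; the final batch size is the smaller of
    // this value and the previously deduced one.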
    if (deviceName.find("GPU") != std::string::npos) {
        batch1_footprint = report_footprint(core, deviceName) - batch1_footprint;
        if (batch1_footprint) {
            const auto total_mem =
                GetCore()->GetMetric(deviceName, GPU_METRIC_KEY(DEVICE_TOTAL_MEM_SIZE)).as<uint64_t>();
            const int estimated_batch = static_cast<int>((total_mem - batch1_footprint) / batch1_footprint);
            int closest = static_cast<int>(pow(2, floor(log(estimated_batch) / log(2))));
            closest = std::max(1, closest);
            metaDevice.batchForDevice = std::min(metaDevice.batchForDevice, closest);
        }
    }

    // auto-batch settings
    std::unordered_map<std::string, InferenceEngine::Parameter> networkConfig;
    for (auto c : fullConfig) {
        if (supported_configKeys.end() != std::find(supported_configKeys.begin(), supported_configKeys.end(), c.first))
            networkConfig.insert(c);
    }

    InferenceEngine::SoExecutableNetworkInternal executableNetworkWithBatch;
    if (metaDevice.batchForDevice > 1 && batched_inputs.size()) {
        try {
            CNNNetwork reshaped(InferenceEngine::details::cloneNetwork(network));
            ICNNNetwork::InputShapes shapes = reshaped.getInputShapes();
            for (const auto& input : batched_inputs)
                shapes[input][0] = metaDevice.batchForDevice;
            reshaped.reshape(shapes);
            executableNetworkWithBatch = ctx ? core->LoadNetwork(reshaped, ctx, deviceConfigNoAutoBatch)
                                             : core->LoadNetwork(reshaped, deviceName, deviceConfigNoAutoBatch);
        } catch (...) {
            metaDevice.batchForDevice = 1;
        }
    }

    return std::make_shared<AutoBatchExecutableNetwork>(executableNetworkWithBatch,
                                                        executableNetworkWithoutBatch,
                                                        metaDevice,
                                                        networkConfig,
                                                        batched_inputs,
                                                        batched_outputs);
}

InferenceEngine::IExecutableNetworkInternal::Ptr AutoBatchInferencePlugin::LoadExeNetworkImpl(
    const InferenceEngine::CNNNetwork& network,
    const std::shared_ptr<InferenceEngine::RemoteContext>& context,
    const std::map<std::string, std::string>& config) {
    return LoadNetworkImpl(network, context, config);
}

InferenceEngine::QueryNetworkResult AutoBatchInferencePlugin::QueryNetwork(
    const InferenceEngine::CNNNetwork& network,
    const std::map<std::string, std::string>& config) const {
    auto core = GetCore();
    if (!core)
        return InferenceEngine::QueryNetworkResult();
    auto cfg = config;
    for (auto c : cfg) {
        if (c.first == CONFIG_KEY(AUTO_BATCH_DEVICE_CONFIG)) {
            auto val = c.second;
            cfg.erase(c.first);
            auto metaDevice = ParseMetaDevice(val, cfg);
            return core->QueryNetwork(network, metaDevice.deviceName, cfg);
        }
    }
    IE_THROW() << "Value for KEY_AUTO_BATCH_DEVICE_CONFIG is not set";
}

}  // namespace AutoBatchPlugin
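// Usage sketch (illustrative only, not part of the plugin): an application engages this plugin
// explicitly through the "BATCH" virtual device; the model path and batch size below are
// assumptions for the example.
//
//     InferenceEngine::Core core;
//     auto network = core.ReadNetwork("model.xml");  // hypothetical model path
//     // batch of 4 on GPU, with a 100 ms window to collect the batch:
//     auto execNet = core.LoadNetwork(network,
//                                     "BATCH:GPU(4)",
//                                     {{CONFIG_KEY(AUTO_BATCH_TIMEOUT), "100"}});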