openvino/src/plugins/auto_batch/auto_batch.cpp

// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
///////////////////////////////////////////////////////////////////////////////////////////////////
#include "auto_batch.hpp"
#include <cpp_interfaces/interface/ie_internal_plugin_config.hpp>
#include <ie_icore.hpp>
#include <ie_ngraph_utils.hpp>
#include <ie_performance_hints.hpp>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <transformations/utils/utils.hpp>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "openvino/runtime/intel_gpu/properties.hpp"
namespace AutoBatchPlugin {
using namespace InferenceEngine;
std::vector<std::string> supported_configKeys = {CONFIG_KEY(AUTO_BATCH_DEVICE_CONFIG), CONFIG_KEY(AUTO_BATCH_TIMEOUT)};
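// Helper: builds a per-request Blob as a zero-copy view into the worker's batched Blob.
// For batched layouts (NC/NCHW/NHWC/NCDHW/NDHWC) the view covers only the batch_id-th slice of the buffer;
// for any other layout the whole buffer is shared as-is (e.g. constants).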
template <Precision::ePrecision precision>
Blob::Ptr create_shared_blob_on_top_of_batched_blob(Blob::Ptr batched_blob, size_t batch_id, size_t batch_num) {
typedef typename PrecisionTrait<precision>::value_type TYPE;
typedef typename std::add_pointer<TYPE>::type TYPEPTR;
auto ptr = batched_blob->buffer().as<TYPEPTR>();
auto sizePerBatch = batched_blob->size() / batch_num;
auto layout = batched_blob->getTensorDesc().getLayout();
SizeVector dims = batched_blob->getTensorDesc().getDims();
// the below code is a placeholder for the WIP (22.1) functionality
// that will check the reshaping by the batch is robust (CVS-51744)
if (layout == InferenceEngine::Layout::NC || layout == InferenceEngine::Layout::NCDHW ||
layout == InferenceEngine::Layout::NCHW || layout == InferenceEngine::Layout::NHWC ||
layout == InferenceEngine::Layout::NDHWC) {
dims[0] = 1;
assert(batched_blob->getTensorDesc().getPrecision() == precision);
return make_shared_blob<TYPE>({precision, dims, batched_blob->getTensorDesc().getLayout()},
ptr + sizePerBatch * batch_id,
sizePerBatch);
} else {
// same blob for all requests (e.g. constants)
return make_shared_blob<TYPE>({precision, dims, batched_blob->getTensorDesc().getLayout()}, ptr);
}
}
// ------------------------------AutoBatchInferRequest----------------------------
AutoBatchInferRequest::AutoBatchInferRequest(const std::vector<std::shared_ptr<const ov::Node>>& inputs,
const std::vector<std::shared_ptr<const ov::Node>>& outputs,
AutoBatchExecutableNetwork::WorkerInferRequest& workerRequest,
int batch_id,
int num_batch,
bool needPerfCounters)
: IInferRequestInternal(inputs, outputs),
_myBatchedRequestWrapper(workerRequest),
_needPerfCounters(needPerfCounters),
_batchId(batch_id),
_batchSize(num_batch) {
ShareBlobsWithBatchRequest();
}
AutoBatchInferRequest::AutoBatchInferRequest(const InputsDataMap& networkInputs,
const OutputsDataMap& networkOutputs,
AutoBatchExecutableNetwork::WorkerInferRequest& workerRequest,
int batch_id,
int num_batch,
bool needPerfCounters)
: IInferRequestInternal(networkInputs, networkOutputs),
_myBatchedRequestWrapper(workerRequest),
_needPerfCounters(needPerfCounters),
_batchId(batch_id),
_batchSize(num_batch) {
ShareBlobsWithBatchRequest();
}
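// Wires this request's input/output blobs as views into the worker's batched blobs, so the batched execution
// reads inputs from (and writes results to) the memory that the individual request already exposes.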
void AutoBatchInferRequest::ShareBlobsWithBatchRequest() {
// Allocate all input blobs
for (const auto& it : _networkInputs) {
auto blob = _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first);
Blob::Ptr res;
switch (it.second->getTensorDesc().getPrecision()) {
case InferenceEngine::Precision::FP32:
res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::FP32>(
_myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
_batchId,
_batchSize);
break;
case InferenceEngine::Precision::I32:
res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I32>(
_myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
_batchId,
_batchSize);
break;
case InferenceEngine::Precision::I8:
res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I8>(
_myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
_batchId,
_batchSize);
break;
case InferenceEngine::Precision::U16:
res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::U16>(
_myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
_batchId,
_batchSize);
break;
case InferenceEngine::Precision::I16:
res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I16>(
_myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
_batchId,
_batchSize);
break;
case InferenceEngine::Precision::U8:
case InferenceEngine::Precision::BOOL:
res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::U8>(
_myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
_batchId,
_batchSize);
break;
default:
IE_THROW() << "Unsupported input precision " << it.second->getTensorDesc().getPrecision();
}
_inputs[it.first] = res;
}
// Allocate all output blobs
for (const auto& it : _networkOutputs) {
auto blob = _myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first);
Blob::Ptr res;
switch (it.second->getTensorDesc().getPrecision()) {
case InferenceEngine::Precision::FP32:
res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::FP32>(
_myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
_batchId,
_batchSize);
break;
case InferenceEngine::Precision::I32:
res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I32>(
_myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
_batchId,
_batchSize);
break;
case InferenceEngine::Precision::I8:
res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I8>(
_myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
_batchId,
_batchSize);
break;
case InferenceEngine::Precision::U16:
res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::U16>(
_myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
_batchId,
_batchSize);
break;
case InferenceEngine::Precision::I16:
res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I16>(
_myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
_batchId,
_batchSize);
break;
case InferenceEngine::Precision::U8:
case InferenceEngine::Precision::BOOL:
res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::U8>(
_myBatchedRequestWrapper._inferRequestBatched->GetBlob(it.first),
_batchId,
_batchSize);
break;
default:
IE_THROW(NotImplemented) << "Unsupported input precision " << it.second->getTensorDesc().getPrecision();
}
_outputs[it.first] = res;
}
}
void AutoBatchInferRequest::SetBlobsToAnotherRequest(SoIInferRequestInternal& req) {
for (const auto& it : _networkInputs) {
auto& name = it.first;
// this request is already in BUSY state, so using the internal functions safely
auto blob = GetBlob(name);
if (req->GetBlob(name) != blob)
req->SetBlob(name, blob);
}
for (const auto& it : _networkOutputs) {
auto& name = it.first;
// this request is already in BUSY state, so using the internal functions safely
auto blob = GetBlob(name);
if (req->GetBlob(name) != blob)
req->SetBlob(name, blob);
}
}
void AutoBatchInferRequest::CopyInputsIfNeeded() {
for (const auto& it : _networkInputs) {
auto& name = it.first;
// this request is already in BUSY state, so using the internal functions safely
CopyBlobIfNeeded(GetBlob(name), _myBatchedRequestWrapper._inferRequestBatched->GetBlob(name), true);
}
}
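// Copies between the per-request blob and the matching slice of the worker's batched blob; nothing is copied
// when the pointers already alias the same memory (i.e. the originally shared view is still in place).
// For inputs the offset is applied to the destination (the batched blob), for outputs to the source.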
void AutoBatchInferRequest::CopyBlobIfNeeded(InferenceEngine::Blob::CPtr src,
InferenceEngine::Blob::Ptr dst,
bool bInput) {
auto bufferDst = dst->buffer();
auto ptrDst = bufferDst.as<char*>();
auto bufferSrc = src->cbuffer();
auto ptrSrc = bufferSrc.as<const char*>();
ptrdiff_t szDst = dst->byteSize();
ptrdiff_t szSrc = src->byteSize();
if (bInput) {
ptrdiff_t offset = szSrc != szDst ? _batchId * szDst / _batchSize : 0;
if ((ptrDst + offset) == ptrSrc)
return;
else
memcpy(ptrDst + offset, ptrSrc, szSrc);
} else {
ptrdiff_t offset = szSrc != szDst ? _batchId * szSrc / _batchSize : 0;
if ((ptrSrc + offset) == ptrDst)
return;
else
memcpy(ptrDst, ptrSrc + offset, szDst);
}
}
void AutoBatchInferRequest::CopyOutputsIfNeeded() {
for (const auto& it : _networkOutputs) {
auto& name = it.first;
// this request is already in BUSY state, so using the internal functions safely
CopyBlobIfNeeded(_myBatchedRequestWrapper._inferRequestBatched->GetBlob(name), GetBlob(name), false);
}
}
std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> AutoBatchInferRequest::GetPerformanceCounts() const {
return _perfMap;
}
AutoBatchAsyncInferRequest::AutoBatchAsyncInferRequest(
const AutoBatchInferRequest::Ptr& inferRequest,
const bool needPerfCounters,
InferenceEngine::SoIInferRequestInternal& inferRequestWithoutBatch,
const ITaskExecutor::Ptr& callbackExecutor)
: AsyncInferRequestThreadSafeDefault(inferRequest, nullptr, callbackExecutor),
_inferRequestWithoutBatch(inferRequestWithoutBatch),
_inferRequest{inferRequest} {
// this executor starts the inference while the task (checking the result) is passed to the next stage
struct ThisRequestExecutor : public ITaskExecutor {
explicit ThisRequestExecutor(AutoBatchAsyncInferRequest* _this_) : _this{_this_} {}
void run(Task task) override {
auto& workerInferRequest = _this->_inferRequest->_myBatchedRequestWrapper;
std::pair<AutoBatchAsyncInferRequest*, InferenceEngine::Task> t;
t.first = _this;
t.second = std::move(task);
workerInferRequest._tasks.push(t);
// it is ok to call size() here as the queue only grows (and the bulk removal happens under the mutex)
const int sz = workerInferRequest._tasks.size();
if (sz == workerInferRequest._batchSize) {
workerInferRequest._cond.notify_one();
}
};
AutoBatchAsyncInferRequest* _this = nullptr;
};
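// Two-stage pipeline: the ThisRequestExecutor stage enqueues this request's completion task to the worker
// (which kicks off the batched inference once a full batch is collected); the second stage re-throws any stored
// exception, copies the outputs back from the batched request and optionally collects the performance counters.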
_pipeline = {{/*TaskExecutor*/ std::make_shared<ThisRequestExecutor>(this), /*task*/ [this, needPerfCounters] {
if (this->_inferRequest->_exceptionPtr) // if the exception happened in the batch1 fallback
std::rethrow_exception(this->_inferRequest->_exceptionPtr);
auto& batchReq = this->_inferRequest->_myBatchedRequestWrapper;
if (batchReq._exceptionPtr) // when the batchN execution failed
std::rethrow_exception(batchReq._exceptionPtr);
this->_inferRequest->CopyOutputsIfNeeded();
if (needPerfCounters) {
try {
this->_inferRequest->_perfMap = batchReq._inferRequestBatched->GetPerformanceCounts();
} catch (...) {
}
}
}}};
}
void AutoBatchAsyncInferRequest::Infer_ThreadUnsafe() {
InferUsingAsync();
}
AutoBatchAsyncInferRequest::~AutoBatchAsyncInferRequest() {
StopAndWait();
}
// ------------------------------AutoBatchExecutableNetwork----------------------------
AutoBatchExecutableNetwork::AutoBatchExecutableNetwork(
const InferenceEngine::SoExecutableNetworkInternal& networkWithBatch,
const InferenceEngine::SoExecutableNetworkInternal& networkWithoutBatch,
const DeviceInformation& networkDevice,
const std::unordered_map<std::string, InferenceEngine::Parameter>& config,
const bool needPerfCounters)
: InferenceEngine::ExecutableNetworkThreadSafeDefault(nullptr,
std::make_shared<InferenceEngine::ImmediateExecutor>()),
_network{networkWithBatch},
_networkWithoutBatch{networkWithoutBatch},
_config{config},
_needPerfCounters{needPerfCounters} {
// WA for gcc 4.8 (fails compilation with member init-list)
_device = networkDevice;
auto time_out = config.find(CONFIG_KEY(AUTO_BATCH_TIMEOUT));
if (time_out != config.end())
_timeOut = ParseTimeoutValue(time_out->second.as<std::string>());
}
AutoBatchExecutableNetwork::~AutoBatchExecutableNetwork() {
_terminate = true;
for (auto w : _workerRequests) {
w->_thread.join();
}
_workerRequests.clear();
}
unsigned int AutoBatchExecutableNetwork::ParseTimeoutValue(const std::string& s) {
auto val = std::stoi(s);
if (val < 0)
IE_THROW(ParameterMismatch) << "Value for the " << CONFIG_KEY(AUTO_BATCH_TIMEOUT) << " should be unsigned int";
return val;
}
std::shared_ptr<InferenceEngine::RemoteContext> AutoBatchExecutableNetwork::GetContext() const {
return _network->GetContext();
}
InferenceEngine::IInferRequestInternal::Ptr AutoBatchExecutableNetwork::CreateInferRequestImpl(
InferenceEngine::InputsDataMap networkInputs,
InferenceEngine::OutputsDataMap networkOutputs) {
auto workerRequestPtrAndId = GetWorkerInferRequest();
return std::make_shared<AutoBatchInferRequest>(networkInputs,
networkOutputs,
workerRequestPtrAndId.first,
workerRequestPtrAndId.second,
_device.batchForDevice,
_needPerfCounters);
}
InferenceEngine::IInferRequestInternal::Ptr AutoBatchExecutableNetwork::CreateInferRequestImpl(
const std::vector<std::shared_ptr<const ov::Node>>& inputs,
const std::vector<std::shared_ptr<const ov::Node>>& outputs) {
if (!this->_plugin)
return nullptr;
const auto& core = _plugin->GetCore();
if (!core || !core->isNewAPI())
return nullptr;
auto workerRequestPtrAndId = GetWorkerInferRequest();
return std::make_shared<AutoBatchInferRequest>(inputs,
outputs,
workerRequestPtrAndId.first,
workerRequestPtrAndId.second,
_device.batchForDevice,
_needPerfCounters);
}
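// Requests are assigned to workers round-robin: every _device.batchForDevice-th request creates a new worker
// that owns a batched infer request, a completion callback, and a collector thread. The thread starts the
// batched inference once a full batch of tasks has arrived, or falls back to batch1 execution on timeout.
// Returns the current worker and the slot (batch_id) of the new request within it.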
std::pair<AutoBatchExecutableNetwork::WorkerInferRequest&, int> AutoBatchExecutableNetwork::GetWorkerInferRequest() {
auto num = _numRequestsCreated++;
std::lock_guard<std::mutex> lock(_workerRequestsMutex);
auto batch_id = num % _device.batchForDevice;
if (!batch_id) { // need new request
_workerRequests.push_back(std::make_shared<WorkerInferRequest>());
auto workerRequestPtr = _workerRequests.back().get();
workerRequestPtr->_inferRequestBatched = {_network->CreateInferRequest(), _network._so};
workerRequestPtr->_batchSize = _device.batchForDevice;
workerRequestPtr->_completionTasks.resize(workerRequestPtr->_batchSize);
workerRequestPtr->_inferRequestBatched->SetCallback(
[workerRequestPtr, this](std::exception_ptr exceptionPtr) mutable {
if (exceptionPtr)
workerRequestPtr->_exceptionPtr = exceptionPtr;
IE_ASSERT(workerRequestPtr->_completionTasks.size() == (size_t)workerRequestPtr->_batchSize);
// notify the individual requests on the completion
for (int c = 0; c < workerRequestPtr->_batchSize; c++) {
workerRequestPtr->_completionTasks[c]();
}
// reset the timeout
workerRequestPtr->_cond.notify_one();
});
workerRequestPtr->_thread = std::thread([workerRequestPtr, this] {
while (1) {
std::cv_status status;
{
std::unique_lock<std::mutex> lock(workerRequestPtr->_mutex);
status = workerRequestPtr->_cond.wait_for(lock, std::chrono::milliseconds(_timeOut));
}
if (_terminate) {
break;
} else {
// as we pop the tasks from the queue only here
// it is ok to call size() (as the _tasks can only grow in parallel)
const int sz = workerRequestPtr->_tasks.size();
if (sz == workerRequestPtr->_batchSize) {
std::pair<AutoBatchAsyncInferRequest*, InferenceEngine::Task> t;
for (int n = 0; n < sz; n++) {
IE_ASSERT(workerRequestPtr->_tasks.try_pop(t));
workerRequestPtr->_completionTasks[n] = std::move(t.second);
t.first->_inferRequest->CopyInputsIfNeeded();
}
workerRequestPtr->_inferRequestBatched->StartAsync();
} else if ((status == std::cv_status::timeout) && sz) {
// the timeout to collect the batch is over, so the requests have to be executed in the batch1 mode
std::pair<AutoBatchAsyncInferRequest*, InferenceEngine::Task> t;
// pop all the tasks collected by the moment of the time-out and execute each of them in the batch1 mode
std::atomic<int> arrived = {0};
std::promise<void> all_completed;
auto all_completed_future = all_completed.get_future();
for (int n = 0; n < sz; n++) {
IE_ASSERT(workerRequestPtr->_tasks.try_pop(t));
t.first->_inferRequestWithoutBatch->SetCallback(
[t, sz, &arrived, &all_completed](std::exception_ptr p) {
if (p)
t.first->_inferRequest->_exceptionPtr = p;
t.second();
if (sz == ++arrived)
all_completed.set_value();
});
t.first->_inferRequest->SetBlobsToAnotherRequest(t.first->_inferRequestWithoutBatch);
t.first->_inferRequestWithoutBatch->StartAsync();
}
all_completed_future.get();
// now when all the tasks for this batch are completed, start waiting for the timeout again
}
}
}
});
}
return {*_workerRequests.back(), batch_id};
}
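// Creates the user-facing async request: a sync request that shares blobs with a worker's batched request,
// plus a standalone batch1 request on the underlying network that serves as a fallback when a batch cannot be filled.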
InferenceEngine::IInferRequestInternal::Ptr AutoBatchExecutableNetwork::CreateInferRequest() {
IInferRequestInternal::Ptr syncRequestImpl;
if (this->_plugin) {
const auto& core = _plugin->GetCore();
if (core && core->isNewAPI())
syncRequestImpl = CreateInferRequestImpl(_parameters, _results);
}
if (!syncRequestImpl)
syncRequestImpl = CreateInferRequestImpl(_networkInputs, _networkOutputs);
syncRequestImpl->setPointerToExecutableNetworkInternal(shared_from_this());
InferenceEngine::SoIInferRequestInternal inferRequestWithoutBatch = {_networkWithoutBatch->CreateInferRequest(),
_networkWithoutBatch._so};
return std::make_shared<AutoBatchAsyncInferRequest>(
std::static_pointer_cast<AutoBatchInferRequest>(syncRequestImpl),
_needPerfCounters,
inferRequestWithoutBatch,
_callbackExecutor);
}
std::shared_ptr<ngraph::Function> AutoBatchExecutableNetwork::GetExecGraphInfo() {
return _network->GetExecGraphInfo() ? _network->GetExecGraphInfo() : _networkWithoutBatch->GetExecGraphInfo();
}
void AutoBatchExecutableNetwork::SetConfig(const std::map<std::string, InferenceEngine::Parameter>& config) {
auto timeout = config.find(CONFIG_KEY(AUTO_BATCH_TIMEOUT));
if (timeout == config.end() || config.size() > 1) {
IE_THROW() << "The only config that can be changed on the fly for the AutoBatching the is the "
<< CONFIG_KEY(AUTO_BATCH_TIMEOUT);
} else {
_timeOut = ParseTimeoutValue(timeout->second.as<std::string>());
}
}
InferenceEngine::Parameter AutoBatchExecutableNetwork::GetConfig(const std::string& name) const {
auto it = _config.find(name);
if (it != _config.end()) {
return it->second;
} else {
// find config key among networks config keys
auto param = _network->GetMetric(METRIC_KEY(SUPPORTED_CONFIG_KEYS));
for (auto&& configKey : param.as<std::vector<std::string>>()) {
if (configKey == name) {
return _network->GetConfig(configKey);
}
}
IE_THROW(NotFound) << name << " not found in the ExecutableNetwork config";
}
}
InferenceEngine::Parameter AutoBatchExecutableNetwork::GetMetric(const std::string& name) const {
if (name == METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS)) {
auto reqs = 0;
try {
auto hint = _network->GetConfig(CONFIG_KEY(PERFORMANCE_HINT_NUM_REQUESTS)).as<std::string>();
reqs = InferenceEngine::PerfHintsConfig::CheckPerformanceHintRequestValue(hint);
if (!reqs) // no limitations from user, let's deduce the full blown #requests
// (multiplied by the devices capabilities to run multiple <batched> requests for further perf)
reqs = _device.batchForDevice *
_network->GetMetric(METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS)).as<unsigned int>();
} catch (const InferenceEngine::Exception& iie) {
}
reqs = std::max(reqs, _device.batchForDevice); // round up to the possible user's value
IE_SET_METRIC_RETURN(OPTIMAL_NUMBER_OF_INFER_REQUESTS, reqs);
} else if (name == METRIC_KEY(NETWORK_NAME)) {
IE_SET_METRIC_RETURN(NETWORK_NAME, _network->GetMetric(METRIC_KEY(NETWORK_NAME)).as<std::string>());
} else if (name == METRIC_KEY(SUPPORTED_METRICS)) {
IE_SET_METRIC_RETURN(SUPPORTED_METRICS,
{METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS),
METRIC_KEY(SUPPORTED_METRICS),
METRIC_KEY(NETWORK_NAME),
METRIC_KEY(SUPPORTED_CONFIG_KEYS)});
} else if (name == METRIC_KEY(SUPPORTED_CONFIG_KEYS)) {
IE_SET_METRIC_RETURN(SUPPORTED_CONFIG_KEYS,
{CONFIG_KEY(AUTO_BATCH_TIMEOUT)}); // only timeout can be changed on the fly
} else {
IE_THROW() << "Unsupported Network metric: " << name;
}
}
// ------------------------------AutoBatchInferencePlugin----------------------------
namespace {
std::map<std::string, std::string> mergeConfigs(std::map<std::string, std::string> config,
const std::map<std::string, std::string>& local) {
for (auto&& kvp : local) {
config[kvp.first] = kvp.second;
}
return config;
}
} // namespace
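// Parses the "device(batch)" notation, e.g. the "GPU(4)" part of "BATCH:GPU(4)" yields deviceName "GPU" and
// batch 4; when no batch is specified the value stays 0 and is deduced later from the OPTIMAL_BATCH_SIZE metric
// and the available device memory.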
DeviceInformation AutoBatchInferencePlugin::ParseBatchDevice(const std::string& deviceWithBatch) {
auto&& d = deviceWithBatch;
auto openingBracket = d.find_first_of('(');
auto closingBracket = d.find_first_of(')', openingBracket);
auto deviceName = d.substr(0, openingBracket);
int batch = 0;
if (closingBracket != std::string::npos && openingBracket < closingBracket) {
batch = std::stol(d.substr(openingBracket + 1, closingBracket - openingBracket - 1));
if (batch <= 0) {
IE_THROW() << "Batch value for '" << deviceName << "' must be > 0, while " << batch << " is passed";
}
}
return {deviceName, {{}}, batch};
}
DeviceInformation AutoBatchInferencePlugin::ParseMetaDevice(const std::string& devicesBatchCfg,
const std::map<std::string, std::string>& config) const {
auto getDeviceConfig = [&](const DeviceName& deviceWithID) {
DeviceIDParser deviceParser(deviceWithID);
std::string deviceName = deviceParser.getDeviceName();
std::map<std::string, std::string> tconfig = mergeConfigs(_config, config);
// set device ID if any
std::string deviceIDLocal = deviceParser.getDeviceID();
if (!deviceIDLocal.empty()) {
tconfig[PluginConfigParams::KEY_DEVICE_ID] = deviceIDLocal;
}
return GetCore()->GetSupportedConfig(deviceName, tconfig);
};
auto metaDevice = ParseBatchDevice(devicesBatchCfg);
metaDevice.config = getDeviceConfig(metaDevice.deviceName);
auto cfg = config;
// check that no irrelevant config-keys left
for (auto k : config) {
const auto& name = k.first;
auto found_in_supported_cfg = std::find(supported_configKeys.begin(), supported_configKeys.end(), k.first);
auto found_in_device_cfg = metaDevice.config.find(k.first);
if (found_in_device_cfg == metaDevice.config.end() && found_in_supported_cfg == supported_configKeys.end()) {
IE_THROW() << "Unsupported config key: " << name;
}
}
return metaDevice;
}
RemoteContext::Ptr AutoBatchInferencePlugin::CreateContext(const InferenceEngine::ParamMap& config) {
auto cfg = config;
auto it = cfg.find(CONFIG_KEY(AUTO_BATCH_DEVICE_CONFIG));
if (it == cfg.end())
IE_THROW() << "Value for KEY_AUTO_BATCH is not set";
auto val = it->second;
auto metaDevice = ParseMetaDevice(val, std::map<std::string, std::string>());
cfg.erase(it);
return GetCore()->CreateContext(metaDevice.deviceName, cfg);
}
Parameter AutoBatchInferencePlugin::GetConfig(const std::string& name,
const std::map<std::string, Parameter>& options) const {
if (supported_configKeys.end() != std::find(supported_configKeys.begin(), supported_configKeys.end(), name)) {
auto it = _config.find(name);
if (it == _config.end()) {
IE_THROW() << "Value for " << name << " is not set";
} else {
return {it->second};
}
} else {
IE_THROW() << "Unsupported config key: " << name;
}
}
void AutoBatchInferencePlugin::CheckConfig(const std::map<std::string, std::string>& config) {
for (auto&& kvp : config) {
const auto name = kvp.first;
const auto val = kvp.second;
if (supported_configKeys.end() == std::find(supported_configKeys.begin(), supported_configKeys.end(), name))
IE_THROW() << "Unsupported config key: " << name;
if (name == CONFIG_KEY(AUTO_BATCH_DEVICE_CONFIG)) {
ParseBatchDevice(val);
} else if (name == CONFIG_KEY(AUTO_BATCH_TIMEOUT)) {
try {
auto t = std::stoi(val);
if (t < 0)
IE_THROW(ParameterMismatch);
} catch (const std::exception& e) {
IE_THROW(ParameterMismatch)
<< " Expecting unsigned int value for " << CONFIG_KEY(AUTO_BATCH_TIMEOUT) << " got " << val;
}
}
}
}
void AutoBatchInferencePlugin::SetConfig(const std::map<std::string, std::string>& config) {
CheckConfig(config);
for (auto&& kvp : config) {
_config[kvp.first] = kvp.second;
}
}
static const Version version = {{2, 1}, CI_BUILD_NUMBER, "AutoBatchPlugin"};
IE_DEFINE_PLUGIN_CREATE_FUNCTION(AutoBatchInferencePlugin, version)
AutoBatchInferencePlugin::AutoBatchInferencePlugin() {
_pluginName = "BATCH";
}
InferenceEngine::Parameter AutoBatchInferencePlugin::GetMetric(
const std::string& name,
const std::map<std::string, InferenceEngine::Parameter>& options) const {
if (name == METRIC_KEY(SUPPORTED_METRICS)) {
std::vector<std::string> metrics;
metrics.push_back(METRIC_KEY(SUPPORTED_METRICS));
metrics.push_back(METRIC_KEY(FULL_DEVICE_NAME));
metrics.push_back(METRIC_KEY(SUPPORTED_CONFIG_KEYS));
IE_SET_METRIC_RETURN(SUPPORTED_METRICS, metrics);
} else if (name == METRIC_KEY(FULL_DEVICE_NAME)) {
IE_SET_METRIC_RETURN(FULL_DEVICE_NAME, _pluginName);
} else if (name == METRIC_KEY(SUPPORTED_CONFIG_KEYS)) {
IE_SET_METRIC_RETURN(SUPPORTED_CONFIG_KEYS, supported_configKeys);
} else {
IE_THROW(NotFound) << "Unsupported metric key " << name;
}
}
IExecutableNetworkInternal::Ptr AutoBatchInferencePlugin::LoadExeNetworkImpl(
const InferenceEngine::CNNNetwork& network,
const std::map<std::string, std::string>& config) {
return LoadNetworkImpl(network, nullptr, config);
}
InferenceEngine::IExecutableNetworkInternal::Ptr AutoBatchInferencePlugin::LoadNetworkImpl(
const InferenceEngine::CNNNetwork& network,
const std::shared_ptr<InferenceEngine::RemoteContext> ctx,
const std::map<std::string, std::string>& config) {
if (GetCore() == nullptr) {
IE_THROW() << "Please, work with Auto-Batching device via InferencEngine::Core object";
}
auto fullConfig = mergeConfigs(_config, config);
auto device_batch = fullConfig.find(CONFIG_KEY(AUTO_BATCH_DEVICE_CONFIG));
if (device_batch == fullConfig.end()) {
IE_THROW() << "KEY_AUTO_BATCH key is not set for BATCH device";
}
auto metaDevice = ParseMetaDevice(device_batch->second, fullConfig);
const auto& deviceName = metaDevice.deviceName;
const auto& deviceConfig = metaDevice.config;
auto config_without_autobatch = config, deviceConfigNoAutoBatch = deviceConfig;
// avoid recursive auto-batching
config_without_autobatch[CONFIG_KEY(ALLOW_AUTO_BATCHING)] = CONFIG_VALUE(NO);
deviceConfigNoAutoBatch[CONFIG_KEY(ALLOW_AUTO_BATCHING)] = CONFIG_VALUE(NO);
auto function = network.getFunction();
// check that the auto-batching is applicable in general
try {
// do not reshape/re-batch originally batched networks and when there are no inputs with the N* layouts
// the below code is a placeholder for the WIP (22.1) functionality
// that will check the reshaping by the batch is robust (CVS-51744)
const InputsDataMap inputInfo = network.getInputsInfo();
bool atLeastOneInputIsBatched = false;
for (const InputsDataMap::value_type& item : inputInfo) {
auto layout = item.second->getTensorDesc().getLayout();
if (layout == InferenceEngine::Layout::NC || layout == InferenceEngine::Layout::NCDHW ||
layout == InferenceEngine::Layout::NCHW || layout == InferenceEngine::Layout::NHWC ||
layout == InferenceEngine::Layout::NDHWC) {
if (1 != item.second->getTensorDesc().getDims()[0]) // do not reshape/re-batch batched networks
IE_THROW(NotImplemented) << "Auto-batching does not reshape/re-batch originally batched networks!";
else
atLeastOneInputIsBatched = true;
}
}
bool atLeastOneOutputIsBatched = false;
const OutputsDataMap outputInfo = network.getOutputsInfo();
for (const OutputsDataMap::value_type& item : outputInfo) {
auto layout = item.second->getTensorDesc().getLayout();
if (layout == InferenceEngine::Layout::NC || layout == InferenceEngine::Layout::NCDHW ||
layout == InferenceEngine::Layout::NCHW || layout == InferenceEngine::Layout::NHWC ||
layout == InferenceEngine::Layout::NDHWC) {
if (1 != item.second->getTensorDesc().getDims()[0]) // do not reshape/re-batch batched networks
IE_THROW(NotImplemented) << "Auto-batching does not reshape/re-batch originally batched networks!";
else
atLeastOneOutputIsBatched = true;
}
}
if (!atLeastOneInputIsBatched || !atLeastOneOutputIsBatched)
IE_THROW(NotImplemented)
<< "Auto-batching supports only networks featuring inputs/outputs with the batched layouts !";
} catch (...) {
// fallback to loading as if no Auto-Batching was involved
auto res = GetCore()->LoadNetwork(network, deviceName, deviceConfigNoAutoBatch);
_additionalSOPtrs.push_back(res._so);
return res._ptr;
}
if (!metaDevice.batchForDevice) {
unsigned int requests = 0;
// batch size is not set explicitly via device name e.g. BATCH:GPU(4)
// let's query the optimal batch size
std::map<std::string, InferenceEngine::Parameter> options;
options["MODEL_PTR"] = std::const_pointer_cast<ngraph::Function>(network.getFunction());
auto optBatchSize =
GetCore()->GetMetric(deviceName, METRIC_KEY(OPTIMAL_BATCH_SIZE), options).as<unsigned int>();
auto res = GetCore()->GetConfig(deviceName, CONFIG_KEY(PERFORMANCE_HINT_NUM_REQUESTS)).as<std::string>();
requests = PerfHintsConfig::CheckPerformanceHintRequestValue(res);
const auto& reqs = config.find(CONFIG_KEY(PERFORMANCE_HINT_NUM_REQUESTS));
if (reqs != config.end())
requests = static_cast<unsigned int>(PerfHintsConfig::CheckPerformanceHintRequestValue(reqs->second));
if (requests)
optBatchSize = std::max(1u, std::min(requests, optBatchSize));
metaDevice.batchForDevice = optBatchSize;
}
const auto perfConfig = fullConfig.find(PluginConfigParams::KEY_PERF_COUNT);
const auto perfConfigInTargetPlugin =
GetCore()->GetConfig(deviceName, PluginConfigParams::KEY_PERF_COUNT).as<std::string>() ==
PluginConfigParams::YES;
const bool enablePerfCounters = perfConfigInTargetPlugin || ((fullConfig.end() != perfConfig) &&
(perfConfig->second == PluginConfigParams::YES));
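// Estimate how many batch1 copies of the network fit into the device memory (GPU only): measure the memory
// footprint before and after loading the batch1 network, then cap the batch size by the remaining memory,
// rounded down to the nearest power of two.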
auto report_footprint = [](std::shared_ptr<ICore> pCore, std::string device) -> size_t {
size_t footprint = 0;
// TODO: use the per-network metric (22.2) rather than plugin-level
auto stats =
pCore->GetMetric(device, ov::intel_gpu::memory_statistics.name()).as<std::map<std::string, uint64_t>>();
for (auto s : stats)
if (s.first.find("_current") != std::string::npos)
footprint += s.second;
return footprint;
};
size_t batch1_footprint = 0;
if (deviceName.find("GPU") != std::string::npos)
batch1_footprint = report_footprint(GetCore(), deviceName);
auto executableNetworkWithoutBatch = ctx ? GetCore()->LoadNetwork(network, ctx, deviceConfigNoAutoBatch)
: GetCore()->LoadNetwork(network, deviceName, deviceConfigNoAutoBatch);
if (deviceName.find("GPU") != std::string::npos) {
batch1_footprint = report_footprint(GetCore(), deviceName) - batch1_footprint;
if (batch1_footprint) {
const uint64_t total_mem = GetCore()->GetMetric(deviceName, GPU_METRIC_KEY(DEVICE_TOTAL_MEM_SIZE));
const int estimated_batch = (total_mem - batch1_footprint) / batch1_footprint;
int closest = pow(2, floor(log(estimated_batch) / log(2)));
closest = std::max(1, closest);
metaDevice.batchForDevice = std::min(metaDevice.batchForDevice, closest);
}
}
// auto-batch settings
std::unordered_map<std::string, InferenceEngine::Parameter> networkConfig;
for (auto c : fullConfig) {
if (supported_configKeys.end() != std::find(supported_configKeys.begin(), supported_configKeys.end(), c.first))
networkConfig.insert(c);
}
InferenceEngine::SoExecutableNetworkInternal executableNetworkWithBatch;
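// Reshape a clone of the network so that every batched-layout input gets the chosen batch size, then load it;
// if reshaping or compilation fails for any reason, fall back to the already loaded batch1 network below.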
if (metaDevice.batchForDevice > 1) {
try {
CNNNetwork clonedNetwork(InferenceEngine::details::cloneNetwork(network));
const InputsDataMap inputInfo = clonedNetwork.getInputsInfo();
ICNNNetwork::InputShapes shapes = clonedNetwork.getInputShapes();
for (const InputsDataMap::value_type& item : inputInfo) {
auto layout = item.second->getTensorDesc().getLayout();
// the below code is a placeholder for the WIP (22.1) functionality
// that will check the reshaping by the batch is robust (CVS-51744)
if (layout == InferenceEngine::Layout::NC || layout == InferenceEngine::Layout::NCDHW ||
layout == InferenceEngine::Layout::NCHW || layout == InferenceEngine::Layout::NHWC ||
layout == InferenceEngine::Layout::NDHWC) {
assert(1 == shapes[item.first][0]); // do not reshape/re-batch originally batched networks
shapes[item.first][0] = metaDevice.batchForDevice;
}
}
clonedNetwork.reshape(shapes);
executableNetworkWithBatch =
ctx ? GetCore()->LoadNetwork(CNNNetwork{clonedNetwork}, ctx, deviceConfigNoAutoBatch)
: GetCore()->LoadNetwork(CNNNetwork{clonedNetwork}, deviceName, deviceConfigNoAutoBatch);
} catch (...) {
executableNetworkWithBatch = {nullptr, nullptr};
}
}
if (!executableNetworkWithBatch) {
executableNetworkWithBatch = executableNetworkWithoutBatch;
metaDevice.batchForDevice = 1;
}
return std::make_shared<AutoBatchExecutableNetwork>(executableNetworkWithBatch,
executableNetworkWithoutBatch,
metaDevice,
networkConfig,
enablePerfCounters);
}
InferenceEngine::IExecutableNetworkInternal::Ptr AutoBatchInferencePlugin::LoadExeNetworkImpl(
const InferenceEngine::CNNNetwork& network,
const std::shared_ptr<InferenceEngine::RemoteContext>& context,
const std::map<std::string, std::string>& config) {
return LoadNetworkImpl(network, context, config);
}
InferenceEngine::QueryNetworkResult AutoBatchInferencePlugin::QueryNetwork(
const InferenceEngine::CNNNetwork& network,
const std::map<std::string, std::string>& config) const {
auto cfg = config;
for (auto c : cfg) {
if (c.first == CONFIG_KEY(AUTO_BATCH_DEVICE_CONFIG)) {
auto val = c.second;
cfg.erase(c.first);
auto metaDevice = ParseMetaDevice(val, cfg);
return GetCore()->QueryNetwork(network, metaDevice.deviceName, cfg);
}
}
IE_THROW() << "Value for KEY_AUTO_BATCH is not set";
}
} // namespace AutoBatchPlugin