From 9357d3fd3dd4f89ad4dc86ec909836acd77451c6 Mon Sep 17 00:00:00 2001
From: myshevts
Date: Thu, 25 Mar 2021 15:01:30 +0300
Subject: [PATCH] auto-batching POC squashed (all commits from auto-batch-2021.3 branch)

(cherry picked from commit d7742f2c747bc514a126cc9a4d5b99f0ff5cbbc7)
---
 .../samples/benchmark_app/main.cpp            |  12 +-
 inference-engine/src/CMakeLists.txt           |   2 +
 .../src/auto_batch/CMakeLists.txt             |  25 +
 .../src/auto_batch/auto_batch.cpp             | 470 ++++++++++++++++++
 .../src/auto_batch/auto_batch.hpp             | 176 +++++++
 .../src/cldnn_engine/cldnn_engine.cpp         |   6 +
 .../include/ie/ie_plugin_config.hpp           |   7 +-
 inference-engine/src/plugin_api/ie_icore.hpp  |   1 +
 8 files changed, 689 insertions(+), 10 deletions(-)
 create mode 100644 inference-engine/src/auto_batch/CMakeLists.txt
 create mode 100644 inference-engine/src/auto_batch/auto_batch.cpp
 create mode 100644 inference-engine/src/auto_batch/auto_batch.hpp

diff --git a/inference-engine/samples/benchmark_app/main.cpp b/inference-engine/samples/benchmark_app/main.cpp
index 33798d5c68f..c4a30023c70 100644
--- a/inference-engine/samples/benchmark_app/main.cpp
+++ b/inference-engine/samples/benchmark_app/main.cpp
@@ -666,14 +666,8 @@ int main(int argc, char* argv[]) {
         next_step(ss.str());
 
         // warming up - out of scope
-        auto inferRequest = inferRequestsQueue.getIdleRequest();
-        if (!inferRequest) {
-            IE_THROW() << "No idle Infer Requests!";
-        }
-        if (FLAGS_api == "sync") {
-            inferRequest->infer();
-        } else {
-            inferRequest->startAsync();
+        for (size_t i = 0; i < inferRequestsQueue.requests.size(); i++) {
+            inferRequestsQueue.getIdleRequest()->startAsync();
         }
         inferRequestsQueue.waitAll();
         auto duration_ms = double_to_string(inferRequestsQueue.getLatencies()[0]);
@@ -694,7 +688,7 @@
         while ((niter != 0LL && iteration < niter) ||
                (duration_nanoseconds != 0LL && (uint64_t)execTime < duration_nanoseconds) ||
                (FLAGS_api == "async" && iteration % nireq != 0)) {
-            inferRequest = inferRequestsQueue.getIdleRequest();
+            auto inferRequest = inferRequestsQueue.getIdleRequest();
             if (!inferRequest) {
                 IE_THROW() << "No idle Infer Requests!";
             }
diff --git a/inference-engine/src/CMakeLists.txt b/inference-engine/src/CMakeLists.txt
index 8b198bfbf28..138aab5735b 100644
--- a/inference-engine/src/CMakeLists.txt
+++ b/inference-engine/src/CMakeLists.txt
@@ -32,6 +32,8 @@ add_subdirectory(hetero_plugin)
 
 add_subdirectory(multi_device)
 
+add_subdirectory(auto_batch)
+
 add_subdirectory(transformations)
 
 add_subdirectory(inference_engine)
diff --git a/inference-engine/src/auto_batch/CMakeLists.txt b/inference-engine/src/auto_batch/CMakeLists.txt
new file mode 100644
index 00000000000..f083593fbbd
--- /dev/null
+++ b/inference-engine/src/auto_batch/CMakeLists.txt
@@ -0,0 +1,25 @@
+# Copyright (C) 2018-2020 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+set (TARGET_NAME "AutoBatchPlugin")
+
+if(ENABLE_LTO)
+    ie_enable_lto()
+endif()
+
+file(GLOB SOURCES
+        ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
+)
+
+file(GLOB HEADERS
+        ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp
+)
+
+ie_add_plugin(NAME ${TARGET_NAME}
+              DEVICE_NAME "BATCH"
+              SOURCES ${SOURCES} ${HEADERS}
+              VERSION_DEFINES_FOR auto_batch.cpp)
+
+target_link_libraries(${TARGET_NAME} PRIVATE inference_engine inference_engine_legacy)
+set_ie_threading_interface_for(${TARGET_NAME})
diff --git a/inference-engine/src/auto_batch/auto_batch.cpp b/inference-engine/src/auto_batch/auto_batch.cpp
new file mode 100644
index 00000000000..e28f4a61027
--- /dev/null
+++ b/inference-engine/src/auto_batch/auto_batch.cpp
@@ -0,0 +1,470 @@
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "ie_metric_helpers.hpp"
+#include
+#include
+#include
+#include
+#include "auto_batch.hpp"
+
+namespace AutoBatchPlugin {
+    using namespace InferenceEngine;
+
+    template <Precision::ePrecision precision>
+    Blob::Ptr create_shared_blob_on_top_of_batched_blob(Blob::Ptr batched_blob, size_t batch_id, size_t batch_num) {
+        typedef typename PrecisionTrait<precision>::value_type TYPE;
+        typedef typename std::add_pointer<TYPE>::type TYPEPTR;
+        auto ptr = batched_blob->buffer().as<TYPEPTR>();
+        auto sizePerBatch = batched_blob->size() / batch_num;
+        auto layout = batched_blob->getTensorDesc().getLayout();
+        SizeVector dims = batched_blob->getTensorDesc().getDims();
+
+        if (layout == InferenceEngine::Layout::NC || layout == InferenceEngine::Layout::NCDHW
+            || layout == InferenceEngine::Layout::NCHW || layout == InferenceEngine::Layout::NHWC
+            || layout == InferenceEngine::Layout::NDHWC) {
+            dims[0] = 1;
+            assert(batched_blob->getTensorDesc().getPrecision() == precision);
+            return make_shared_blob<TYPE>({precision, dims, batched_blob->getTensorDesc().getLayout()},
+                                          ptr + sizePerBatch * batch_id, sizePerBatch);
+        } else {
+            // same blob for all requests (e.g. constants)
+            return make_shared_blob<TYPE>({precision, dims, batched_blob->getTensorDesc().getLayout()},
+                                          ptr);
+        }
+    }
+
+// ------------------------------AutoBatchInferRequest----------------------------
+AutoBatchInferRequest::AutoBatchInferRequest(const InputsDataMap& networkInputs,
+                                             const OutputsDataMap& networkOutputs,
+                                             AutoBatchExecutableNetwork::WorkerInferRequest* workerRequestPtr,
+                                             int batch_id, int num_batch,
+                                             bool needPerfCounters)
+        : InferRequestInternal(networkInputs, networkOutputs), _workerInferRequest(workerRequestPtr),
+          _needPerfCounters(needPerfCounters) {
+    // Allocate all input blobs
+    for (const auto &it : networkInputs) {
+        auto blob = workerRequestPtr->_inferRequest.GetBlob(it.first);
+        Blob::Ptr res;
+        switch (it.second->getTensorDesc().getPrecision()) {
+            case InferenceEngine::Precision::FP32:
+                res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::FP32>
+                        (workerRequestPtr->_inferRequest.GetBlob(it.first), batch_id, num_batch);
+                break;
+            case InferenceEngine::Precision::I32:
+                res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I32>
+                        (workerRequestPtr->_inferRequest.GetBlob(it.first), batch_id, num_batch);
+                break;
+            case InferenceEngine::Precision::I8:
+                res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I8>
+                        (workerRequestPtr->_inferRequest.GetBlob(it.first), batch_id, num_batch);
+                break;
+            case InferenceEngine::Precision::U16:
+                res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::U16>
+                        (workerRequestPtr->_inferRequest.GetBlob(it.first), batch_id, num_batch);
+                break;
+            case InferenceEngine::Precision::I16:
+                res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I16>
+                        (workerRequestPtr->_inferRequest.GetBlob(it.first), batch_id, num_batch);
+                break;
+            case InferenceEngine::Precision::U8:
+            case InferenceEngine::Precision::BOOL:
+                res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::U8>
+                        (workerRequestPtr->_inferRequest.GetBlob(it.first), batch_id, num_batch);
+                break;
+            default:
+                THROW_IE_EXCEPTION << "Unsupported input precision " << it.second->getTensorDesc().getPrecision();
+        }
+        _inputs[it.first] = res;
+    }
+    // Allocate all output blobs
+    for (const auto &it : networkOutputs) {
+        auto blob = workerRequestPtr->_inferRequest.GetBlob(it.first);
+        Blob::Ptr res;
+        switch (it.second->getTensorDesc().getPrecision()) {
+            case InferenceEngine::Precision::FP32:
+                res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::FP32>
+                        (workerRequestPtr->_inferRequest.GetBlob(it.first), batch_id, num_batch);
+                break;
+            case InferenceEngine::Precision::I32:
+                res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I32>
+                        (workerRequestPtr->_inferRequest.GetBlob(it.first), batch_id, num_batch);
+                break;
+            case InferenceEngine::Precision::I8:
+                res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I8>
+                        (workerRequestPtr->_inferRequest.GetBlob(it.first), batch_id, num_batch);
+                break;
+            case InferenceEngine::Precision::U16:
+                res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::U16>
+                        (workerRequestPtr->_inferRequest.GetBlob(it.first), batch_id, num_batch);
+                break;
+            case InferenceEngine::Precision::I16:
+                res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I16>
+                        (workerRequestPtr->_inferRequest.GetBlob(it.first), batch_id, num_batch);
+                break;
+            case InferenceEngine::Precision::U8:
+            case InferenceEngine::Precision::BOOL:
+                res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::U8>
+                        (workerRequestPtr->_inferRequest.GetBlob(it.first), batch_id, num_batch);
+                break;
+            default:
+                THROW_IE_EXCEPTION << "Unsupported output precision " << it.second->getTensorDesc().getPrecision();
+        }
+        _outputs[it.first] = res;
+    }
+}
+
+void AutoBatchInferRequest::SetBlobsToAnotherRequest(InferRequest& req) {
+    // todo: call SetBlob for REMOTE BLOBs
+}
+
+std::map<std::string, InferenceEngineProfileInfo> AutoBatchInferRequest::GetPerformanceCounts() const {
+    return _perfMap;
+}
+
+void AutoBatchInferRequest::InferImpl() {
+    auto _event = _workerInferRequest->_event;
+    auto numReady = ++_workerInferRequest->_numRequestsReady;
+    if (numReady == _workerInferRequest->_batchSize) {
+        _workerInferRequest->_numRequestsReady = 0;
+        _workerInferRequest->_inferRequest.StartAsync();
+    }
+    _event.get();
+    if (_needPerfCounters) {
+        _perfMap = _workerInferRequest->_inferRequest.GetPerformanceCounts();
+    }
+}
+
+AutoBatchAsyncInferRequest::AutoBatchAsyncInferRequest(
+    const AutoBatchInferRequest::Ptr& inferRequest,
+    const bool needPerfCounters,
+    const AutoBatchExecutableNetwork::Ptr& autoBatchExecutableNetwork,
+    const ITaskExecutor::Ptr& callbackExecutor) :
+        AsyncInferRequestThreadSafeDefault(inferRequest,
+                                           std::make_shared<CPUStreamsExecutor>(
+                                               IStreamsExecutor::Config{"AutoBatch", 1, 1,
+                                                   IStreamsExecutor::ThreadBindingType::NONE, 1, 0, 1}),
+                                           callbackExecutor),
+        _AutoBatchExecutableNetwork{autoBatchExecutableNetwork},
+        _inferRequest{inferRequest} {
+}
+
+void AutoBatchAsyncInferRequest::Infer_ThreadUnsafe() {
+    InferUsingAsync();
+}
+
+AutoBatchAsyncInferRequest::~AutoBatchAsyncInferRequest() {
+    StopAndWait();
+}
+
+// ------------------------------AutoBatchExecutableNetwork----------------------------
+AutoBatchExecutableNetwork::AutoBatchExecutableNetwork(const InferenceEngine::ExecutableNetwork& networkForDevice,
+                                                       const DeviceInformation& networkDevice,
+                                                       const std::unordered_map<std::string, std::string>& config,
+                                                       const bool needPerfCounters) :
+    InferenceEngine::ExecutableNetworkThreadSafeDefault(
+        nullptr,
+        std::make_shared<InferenceEngine::ImmediateExecutor>()),
+    _device{networkDevice},
+    _network{networkForDevice},
+    _config{config},
+    _needPerfCounters{needPerfCounters} {
+}
+
+AutoBatchExecutableNetwork::~AutoBatchExecutableNetwork() {
+//    {
+//        std::lock_guard lock(_mutex);
+//        _device = {};
+//    }
+    _terminate = true;
+    /* NOTE: The only threads that use the `AutoBatchExecutableNetwork` context are those used by the worker infer requests.
+     * But the AsyncInferRequest destructor waits for all asynchronous tasks that are used by the request
+     */
+    _workerRequests.clear();
+}
+
+InferenceEngine::InferRequestInternal::Ptr AutoBatchExecutableNetwork::CreateInferRequestImpl(InferenceEngine::InputsDataMap networkInputs,
+                                                                                              InferenceEngine::OutputsDataMap networkOutputs) {
+    // todo: guard request creation from another thread/on-the-fly
+    auto num = _numRequestsCreated++;
+    auto batch_id = num % _device.batchForDevice;
+    if (!batch_id) {  // need a new worker request
+        _workerRequests.push_back(std::make_shared<WorkerInferRequest>());
+        auto workerRequestPtr = _workerRequests.back();
+        workerRequestPtr->_inferRequest = _network.CreateInferRequest();
+        workerRequestPtr->_batchSize = _device.batchForDevice;
+        workerRequestPtr->_cond = std::promise<void>();
+        workerRequestPtr->_event = workerRequestPtr->_cond.get_future().share();
+        // _idleWorkerRequests.push(workerRequestPtr);
+        workerRequestPtr->_inferRequest.SetCompletionCallback<std::function<void(InferRequest, StatusCode)>>(
+            [workerRequestPtr, this] (InferRequest, StatusCode status) mutable {
+                workerRequestPtr->_status = status;
+                auto signal = std::move(workerRequestPtr->_cond);
+                // reset the promise/future for the next use
+                workerRequestPtr->_cond = std::promise<void>();
+                workerRequestPtr->_event = workerRequestPtr->_cond.get_future().share();
+                signal.set_value();
+            });
+    }
+    return std::make_shared<AutoBatchInferRequest>(networkInputs, networkOutputs, _workerRequests.back().get(),
+                                                   batch_id, _device.batchForDevice, _needPerfCounters);
+}
+
+InferenceEngine::IInferRequest::Ptr AutoBatchExecutableNetwork::CreateInferRequest() {
+    auto syncRequestImpl = CreateInferRequestImpl(_networkInputs, _networkOutputs);
+    syncRequestImpl->setPointerToExecutableNetworkInternal(shared_from_this());
+    auto asyncThreadSafeImpl = std::make_shared<AutoBatchAsyncInferRequest>(std::static_pointer_cast<AutoBatchInferRequest>(syncRequestImpl),
+                                                                            _needPerfCounters,
+                                                                            std::static_pointer_cast<AutoBatchExecutableNetwork>(shared_from_this()),
+                                                                            _callbackExecutor);
+    IInferRequest::Ptr asyncRequest;
+    asyncRequest.reset(new InferRequestBase(asyncThreadSafeImpl), [](IInferRequest* p) { p->Release(); });
+    asyncThreadSafeImpl->SetPointerToPublicInterface(asyncRequest);
+    return asyncRequest;
+}
+
+void AutoBatchExecutableNetwork::SetConfig(const std::map<std::string, std::string> &config) {
+    // TODO
+    THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str;
+}
+
+InferenceEngine::Parameter AutoBatchExecutableNetwork::GetConfig(const std::string &name) const {
+    auto res = _config.find(name);
+    if (res != _config.end()) {
+        return res->second;
+    } else {
+        THROW_IE_EXCEPTION << NOT_FOUND_str << name << " not found in the ExecutableNetwork config";
+    }
+}
+
+InferenceEngine::Parameter AutoBatchExecutableNetwork::GetMetric(const std::string &name) const {
+    if (name == METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS)) {
+        unsigned int res = 0u;
+        try {
+            res = _network.GetMetric(METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS)).as<unsigned int>();
+        } catch (const details::InferenceEngineException &iie) {
+            THROW_IE_EXCEPTION
+                << "Every device used with Auto-Batching should "
+                << "support the OPTIMAL_NUMBER_OF_INFER_REQUESTS ExecutableNetwork metric. "
" + << "Failed to query the metric for the " + << _network.GetMetric(METRIC_KEY(FULL_DEVICE_NAME)).as() + << " with error:" << iie.what(); + } + IE_SET_METRIC_RETURN(OPTIMAL_NUMBER_OF_INFER_REQUESTS, res * _device.batchForDevice); + } else if (name == METRIC_KEY(NETWORK_NAME)) { + IE_SET_METRIC_RETURN(NETWORK_NAME, _network.GetMetric( + METRIC_KEY(NETWORK_NAME)).as()); + } else if (name == METRIC_KEY(SUPPORTED_METRICS)) { + IE_SET_METRIC_RETURN(SUPPORTED_METRICS, { + METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS), + METRIC_KEY(SUPPORTED_METRICS), + METRIC_KEY(NETWORK_NAME), + METRIC_KEY(SUPPORTED_CONFIG_KEYS) + }); + } else if (name == METRIC_KEY(SUPPORTED_CONFIG_KEYS)) { + std::vector configKeys = { CONFIG_KEY(AUTO_BATCH) }; + IE_SET_METRIC_RETURN(SUPPORTED_CONFIG_KEYS, configKeys); + } else { + THROW_IE_EXCEPTION << "Unsupported Network metric: " << name; + } +} + +// ------------------------------AutoBatchInferencePlugin---------------------------- + +namespace { + +std::map mergeConfigs(std::map config, + const std::map & local) { + for (auto && kvp : local) { + config[kvp.first] = kvp.second; + } + return config; +} + +} // namespace + +std::map AutoBatchInferencePlugin::GetSupportedConfig( + const std::map & config, const std::string & deviceName) const { + std::vector supportedConfigKeys = GetCore()->GetMetric(deviceName, METRIC_KEY(SUPPORTED_CONFIG_KEYS)); + std::map supportedConfig; + for (auto&& key : supportedConfigKeys) { + auto itKey = config.find(key); + if (config.end() != itKey) { + supportedConfig[key] = itKey->second; + } + } + return supportedConfig; +} + +DeviceInformation AutoBatchInferencePlugin::ParseMetaDevice(const std::string& devicesBatchCfg, + const std::map & config) const { + DeviceInformation metaDevice; + auto getDeviceConfig = [&] (const DeviceName & deviceWithID) { + DeviceIDParser deviceParser(deviceWithID); + std::string deviceName = deviceParser.getDeviceName(); + std::map tconfig = mergeConfigs(_config, config); + + // set device ID if any + std::string deviceIDLocal = deviceParser.getDeviceID(); + if (!deviceIDLocal.empty()) { + tconfig[PluginConfigParams::KEY_DEVICE_ID] = deviceIDLocal; + } + + return GetSupportedConfig(tconfig, deviceName); + }; + + auto && d = devicesBatchCfg; + { + auto openingBracket = d.find_first_of('('); + auto closingBracket = d.find_first_of(')', openingBracket); + auto deviceName = d.substr(0, openingBracket); + + int batch = -1; + if (closingBracket != std::string::npos && openingBracket < closingBracket) { + batch = std::stol(d.substr(openingBracket + 1, closingBracket - 1)); + + if (batch <= 0) { + THROW_IE_EXCEPTION << "Batch value for '" << deviceName << "' must be > 0, while " << batch + << "is passed"; + } + } + + // create meta device + auto cfg = getDeviceConfig(deviceName); + std::vector supportedConfigKeys = GetCore()->GetMetric(deviceName, METRIC_KEY(SUPPORTED_CONFIG_KEYS)); + if (std::find(std::begin(supportedConfigKeys), std::end(supportedConfigKeys), CONFIG_KEY_INTERNAL(AGGREGATED_PLUGIN)) + != std::end(supportedConfigKeys)) { + cfg.emplace(CONFIG_KEY_INTERNAL(AGGREGATED_PLUGIN), ""); + } + metaDevice = { deviceName, cfg, batch }; + } + + return metaDevice; +} + +Parameter AutoBatchInferencePlugin::GetConfig(const std::string& name, + const std::map & options) const { + if (name == CONFIG_KEY(AUTO_BATCH)) { + auto it = _config.find(CONFIG_KEY(AUTO_BATCH)); + if (it == _config.end()) { + THROW_IE_EXCEPTION << "Value for KEY_AUTO_BATCH is not set"; + } else { + return { it->second }; + } + } else { + 
THROW_IE_EXCEPTION << "Unsupported config key: " << name; + } +} + +void AutoBatchInferencePlugin::SetConfig(const std::map & config) { + for (auto && kvp : config) { + _config[kvp.first] = kvp.second; + } +} + +static const Version version = {{2, 1}, CI_BUILD_NUMBER, "AutoBatchPlugin"}; +IE_DEFINE_PLUGIN_CREATE_FUNCTION(AutoBatchInferencePlugin, version) + +AutoBatchInferencePlugin::AutoBatchInferencePlugin() { + _pluginName = "BATCH"; +} + +InferenceEngine::Parameter AutoBatchInferencePlugin::GetMetric(const std::string& name, + const std::map & options) const { + if (name == METRIC_KEY(SUPPORTED_METRICS)) { + std::vector metrics; + metrics.push_back(METRIC_KEY(SUPPORTED_METRICS)); + metrics.push_back(METRIC_KEY(FULL_DEVICE_NAME)); + metrics.push_back(METRIC_KEY(SUPPORTED_CONFIG_KEYS)); + IE_SET_METRIC_RETURN(SUPPORTED_METRICS, metrics); + } else if (name == METRIC_KEY(FULL_DEVICE_NAME)) { + std::string name = { "BATCH" }; + IE_SET_METRIC_RETURN(FULL_DEVICE_NAME, name); + } else if (name == METRIC_KEY(SUPPORTED_CONFIG_KEYS)) { + std::vector configKeys = { + CONFIG_KEY_INTERNAL(AGGREGATED_PLUGIN)}; + IE_SET_METRIC_RETURN(SUPPORTED_CONFIG_KEYS, configKeys); + } else { + THROW_IE_EXCEPTION << "Unsupported metric key " << name; + } +} + +ExecutableNetworkInternal::Ptr AutoBatchInferencePlugin::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork&network, + const std::map& config) { + if (GetCore() == nullptr) { + THROW_IE_EXCEPTION << "Please, work with MULTI device via InferencEngine::Core object"; + } + + auto fullConfig = mergeConfigs(_config, config); + auto device_batch = fullConfig.find(CONFIG_KEY(AUTO_BATCH)); + if (device_batch == fullConfig.end()) { + THROW_IE_EXCEPTION << "KEY_AUTO_BATCH key is not set for BATCH device"; + } + + auto metaDevice = ParseMetaDevice(device_batch->second, fullConfig); + + // collect the settings that are applicable to the devices we are loading the network to + std::unordered_map networkConfig; + networkConfig.insert(*device_batch); + + ExecutableNetwork executableNetworkForDevice; + auto & deviceName = metaDevice.deviceName; + auto & deviceConfig = metaDevice.config; + // network.serialize("out_orig.xml", "out_orig.bin"); + + CNNNetwork clonedNetwork(InferenceEngine::cloneNetwork(network)); + const InputsDataMap inputInfo = clonedNetwork.getInputsInfo(); + ICNNNetwork::InputShapes shapes = clonedNetwork.getInputShapes(); + + for (const InputsDataMap::value_type &item : inputInfo) { + auto layout = item.second->getTensorDesc().getLayout(); + if (layout == InferenceEngine::Layout::NC || layout == InferenceEngine::Layout::NCDHW + || layout == InferenceEngine::Layout::NCHW || layout == InferenceEngine::Layout::NHWC + || layout == InferenceEngine::Layout::NDHWC) { + shapes[item.first][0] = metaDevice.batchForDevice; + std::cout << " reshaping the input " << item.first << " (layout " << layout << ")" << " by the batch" << std::endl; + } + } + std::cout << "Reshaped network by batch to " << metaDevice.batchForDevice << std::endl; + clonedNetwork.reshape(shapes); + // clonedNetwork.serialize("out_batch4.xml", "out_batch4.bin"); + + std::map deviceConfig0 = deviceConfig; + // deviceConfig0["DO_NOT_AUTO_BATCH"] = "TRUE"; + executableNetworkForDevice = GetCore()->LoadNetwork(CNNNetwork{clonedNetwork}, deviceName, deviceConfig0); + networkConfig.insert(deviceConfig.begin(), deviceConfig.end()); + if ((std::shared_ptr)executableNetworkForDevice == nullptr) + THROW_IE_EXCEPTION << NOT_FOUND_str << "Failed to load Executable network the device " + << "that the BATCH 
device is initialized to work with"; + + auto perfConfig = fullConfig.find(PluginConfigParams::KEY_PERF_COUNT); + bool enablePerfCounters = (fullConfig.end() != perfConfig) && (perfConfig->second == PluginConfigParams::YES); + + return std::make_shared(executableNetworkForDevice, + metaDevice, + networkConfig, + enablePerfCounters); +} + +InferenceEngine::QueryNetworkResult AutoBatchInferencePlugin::QueryNetwork(const InferenceEngine::CNNNetwork& network, + const std::map& config) const { +// THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str; + const std::map cfg; + return GetCore()->QueryNetwork(network, "CPU", cfg); +} +} // namespace AutoBatchPlugin diff --git a/inference-engine/src/auto_batch/auto_batch.hpp b/inference-engine/src/auto_batch/auto_batch.hpp new file mode 100644 index 00000000000..a09d370b57a --- /dev/null +++ b/inference-engine/src/auto_batch/auto_batch.hpp @@ -0,0 +1,176 @@ +// Copyright (C) 2018-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include "ie_iinfer_request.hpp" +#include "details/ie_exception_conversion.hpp" +#include + +#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) +# include +#endif + +namespace AutoBatchPlugin { + +using DeviceName = std::string; + +struct DeviceInformation { + DeviceName deviceName; + std::map config; + int batchForDevice; +}; + +#if ((IE_THREAD == IE_THREAD_TBB) || (IE_THREAD == IE_THREAD_TBB_AUTO)) +template +using ThreadSafeQueue = tbb::concurrent_queue; +#else +template +class ThreadSafeQueue { +public: + void push(T value) { + std::lock_guard lock(_mutex); + _queue.push(std::move(value)); + } + + bool try_pop(T& value) { + std::lock_guard lock(_mutex); + if (!_queue.empty()) { + value = std::move(_queue.front()); + _queue.pop(); + return true; + } else { + return false; + } + } + + bool empty() { + std::lock_guard lock(_mutex); + return _queue.empty(); + } + +protected: + std::queue _queue; + std::mutex _mutex; +}; +#endif + +class AutoBatchAsyncInferRequest; +class AutoBatchExecutableNetwork : public InferenceEngine::ExecutableNetworkThreadSafeDefault { +public: + using Ptr = std::shared_ptr; + struct WorkerInferRequest { + using Ptr = std::shared_ptr; + InferenceEngine::InferRequest _inferRequest; + InferenceEngine::StatusCode _status = InferenceEngine::StatusCode::OK; + int _batchSize; + std::promise _cond; + std::shared_future _event; + std::atomic_int _numRequestsReady = {0}; + void ReportArrival() { + _numRequestsReady++; + if (_numRequestsReady == _batchSize) { + _numRequestsReady = 0; + _inferRequest.StartAsync(); + } + // workerRequestPtr->_cond. 
+        }
+    };
+    using NotBusyWorkerRequests = ThreadSafeQueue<WorkerInferRequest::Ptr>;
+
+    explicit AutoBatchExecutableNetwork(const InferenceEngine::ExecutableNetwork& networkForDevice,
+                                        const DeviceInformation& networkDevices,
+                                        const std::unordered_map<std::string, std::string>& config,
+                                        const bool needPerfCounters = false);
+
+    void SetConfig(const std::map<std::string, std::string> &config) override;
+    InferenceEngine::Parameter GetConfig(const std::string &name) const override;
+    InferenceEngine::Parameter GetMetric(const std::string &name) const override;
+    InferenceEngine::IInferRequest::Ptr CreateInferRequest() override;
+    InferenceEngine::InferRequestInternal::Ptr CreateInferRequestImpl(InferenceEngine::InputsDataMap networkInputs,
+                                                                      InferenceEngine::OutputsDataMap networkOutputs) override;
+    ~AutoBatchExecutableNetwork() override;
+
+    std::atomic_bool _terminate = {false};
+    DeviceInformation _device;
+    InferenceEngine::ExecutableNetwork _network;
+    std::vector<WorkerInferRequest::Ptr> _workerRequests;
+    std::unordered_map<std::string, std::string> _config;
+    bool _needPerfCounters = false;
+    std::atomic_size_t _numRequestsCreated = {0};
+};
+
+class AutoBatchInferRequest : public InferenceEngine::InferRequestInternal {
+public:
+    using Ptr = std::shared_ptr<AutoBatchInferRequest>;
+    explicit AutoBatchInferRequest(const InferenceEngine::InputsDataMap& networkInputs,
+                                   const InferenceEngine::OutputsDataMap& networkOutputs,
+                                   AutoBatchExecutableNetwork::WorkerInferRequest* workerRequestPtr,
+                                   int batch_id, int num_batch, bool _needPerfCounters = false);
+    std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> GetPerformanceCounts() const override;
+    void InferImpl() override;
+
+    // Batch-Device impl specific: sets the data (blobs from the device request to the batched device request)
+    void SetBlobsToAnotherRequest(InferenceEngine::InferRequest& req);
+    AutoBatchExecutableNetwork::WorkerInferRequest* _workerInferRequest;
+protected:
+    std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> _perfMap;
+    bool _needPerfCounters = false;
+};
+
+class AutoBatchAsyncInferRequest : public InferenceEngine::AsyncInferRequestThreadSafeDefault {
+public:
+    using Ptr = std::shared_ptr<AutoBatchAsyncInferRequest>;
+
+    explicit AutoBatchAsyncInferRequest(const AutoBatchInferRequest::Ptr& inferRequest,
+                                        const bool needPerfCounters,
+                                        const AutoBatchExecutableNetwork::Ptr& autoBatchExecutableNetwork,
+                                        const InferenceEngine::ITaskExecutor::Ptr& callbackExecutor);
+    void Infer_ThreadUnsafe() override;
+    ~AutoBatchAsyncInferRequest() override;
+
+protected:
+    AutoBatchExecutableNetwork::Ptr _AutoBatchExecutableNetwork;
+    AutoBatchInferRequest::Ptr _inferRequest;
+};
+
+class AutoBatchInferencePlugin : public InferenceEngine::InferencePluginInternal {
+public:
+    AutoBatchInferencePlugin();
+    ~AutoBatchInferencePlugin() override = default;
+
+    InferenceEngine::ExecutableNetworkInternal::Ptr LoadExeNetworkImpl(const InferenceEngine::CNNNetwork& network,
+                                                                       const std::map<std::string, std::string>& config) override;
+
+    void SetConfig(const std::map<std::string, std::string>& config) override;
+    InferenceEngine::Parameter GetConfig(const std::string& name,
+                                         const std::map<std::string, InferenceEngine::Parameter> & options) const override;
+    InferenceEngine::QueryNetworkResult QueryNetwork(const InferenceEngine::CNNNetwork& network,
+                                                     const std::map<std::string, std::string>& config) const override;
+    InferenceEngine::Parameter GetMetric(const std::string& name,
+                                         const std::map<std::string, InferenceEngine::Parameter>& options) const override;
+
+    DeviceInformation ParseMetaDevice(const std::string & devicesBatchCfg,
+                                      const std::map<std::string, std::string> & config) const;
+
+protected:
+    std::map<std::string, std::string> GetSupportedConfig(const std::map<std::string, std::string>& config,
+                                                          const DeviceName & deviceName) const;
+};
+
+}  // namespace AutoBatchPlugin
diff --git a/inference-engine/src/cldnn_engine/cldnn_engine.cpp b/inference-engine/src/cldnn_engine/cldnn_engine.cpp
index 863c8079b6b..93fb9e59f7b 100644
--- a/inference-engine/src/cldnn_engine/cldnn_engine.cpp
+++ b/inference-engine/src/cldnn_engine/cldnn_engine.cpp
@@ -677,6 +677,12 @@ Parameter clDNNEngine::GetMetric(const std::string& name, const std::map<std::string, Parameter>& options) const {
               << static_cast<int>(device_info.gfx_ver.revision);
         }
         IE_SET_METRIC_RETURN(GPU_UARCH_VERSION, s.str());
+    } else if (name == METRIC_KEY(OPTIMAL_BATCH)) {
+        auto network = options.find("MODEL_ADDRESS")->second.as();
+        // auto transformedNetwork = CloneAndTransformNetwork(*network, _impl->m_config);
+        unsigned int batch = 8;
+        std::cout << "SELECTED BATCH: " << batch << std::endl;
+        IE_SET_METRIC_RETURN(OPTIMAL_BATCH, batch);
     } else if (name == METRIC_KEY(FULL_DEVICE_NAME)) {
         auto deviceName = StringRightTrim(device_info.dev_name, "NEO", false);
         deviceName += std::string(" (") + (device_info.dev_type == cldnn::device_type::discrete_gpu ? "dGPU" : "iGPU") + ")";
diff --git a/inference-engine/src/inference_engine/include/ie/ie_plugin_config.hpp b/inference-engine/src/inference_engine/include/ie/ie_plugin_config.hpp
index 09f62301f7e..be65df32a28 100644
--- a/inference-engine/src/inference_engine/include/ie/ie_plugin_config.hpp
+++ b/inference-engine/src/inference_engine/include/ie/ie_plugin_config.hpp
@@ -118,6 +118,7 @@ DECLARE_METRIC_VALUE(BATCHED_BLOB);
  * String value for metric name is "RANGE_FOR_STREAMS".
  */
 DECLARE_METRIC_KEY(RANGE_FOR_STREAMS, std::tuple<unsigned int, unsigned int>);
+DECLARE_METRIC_KEY(OPTIMAL_BATCH, unsigned int);
 
 /**
  * @brief Metric to provide a hint for a range for number of async infer requests. If device supports streams,
@@ -250,6 +251,11 @@ DECLARE_CONFIG_KEY(PERFORMANCE_HINT_NUM_REQUESTS);
 DECLARE_CONFIG_VALUE(YES);
 DECLARE_CONFIG_VALUE(NO);
 
+/**
+ * @brief Auto-batching configuration: the target device with the batch size, e.g. "GPU(4)"
+ */
+DECLARE_CONFIG_KEY(AUTO_BATCH);
+
 /**
  * @brief Limit `#threads` that are used by Inference Engine for inference on the CPU.
  */
@@ -312,7 +318,6 @@ DECLARE_CONFIG_KEY(PERF_COUNT);
  * >0 - Direct value of limit. Batch size to process is min(new batch_limit, original_batch)
  */
 DECLARE_CONFIG_KEY(DYN_BATCH_LIMIT);
-
 /**
  * @brief The key checks whether dynamic batch is enabled.
  */
diff --git a/inference-engine/src/plugin_api/ie_icore.hpp b/inference-engine/src/plugin_api/ie_icore.hpp
index d863fbded9d..155577a5d2a 100644
--- a/inference-engine/src/plugin_api/ie_icore.hpp
+++ b/inference-engine/src/plugin_api/ie_icore.hpp
@@ -169,6 +169,7 @@ public:
 
     static std::vector<std::string> getHeteroDevices(std::string fallbackDevice);
     static std::vector<std::string> getMultiDevices(std::string devicesList);
+    static std::string getBatchDevice(std::string devicesList);
 };
 
 }  // namespace InferenceEngine
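-- 
Reviewer note (after the signature delimiter, so it is not applied with the patch): a minimal usage sketch
of the new BATCH device through InferenceEngine::Core, assuming only the naming introduced above (device
name "BATCH", CONFIG_KEY(AUTO_BATCH) value of the form "<device>(<batch>)", e.g. "GPU(4)"). The model path
and the batch value are hypothetical placeholders.

    #include <ie_core.hpp>
    #include <ie_plugin_config.hpp>

    #include <iostream>
    #include <vector>

    int main() {
        InferenceEngine::Core core;
        // Hypothetical IR model path.
        auto network = core.ReadNetwork("model.xml");

        // "GPU(4)" is parsed by AutoBatchInferencePlugin::ParseMetaDevice():
        // load to GPU and gather 4 user requests into one batched request.
        auto execNet = core.LoadNetwork(network, "BATCH",
                                        {{CONFIG_KEY(AUTO_BATCH), "GPU(4)"}});

        // The plugin reports the underlying device's optimal request count multiplied by the batch size.
        auto nireq = execNet.GetMetric(METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS)).as<unsigned int>();

        // Each request maps to one slot of the shared batched blob; the batched request only starts
        // once a full batch has been submitted, so keep nireq requests in flight at the same time.
        std::vector<InferenceEngine::InferRequest> requests;
        for (unsigned int i = 0; i < nireq; ++i) {
            requests.push_back(execNet.CreateInferRequest());
            requests.back().StartAsync();
        }
        for (auto& req : requests)
            req.Wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY);

        std::cout << "Completed " << nireq << " requests" << std::endl;
        return 0;
    }

Because AutoBatchInferRequest::InferImpl() blocks until the worker collects _batchSize requests, submitting
fewer concurrent requests than the configured batch would stall, which is why the sketch always launches the
full set reported by OPTIMAL_NUMBER_OF_INFER_REQUESTS.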