auto-batching POC squashed (all commits from auto-batch-2021.3 branch)
(cherry picked from commit d7742f2c747bc514a126cc9a4d5b99f0ff5cbbc7)
parent f8439eeed8, commit 9357d3fd3d
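For orientation, a minimal usage sketch of the new BATCH meta-device this diff introduces. This is an assumption-level illustration, not code from the commit: the device name "BATCH" and CONFIG_KEY(AUTO_BATCH) are registered below, the "GPU(4)" syntax follows ParseMetaDevice(), and "model.xml" is a placeholder.

    #include <ie_core.hpp>

    int main() {
        InferenceEngine::Core core;
        auto network = core.ReadNetwork("model.xml");  // placeholder model
        // "GPU(4)": gather 4 logical requests into one batched GPU request
        auto execNet = core.LoadNetwork(network, "BATCH",
                                        {{CONFIG_KEY(AUTO_BATCH), "GPU(4)"}});
        auto request = execNet.CreateInferRequest();
        request.StartAsync();
        request.Wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY);
        return 0;
    }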
@@ -666,14 +666,8 @@ int main(int argc, char* argv[]) {
         next_step(ss.str());
 
         // warming up - out of scope
-        auto inferRequest = inferRequestsQueue.getIdleRequest();
-        if (!inferRequest) {
-            IE_THROW() << "No idle Infer Requests!";
-        }
-        if (FLAGS_api == "sync") {
-            inferRequest->infer();
-        } else {
-            inferRequest->startAsync();
-        }
+        for (size_t i = 0; i < inferRequestsQueue.requests.size(); i++) {
+            inferRequestsQueue.getIdleRequest()->startAsync();
+        }
         inferRequestsQueue.waitAll();
         auto duration_ms = double_to_string(inferRequestsQueue.getLatencies()[0]);
@@ -694,7 +688,7 @@ int main(int argc, char* argv[]) {
         while ((niter != 0LL && iteration < niter) ||
                (duration_nanoseconds != 0LL && (uint64_t)execTime < duration_nanoseconds) ||
                (FLAGS_api == "async" && iteration % nireq != 0)) {
-            inferRequest = inferRequestsQueue.getIdleRequest();
+            auto inferRequest = inferRequestsQueue.getIdleRequest();
             if (!inferRequest) {
                 IE_THROW() << "No idle Infer Requests!";
             }
@@ -32,6 +32,8 @@ add_subdirectory(hetero_plugin)
 
 add_subdirectory(multi_device)
 
+add_subdirectory(auto_batch)
+
 add_subdirectory(transformations)
 
 add_subdirectory(inference_engine)
inference-engine/src/auto_batch/CMakeLists.txt (new file, 25 lines)
@@ -0,0 +1,25 @@
# Copyright (C) 2018-2020 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#

set (TARGET_NAME "AutoBatchPlugin")

if(ENABLE_LTO)
    ie_enable_lto()
endif()

file(GLOB SOURCES
    ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
)

file(GLOB HEADERS
    ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp
)

ie_add_plugin(NAME ${TARGET_NAME}
    DEVICE_NAME "BATCH"
    SOURCES ${SOURCES} ${HEADERS}
    VERSION_DEFINES_FOR auto_batch.cpp)

target_link_libraries(${TARGET_NAME} PRIVATE inference_engine inference_engine_legacy)
set_ie_threading_interface_for(${TARGET_NAME})
inference-engine/src/auto_batch/auto_batch.cpp (new file, 470 lines)
@@ -0,0 +1,470 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

///////////////////////////////////////////////////////////////////////////////////////////////////
#include <string>
#include <vector>
#include <iostream>
#include <memory>
#include <utility>
#include <map>
#include <unordered_map>
#include <unordered_set>

#include "ie_metric_helpers.hpp"
#include <cpp_interfaces/base/ie_infer_async_request_base.hpp>
#include <cpp_interfaces/interface/ie_internal_plugin_config.hpp>
#include <legacy/ie_util_internal.hpp>
#include <ie_plugin_config.hpp>
#include "auto_batch.hpp"

namespace AutoBatchPlugin {
using namespace InferenceEngine;

template <Precision::ePrecision precision>
Blob::Ptr create_shared_blob_on_top_of_batched_blob(Blob::Ptr batched_blob, size_t batch_id, size_t batch_num) {
    typedef typename PrecisionTrait<precision>::value_type TYPE;
    typedef typename std::add_pointer<TYPE>::type TYPEPTR;
    auto ptr = batched_blob->buffer().as<TYPEPTR>();
    auto sizePerBatch = batched_blob->size() / batch_num;
    auto layout = batched_blob->getTensorDesc().getLayout();
    SizeVector dims = batched_blob->getTensorDesc().getDims();

    if (layout == InferenceEngine::Layout::NC || layout == InferenceEngine::Layout::NCDHW
            || layout == InferenceEngine::Layout::NCHW || layout == InferenceEngine::Layout::NHWC
            || layout == InferenceEngine::Layout::NDHWC) {
        dims[0] = 1;
        assert(batched_blob->getTensorDesc().getPrecision() == precision);
        return make_shared_blob<TYPE>({precision, dims, batched_blob->getTensorDesc().getLayout()},
                                      ptr + sizePerBatch * batch_id, sizePerBatch);
    } else {
        // same blob for all requests (e.g. constants)
        return make_shared_blob<TYPE>({precision, dims, batched_blob->getTensorDesc().getLayout()},
                                      ptr);
    }
}
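A worked illustration of the view arithmetic above (a standalone sketch with made-up dims, not code from the commit): for an NCHW blob batched to 4, each logical request sees a zero-copy window of one batch's worth of elements.

    #include <cassert>
    #include <cstddef>
    #include <vector>

    int main() {
        const size_t batch_num = 4;                                // batched to 4
        const size_t total = batch_num * 3 * 224 * 224;            // batched_blob->size()
        const size_t sizePerBatch = total / batch_num;             // 150528 elements
        std::vector<float> buffer(total);                          // batched_blob->buffer()
        for (size_t batch_id = 0; batch_id < batch_num; ++batch_id) {
            // per-request view: dims {1, 3, 224, 224}; no copy, just an offset
            float* view = buffer.data() + sizePerBatch * batch_id;
            assert(view + sizePerBatch <= buffer.data() + total);
        }
        return 0;
    }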
// ------------------------------AutoBatchInferRequest----------------------------
AutoBatchInferRequest::AutoBatchInferRequest(const InputsDataMap& networkInputs,
                                             const OutputsDataMap& networkOutputs,
                                             AutoBatchExecutableNetwork::WorkerInferRequest* workerRequestPtr,
                                             int batch_id, int num_batch,
                                             bool needPerfCounters)
        : InferRequestInternal(networkInputs, networkOutputs), _workerInferRequest(workerRequestPtr),
          _needPerfCounters(needPerfCounters) {
    // Allocate all input blobs
    for (const auto &it : networkInputs) {
        auto blob = workerRequestPtr->_inferRequest.GetBlob(it.first);
        Blob::Ptr res;
        switch (it.second->getTensorDesc().getPrecision()) {
            case InferenceEngine::Precision::FP32:
                res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::FP32>
                        (workerRequestPtr->_inferRequest.GetBlob(it.first), batch_id, num_batch);
                break;
            case InferenceEngine::Precision::I32:
                res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I32>
                        (workerRequestPtr->_inferRequest.GetBlob(it.first), batch_id, num_batch);
                break;
            case InferenceEngine::Precision::I8:
                res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I8>
                        (workerRequestPtr->_inferRequest.GetBlob(it.first), batch_id, num_batch);
                break;
            case InferenceEngine::Precision::U16:
                res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::U16>
                        (workerRequestPtr->_inferRequest.GetBlob(it.first), batch_id, num_batch);
                break;
            case InferenceEngine::Precision::I16:
                res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I16>
                        (workerRequestPtr->_inferRequest.GetBlob(it.first), batch_id, num_batch);
                break;
            case InferenceEngine::Precision::U8:
            case InferenceEngine::Precision::BOOL:
                res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::U8>
                        (workerRequestPtr->_inferRequest.GetBlob(it.first), batch_id, num_batch);
                break;
            default:
                THROW_IE_EXCEPTION << "Unsupported input precision " << it.second->getTensorDesc().getPrecision();
        }
        _inputs[it.first] = res;
    }
    // Allocate all output blobs
    for (const auto &it : networkOutputs) {
        auto blob = workerRequestPtr->_inferRequest.GetBlob(it.first);
        Blob::Ptr res;
        switch (it.second->getTensorDesc().getPrecision()) {
            case InferenceEngine::Precision::FP32:
                res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::FP32>
                        (workerRequestPtr->_inferRequest.GetBlob(it.first), batch_id, num_batch);
                break;
            case InferenceEngine::Precision::I32:
                res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I32>
                        (workerRequestPtr->_inferRequest.GetBlob(it.first), batch_id, num_batch);
                break;
            case InferenceEngine::Precision::I8:
                res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I8>
                        (workerRequestPtr->_inferRequest.GetBlob(it.first), batch_id, num_batch);
                break;
            case InferenceEngine::Precision::U16:
                res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::U16>
                        (workerRequestPtr->_inferRequest.GetBlob(it.first), batch_id, num_batch);
                break;
            case InferenceEngine::Precision::I16:
                res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::I16>
                        (workerRequestPtr->_inferRequest.GetBlob(it.first), batch_id, num_batch);
                break;
            case InferenceEngine::Precision::U8:
            case InferenceEngine::Precision::BOOL:
                res = create_shared_blob_on_top_of_batched_blob<InferenceEngine::Precision::U8>
                        (workerRequestPtr->_inferRequest.GetBlob(it.first), batch_id, num_batch);
                break;
            default:
                THROW_IE_EXCEPTION << "Unsupported output precision " << it.second->getTensorDesc().getPrecision();
        }
        _outputs[it.first] = res;
    }
}

void AutoBatchInferRequest::SetBlobsToAnotherRequest(InferRequest& req) {
    // todo: call Set for REMOTE BLOB
}

std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> AutoBatchInferRequest::GetPerformanceCounts() const {
    return _perfMap;
}

void AutoBatchInferRequest::InferImpl() {
    auto _event = _workerInferRequest->_event;
    auto numReady = ++_workerInferRequest->_numRequestsReady;
    if (numReady == _workerInferRequest->_batchSize) {
        _workerInferRequest->_numRequestsReady = 0;
        _workerInferRequest->_inferRequest.StartAsync();
    }
    _event.get();
    if (_needPerfCounters) {
        _perfMap = _workerInferRequest->_inferRequest.GetPerformanceCounts();
    }
}
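A standalone sketch (an assumption that mirrors, but is not, the commit's code) of the rendezvous InferImpl() implements: the last of _batchSize callers fires the batched request, and every caller blocks on a shared future that the completion callback releases.

    #include <atomic>
    #include <cstdio>
    #include <future>
    #include <thread>
    #include <vector>

    int main() {
        const int batchSize = 4;
        std::atomic_int numReady{0};
        std::promise<void> cond;
        std::shared_future<void> event = cond.get_future().share();

        auto inferImpl = [&](int id) {
            auto evt = event;                 // grab the future before possibly firing
            if (++numReady == batchSize) {
                numReady = 0;
                // stands in for _inferRequest.StartAsync(); in the plugin, the
                // device's completion callback is what calls set_value()
                std::printf("request %d fires the batched StartAsync()\n", id);
                cond.set_value();
            }
            evt.get();                        // wait until the whole batch completes
        };

        std::vector<std::thread> callers;
        for (int i = 0; i < batchSize; ++i) callers.emplace_back(inferImpl, i);
        for (auto& t : callers) t.join();
        return 0;
    }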
AutoBatchAsyncInferRequest::AutoBatchAsyncInferRequest(
    const AutoBatchInferRequest::Ptr& inferRequest,
    const bool needPerfCounters,
    const AutoBatchExecutableNetwork::Ptr& autoBatchExecutableNetwork,
    const ITaskExecutor::Ptr& callbackExecutor) :
    AsyncInferRequestThreadSafeDefault(inferRequest,
                                       std::make_shared<CPUStreamsExecutor>(
                                           IStreamsExecutor::Config{"AutoBatch", 1, 1,
                                                                    IStreamsExecutor::ThreadBindingType::NONE, 1, 0, 1}),
                                       callbackExecutor),
    _AutoBatchExecutableNetwork{autoBatchExecutableNetwork},
    _inferRequest{inferRequest} {
}

void AutoBatchAsyncInferRequest::Infer_ThreadUnsafe() {
    InferUsingAsync();
}

AutoBatchAsyncInferRequest::~AutoBatchAsyncInferRequest() {
    StopAndWait();
}

// ------------------------------AutoBatchExecutableNetwork----------------------------
AutoBatchExecutableNetwork::AutoBatchExecutableNetwork(const InferenceEngine::ExecutableNetwork& networkForDevice,
                                                       const DeviceInformation& networkDevice,
                                                       const std::unordered_map<std::string, InferenceEngine::Parameter>& config,
                                                       const bool needPerfCounters) :
    InferenceEngine::ExecutableNetworkThreadSafeDefault(
        nullptr,
        std::make_shared<InferenceEngine::ImmediateExecutor>()),
    _device{networkDevice},
    _network{networkForDevice},
    _config{config},
    _needPerfCounters{needPerfCounters} {
}

AutoBatchExecutableNetwork::~AutoBatchExecutableNetwork() {
    // {
    //     std::lock_guard<std::mutex> lock(_mutex);
    //     _device = {};
    // }
    _terminate = true;
    /* NOTE: The only threads that use the `AutoBatchExecutableNetwork` context are those used by the Worker infer
     * requests, but the AsyncInferRequest destructor should wait for all asynchronous tasks used by the request.
     */
    _workerRequests.clear();
}

InferenceEngine::InferRequestInternal::Ptr AutoBatchExecutableNetwork::CreateInferRequestImpl(InferenceEngine::InputsDataMap networkInputs,
                                                                                              InferenceEngine::OutputsDataMap networkOutputs) {
    // todo: guard request creation from another thread/on-the-fly
    auto num = _numRequestsCreated++;
    auto batch_id = num % _device.batchForDevice;
    if (!batch_id) {  // need new request
        _workerRequests.push_back(std::make_shared<WorkerInferRequest>());
        auto workerRequestPtr = _workerRequests.back();
        workerRequestPtr->_inferRequest = _network.CreateInferRequest();
        workerRequestPtr->_batchSize = _device.batchForDevice;
        workerRequestPtr->_cond = std::promise<void>();
        workerRequestPtr->_event = workerRequestPtr->_cond.get_future().share();
        // _idleWorkerRequests.push(workerRequestPtr);
        workerRequestPtr->_inferRequest.SetCompletionCallback<std::function<void(InferRequest, StatusCode)>>(
            [workerRequestPtr, this](InferRequest, StatusCode status) mutable {
                workerRequestPtr->_status = status;
                auto signal = std::move(workerRequestPtr->_cond);
                // reset the promise/future for the next use
                workerRequestPtr->_cond = std::promise<void>();
                workerRequestPtr->_event = workerRequestPtr->_cond.get_future().share();
                signal.set_value();
            });
    }
    return std::make_shared<AutoBatchInferRequest>(networkInputs, networkOutputs, _workerRequests.back().get(),
                                                   batch_id, _device.batchForDevice, _needPerfCounters);
}
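The round-robin implied above, with illustrative numbers: for batchForDevice == 4, logical requests 0..3 map to worker 0 (batch slots 0..3), requests 4..7 to worker 1, and so on; a fresh worker, with its own batched device request and promise/future pair, is created each time batch_id wraps to 0.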
InferenceEngine::IInferRequest::Ptr AutoBatchExecutableNetwork::CreateInferRequest() {
    auto syncRequestImpl = CreateInferRequestImpl(_networkInputs, _networkOutputs);
    syncRequestImpl->setPointerToExecutableNetworkInternal(shared_from_this());
    auto asyncThreadSafeImpl = std::make_shared<AutoBatchAsyncInferRequest>(std::static_pointer_cast<AutoBatchInferRequest>(syncRequestImpl),
                                                                            _needPerfCounters,
                                                                            std::static_pointer_cast<AutoBatchExecutableNetwork>(shared_from_this()),
                                                                            _callbackExecutor);
    IInferRequest::Ptr asyncRequest;
    asyncRequest.reset(new InferRequestBase(asyncThreadSafeImpl), [](IInferRequest* p) { p->Release(); });
    asyncThreadSafeImpl->SetPointerToPublicInterface(asyncRequest);
    return asyncRequest;
}

void AutoBatchExecutableNetwork::SetConfig(const std::map<std::string, InferenceEngine::Parameter> &config) {
    // TODO
    THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str;
}

InferenceEngine::Parameter AutoBatchExecutableNetwork::GetConfig(const std::string &name) const {
    auto res = _config.find(name);
    if (res != _config.end()) {
        return res->second;
    } else {
        THROW_IE_EXCEPTION << NOT_FOUND_str << name << " not found in the ExecutableNetwork config";
    }
}

InferenceEngine::Parameter AutoBatchExecutableNetwork::GetMetric(const std::string &name) const {
    if (name == METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS)) {
        unsigned int res = 0u;
        try {
            res = _network.GetMetric(METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS)).as<unsigned int>();
        } catch (const details::InferenceEngineException &iie) {
            THROW_IE_EXCEPTION
                    << "Every device used with Auto-Batching should "
                    << "support the OPTIMAL_NUMBER_OF_INFER_REQUESTS ExecutableNetwork metric. "
                    << "Failed to query the metric for "
                    << _network.GetMetric(METRIC_KEY(FULL_DEVICE_NAME)).as<std::string>()
                    << " with error: " << iie.what();
        }
        IE_SET_METRIC_RETURN(OPTIMAL_NUMBER_OF_INFER_REQUESTS, res * _device.batchForDevice);
    } else if (name == METRIC_KEY(NETWORK_NAME)) {
        IE_SET_METRIC_RETURN(NETWORK_NAME, _network.GetMetric(
                METRIC_KEY(NETWORK_NAME)).as<std::string>());
    } else if (name == METRIC_KEY(SUPPORTED_METRICS)) {
        IE_SET_METRIC_RETURN(SUPPORTED_METRICS, {
            METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS),
            METRIC_KEY(SUPPORTED_METRICS),
            METRIC_KEY(NETWORK_NAME),
            METRIC_KEY(SUPPORTED_CONFIG_KEYS)
        });
    } else if (name == METRIC_KEY(SUPPORTED_CONFIG_KEYS)) {
        std::vector<std::string> configKeys = { CONFIG_KEY(AUTO_BATCH) };
        IE_SET_METRIC_RETURN(SUPPORTED_CONFIG_KEYS, configKeys);
    } else {
        THROW_IE_EXCEPTION << "Unsupported Network metric: " << name;
    }
}

// ------------------------------AutoBatchInferencePlugin----------------------------

namespace {

std::map<std::string, std::string> mergeConfigs(std::map<std::string, std::string> config,
                                                const std::map<std::string, std::string> & local) {
    for (auto && kvp : local) {
        config[kvp.first] = kvp.second;
    }
    return config;
}

}  // namespace

std::map<std::string, std::string> AutoBatchInferencePlugin::GetSupportedConfig(
        const std::map<std::string, std::string> & config, const std::string & deviceName) const {
    std::vector<std::string> supportedConfigKeys = GetCore()->GetMetric(deviceName, METRIC_KEY(SUPPORTED_CONFIG_KEYS));
    std::map<std::string, std::string> supportedConfig;
    for (auto&& key : supportedConfigKeys) {
        auto itKey = config.find(key);
        if (config.end() != itKey) {
            supportedConfig[key] = itKey->second;
        }
    }
    return supportedConfig;
}

DeviceInformation AutoBatchInferencePlugin::ParseMetaDevice(const std::string& devicesBatchCfg,
                                                            const std::map<std::string, std::string> & config) const {
    DeviceInformation metaDevice;
    auto getDeviceConfig = [&] (const DeviceName & deviceWithID) {
        DeviceIDParser deviceParser(deviceWithID);
        std::string deviceName = deviceParser.getDeviceName();
        std::map<std::string, std::string> tconfig = mergeConfigs(_config, config);

        // set the device ID if any
        std::string deviceIDLocal = deviceParser.getDeviceID();
        if (!deviceIDLocal.empty()) {
            tconfig[PluginConfigParams::KEY_DEVICE_ID] = deviceIDLocal;
        }

        return GetSupportedConfig(tconfig, deviceName);
    };

    auto && d = devicesBatchCfg;
    {
        auto openingBracket = d.find_first_of('(');
        auto closingBracket = d.find_first_of(')', openingBracket);
        auto deviceName = d.substr(0, openingBracket);

        int batch = -1;
        if (closingBracket != std::string::npos && openingBracket < closingBracket) {
            batch = std::stol(d.substr(openingBracket + 1, closingBracket - 1));

            if (batch <= 0) {
                THROW_IE_EXCEPTION << "Batch value for '" << deviceName << "' must be > 0, while " << batch
                                   << " is passed";
            }
        }

        // create the meta device
        auto cfg = getDeviceConfig(deviceName);
        std::vector<std::string> supportedConfigKeys = GetCore()->GetMetric(deviceName, METRIC_KEY(SUPPORTED_CONFIG_KEYS));
        if (std::find(std::begin(supportedConfigKeys), std::end(supportedConfigKeys), CONFIG_KEY_INTERNAL(AGGREGATED_PLUGIN))
                != std::end(supportedConfigKeys)) {
            cfg.emplace(CONFIG_KEY_INTERNAL(AGGREGATED_PLUGIN), "");
        }
        metaDevice = { deviceName, cfg, batch };
    }

    return metaDevice;
}
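Worked examples of the parsing above: "GPU(4)" yields {deviceName: "GPU", batchForDevice: 4}; a string without brackets, such as "CPU", leaves batchForDevice at -1; a non-positive batch such as "GPU(0)" throws.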
Parameter AutoBatchInferencePlugin::GetConfig(const std::string& name,
                                              const std::map<std::string, Parameter> & options) const {
    if (name == CONFIG_KEY(AUTO_BATCH)) {
        auto it = _config.find(CONFIG_KEY(AUTO_BATCH));
        if (it == _config.end()) {
            THROW_IE_EXCEPTION << "Value for KEY_AUTO_BATCH is not set";
        } else {
            return { it->second };
        }
    } else {
        THROW_IE_EXCEPTION << "Unsupported config key: " << name;
    }
}

void AutoBatchInferencePlugin::SetConfig(const std::map<std::string, std::string> & config) {
    for (auto && kvp : config) {
        _config[kvp.first] = kvp.second;
    }
}

static const Version version = {{2, 1}, CI_BUILD_NUMBER, "AutoBatchPlugin"};
IE_DEFINE_PLUGIN_CREATE_FUNCTION(AutoBatchInferencePlugin, version)

AutoBatchInferencePlugin::AutoBatchInferencePlugin() {
    _pluginName = "BATCH";
}

InferenceEngine::Parameter AutoBatchInferencePlugin::GetMetric(const std::string& name,
                                                               const std::map<std::string, InferenceEngine::Parameter> & options) const {
    if (name == METRIC_KEY(SUPPORTED_METRICS)) {
        std::vector<std::string> metrics;
        metrics.push_back(METRIC_KEY(SUPPORTED_METRICS));
        metrics.push_back(METRIC_KEY(FULL_DEVICE_NAME));
        metrics.push_back(METRIC_KEY(SUPPORTED_CONFIG_KEYS));
        IE_SET_METRIC_RETURN(SUPPORTED_METRICS, metrics);
    } else if (name == METRIC_KEY(FULL_DEVICE_NAME)) {
        std::string device_name = { "BATCH" };
        IE_SET_METRIC_RETURN(FULL_DEVICE_NAME, device_name);
    } else if (name == METRIC_KEY(SUPPORTED_CONFIG_KEYS)) {
        std::vector<std::string> configKeys = {
            CONFIG_KEY_INTERNAL(AGGREGATED_PLUGIN)};
        IE_SET_METRIC_RETURN(SUPPORTED_CONFIG_KEYS, configKeys);
    } else {
        THROW_IE_EXCEPTION << "Unsupported metric key " << name;
    }
}

ExecutableNetworkInternal::Ptr AutoBatchInferencePlugin::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork& network,
                                                                            const std::map<std::string, std::string>& config) {
    if (GetCore() == nullptr) {
        THROW_IE_EXCEPTION << "Please, work with the BATCH device via the InferenceEngine::Core object";
    }

    auto fullConfig = mergeConfigs(_config, config);
    auto device_batch = fullConfig.find(CONFIG_KEY(AUTO_BATCH));
    if (device_batch == fullConfig.end()) {
        THROW_IE_EXCEPTION << "KEY_AUTO_BATCH key is not set for the BATCH device";
    }

    auto metaDevice = ParseMetaDevice(device_batch->second, fullConfig);

    // collect the settings that are applicable to the devices we are loading the network to
    std::unordered_map<std::string, InferenceEngine::Parameter> networkConfig;
    networkConfig.insert(*device_batch);

    ExecutableNetwork executableNetworkForDevice;
    auto & deviceName = metaDevice.deviceName;
    auto & deviceConfig = metaDevice.config;
    // network.serialize("out_orig.xml", "out_orig.bin");

    CNNNetwork clonedNetwork(InferenceEngine::cloneNetwork(network));
    const InputsDataMap inputInfo = clonedNetwork.getInputsInfo();
    ICNNNetwork::InputShapes shapes = clonedNetwork.getInputShapes();

    for (const InputsDataMap::value_type &item : inputInfo) {
        auto layout = item.second->getTensorDesc().getLayout();
        if (layout == InferenceEngine::Layout::NC || layout == InferenceEngine::Layout::NCDHW
                || layout == InferenceEngine::Layout::NCHW || layout == InferenceEngine::Layout::NHWC
                || layout == InferenceEngine::Layout::NDHWC) {
            shapes[item.first][0] = metaDevice.batchForDevice;
            std::cout << " reshaping the input " << item.first << " (layout " << layout << ")" << " by the batch" << std::endl;
        }
    }
    std::cout << "Reshaped network by batch to " << metaDevice.batchForDevice << std::endl;
    clonedNetwork.reshape(shapes);
    // clonedNetwork.serialize("out_batch4.xml", "out_batch4.bin");

    std::map<std::string, std::string> deviceConfig0 = deviceConfig;
    // deviceConfig0["DO_NOT_AUTO_BATCH"] = "TRUE";
    executableNetworkForDevice = GetCore()->LoadNetwork(CNNNetwork{clonedNetwork}, deviceName, deviceConfig0);
    networkConfig.insert(deviceConfig.begin(), deviceConfig.end());
    if ((std::shared_ptr<InferenceEngine::IExecutableNetwork>)executableNetworkForDevice == nullptr)
        THROW_IE_EXCEPTION << NOT_FOUND_str << "Failed to load the executable network to the device "
                           << "that the BATCH device is initialized to work with";

    auto perfConfig = fullConfig.find(PluginConfigParams::KEY_PERF_COUNT);
    bool enablePerfCounters = (fullConfig.end() != perfConfig) && (perfConfig->second == PluginConfigParams::YES);

    return std::make_shared<AutoBatchExecutableNetwork>(executableNetworkForDevice,
                                                        metaDevice,
                                                        networkConfig,
                                                        enablePerfCounters);
}

InferenceEngine::QueryNetworkResult AutoBatchInferencePlugin::QueryNetwork(const InferenceEngine::CNNNetwork& network,
                                                                           const std::map<std::string, std::string>& config) const {
    // THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str;
    const std::map<std::string, std::string> cfg;
    return GetCore()->QueryNetwork(network, "CPU", cfg);
}
}  // namespace AutoBatchPlugin
inference-engine/src/auto_batch/auto_batch.hpp (new file, 176 lines)
@@ -0,0 +1,176 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

///////////////////////////////////////////////////////////////////////////////////////////////////
#pragma once

#include <atomic>
#include <future>  // std::promise / std::shared_future used below
#include <mutex>
#include <queue>
#include <unordered_map>
#include <map>
#include <vector>
#include <utility>
#include <memory>
#include <string>

#include <cpp_interfaces/impl/ie_plugin_internal.hpp>
#include <cpp_interfaces/impl/ie_executable_network_thread_safe_default.hpp>
#include <cpp_interfaces/impl/ie_infer_async_request_thread_safe_default.hpp>
#include "ie_iinfer_request.hpp"
#include "details/ie_exception_conversion.hpp"
#include <ie_parallel.hpp>

#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
# include <tbb/concurrent_queue.h>
#endif

namespace AutoBatchPlugin {

using DeviceName = std::string;

struct DeviceInformation {
    DeviceName deviceName;
    std::map<std::string, std::string> config;
    int batchForDevice;
};

#if ((IE_THREAD == IE_THREAD_TBB) || (IE_THREAD == IE_THREAD_TBB_AUTO))
template <typename T>
using ThreadSafeQueue = tbb::concurrent_queue<T>;
#else
template <typename T>
class ThreadSafeQueue {
public:
    void push(T value) {
        std::lock_guard<std::mutex> lock(_mutex);
        _queue.push(std::move(value));
    }

    bool try_pop(T& value) {
        std::lock_guard<std::mutex> lock(_mutex);
        if (!_queue.empty()) {
            value = std::move(_queue.front());
            _queue.pop();
            return true;
        } else {
            return false;
        }
    }

    bool empty() {
        std::lock_guard<std::mutex> lock(_mutex);
        return _queue.empty();
    }

protected:
    std::queue<T> _queue;
    std::mutex _mutex;
};
#endif

class AutoBatchAsyncInferRequest;
class AutoBatchExecutableNetwork : public InferenceEngine::ExecutableNetworkThreadSafeDefault {
public:
    using Ptr = std::shared_ptr<AutoBatchExecutableNetwork>;
    struct WorkerInferRequest {
        using Ptr = std::shared_ptr<WorkerInferRequest>;
        InferenceEngine::InferRequest _inferRequest;
        InferenceEngine::StatusCode _status = InferenceEngine::StatusCode::OK;
        int _batchSize;
        std::promise<void> _cond;
        std::shared_future<void> _event;
        std::atomic_int _numRequestsReady = {0};
        void ReportArrival() {
            _numRequestsReady++;
            if (_numRequestsReady == _batchSize) {
                _numRequestsReady = 0;
                _inferRequest.StartAsync();
            }
            // workerRequestPtr->_cond.
        }
    };
    using NotBusyWorkerRequests = ThreadSafeQueue<WorkerInferRequest*>;

    explicit AutoBatchExecutableNetwork(const InferenceEngine::ExecutableNetwork& networkForDevice,
                                        const DeviceInformation& networkDevices,
                                        const std::unordered_map<std::string, InferenceEngine::Parameter>& config,
                                        const bool needPerfCounters = false);

    void SetConfig(const std::map<std::string, InferenceEngine::Parameter> &config) override;
    InferenceEngine::Parameter GetConfig(const std::string &name) const override;
    InferenceEngine::Parameter GetMetric(const std::string &name) const override;
    InferenceEngine::IInferRequest::Ptr CreateInferRequest() override;
    InferenceEngine::InferRequestInternal::Ptr CreateInferRequestImpl(InferenceEngine::InputsDataMap networkInputs,
                                                                      InferenceEngine::OutputsDataMap networkOutputs) override;
    ~AutoBatchExecutableNetwork() override;

    std::atomic_bool _terminate = {false};
    DeviceInformation _device;
    InferenceEngine::ExecutableNetwork _network;
    std::vector<WorkerInferRequest::Ptr> _workerRequests;
    std::unordered_map<std::string, InferenceEngine::Parameter> _config;
    bool _needPerfCounters = false;
    std::atomic_size_t _numRequestsCreated = {0};
};

class AutoBatchInferRequest : public InferenceEngine::InferRequestInternal {
public:
    using Ptr = std::shared_ptr<AutoBatchInferRequest>;
    explicit AutoBatchInferRequest(const InferenceEngine::InputsDataMap& networkInputs,
                                   const InferenceEngine::OutputsDataMap& networkOutputs,
                                   AutoBatchExecutableNetwork::WorkerInferRequest* workerRequestPtr,
                                   int batch_id, int num_batch, bool needPerfCounters = false);
    std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> GetPerformanceCounts() const override;
    void InferImpl() override;

    // Batch-Device impl specific: sets the data (blobs from the device request to the batched device request)
    void SetBlobsToAnotherRequest(InferenceEngine::InferRequest& req);
    AutoBatchExecutableNetwork::WorkerInferRequest* _workerInferRequest;
protected:
    std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> _perfMap;
    bool _needPerfCounters = false;
};

class AutoBatchAsyncInferRequest : public InferenceEngine::AsyncInferRequestThreadSafeDefault {
public:
    using Ptr = std::shared_ptr<AutoBatchAsyncInferRequest>;

    explicit AutoBatchAsyncInferRequest(const AutoBatchInferRequest::Ptr& inferRequest,
                                        const bool needPerfCounters,
                                        const AutoBatchExecutableNetwork::Ptr& autoBatchExecutableNetwork,
                                        const InferenceEngine::ITaskExecutor::Ptr& callbackExecutor);
    void Infer_ThreadUnsafe() override;
    ~AutoBatchAsyncInferRequest() override;

protected:
    AutoBatchExecutableNetwork::Ptr _AutoBatchExecutableNetwork;
    AutoBatchInferRequest::Ptr _inferRequest;
};

class AutoBatchInferencePlugin : public InferenceEngine::InferencePluginInternal {
public:
    AutoBatchInferencePlugin();
    ~AutoBatchInferencePlugin() override = default;

    InferenceEngine::ExecutableNetworkInternal::Ptr LoadExeNetworkImpl(const InferenceEngine::CNNNetwork& network,
                                                                       const std::map<std::string, std::string>& config) override;

    void SetConfig(const std::map<std::string, std::string>& config) override;
    InferenceEngine::Parameter GetConfig(const std::string& name,
                                         const std::map<std::string, InferenceEngine::Parameter> & options) const override;
    InferenceEngine::QueryNetworkResult QueryNetwork(const InferenceEngine::CNNNetwork& network,
                                                     const std::map<std::string, std::string>& config) const override;
    InferenceEngine::Parameter GetMetric(const std::string& name,
                                         const std::map<std::string, InferenceEngine::Parameter>& options) const override;

    DeviceInformation ParseMetaDevice(const std::string & devicesBatchCfg,
                                      const std::map<std::string, std::string> & config) const;

protected:
    std::map<std::string, std::string> GetSupportedConfig(const std::map<std::string, std::string>& config,
                                                          const DeviceName & deviceName) const;
};

}  // namespace AutoBatchPlugin
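Note that NotBusyWorkerRequests (and the ThreadSafeQueue fallback above) is declared but not yet wired up in this POC; the corresponding _idleWorkerRequests.push() call in auto_batch.cpp is commented out.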
@@ -677,6 +677,12 @@ Parameter clDNNEngine::GetMetric(const std::string& name, const std::map<std::st
                << static_cast<int>(device_info.gfx_ver.revision);
         }
         IE_SET_METRIC_RETURN(GPU_UARCH_VERSION, s.str());
+    } else if (name == METRIC_KEY(OPTIMAL_BATCH)) {
+        auto network = options.find("MODEL_ADDRESS")->second.as<InferenceEngine::CNNNetwork const*>();
+        // auto transformedNetwork = CloneAndTransformNetwork(*network, _impl->m_config);
+        unsigned int batch = 8;
+        std::cout << "SELECTED BATCH: " << batch << std::endl;
+        IE_SET_METRIC_RETURN(OPTIMAL_BATCH, batch);
     } else if (name == METRIC_KEY(FULL_DEVICE_NAME)) {
         auto deviceName = StringRightTrim(device_info.dev_name, "NEO", false);
         deviceName += std::string(" (") + (device_info.dev_type == cldnn::device_type::discrete_gpu ? "dGPU" : "iGPU") + ")";
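A sketch of how a caller would pass the model to the new OPTIMAL_BATCH metric (an assumption based on the hunk above, which reads an InferenceEngine::CNNNetwork const* out of the "MODEL_ADDRESS" option; in this POC the plugin ignores the model and returns the hard-coded value 8):

    // `network` is an InferenceEngine::CNNNetwork that must outlive the call
    const InferenceEngine::CNNNetwork* modelPtr = &network;
    std::map<std::string, InferenceEngine::Parameter> options;
    options["MODEL_ADDRESS"] = modelPtr;  // read back via as<CNNNetwork const*>()
    // pass `options` to the GPU plugin's GetMetric(METRIC_KEY(OPTIMAL_BATCH), options)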
@@ -118,6 +118,7 @@ DECLARE_METRIC_VALUE(BATCHED_BLOB);
 * String value for metric name is "RANGE_FOR_STREAMS".
 */
 DECLARE_METRIC_KEY(RANGE_FOR_STREAMS, std::tuple<unsigned int, unsigned int>);
+DECLARE_METRIC_KEY(OPTIMAL_BATCH, unsigned int);
 
 /**
 * @brief Metric to provide a hint for a range for number of async infer requests. If device supports streams,
@@ -250,6 +251,11 @@ DECLARE_CONFIG_KEY(PERFORMANCE_HINT_NUM_REQUESTS);
 DECLARE_CONFIG_VALUE(YES);
 DECLARE_CONFIG_VALUE(NO);
 
+/**
+ * @brief The auto-batching configuration: the target device and the batch size, e.g. "GPU(4)".
+ */
+DECLARE_CONFIG_KEY(AUTO_BATCH);
+
 /**
  * @brief Limit `#threads` that are used by Inference Engine for inference on the CPU.
  */
@@ -312,7 +318,6 @@ DECLARE_CONFIG_KEY(PERF_COUNT);
 * >0 - Direct value of limit. Batch size to process is min(new batch_limit, original_batch)
 */
 DECLARE_CONFIG_KEY(DYN_BATCH_LIMIT);
-
 /**
 * @brief The key checks whether dynamic batch is enabled.
 */
@@ -169,6 +169,7 @@ public:
 
     static std::vector<std::string> getHeteroDevices(std::string fallbackDevice);
     static std::vector<std::string> getMultiDevices(std::string devicesList);
+    static std::string getBatchDevice(std::string devicesList);
 };
 
 }  // namespace InferenceEngine