Implemented inference in template plugin (#1308)

* Implemented inference in template plugin * Fixed tests * Removed thirdparty dependency * Simplified executor configuration * removed half * Fixed cmake * Fixed ngraph node check * device blob allocation * Fixed enum error
2020-07-28 17:25:31 +03:00
parent 2a96917e2a
commit 18836f53cd
18 changed files with 668 additions and 258 deletions
--- a/docs/template_plugin/include/template/template_config.hpp
+++ b/docs/template_plugin/include/template/template_config.hpp
@@ -45,14 +45,11 @@ namespace TemplateConfigParams {
 #define DECLARE_TEMPLATE_CONFIG_KEY(name) DECLARE_CONFIG_KEY(TEMPLATE_##name)
 #define DECLARE_TEMPLATE_CONFIG_VALUE(name) DECLARE_CONFIG_VALUE(TEMPLATE_##name)

+
 /**
- * @brief The key to define the type of transformations for TEMPLATE inputs and outputs.
- * TEMPLATE use custom data layout for input and output blobs. IE TEMPLATE Plugin provides custom
- * optimized version of transformation functions that do not use OpenMP and much more faster
- * than native TEMPLATE functions. Values: "NO" - optimized plugin transformations
- * are used, "YES" - native TEMPLATE transformations are used.
+ * @brief Defines the number of throughput streams used by TEMPLATE plugin.
 */
-DECLARE_TEMPLATE_CONFIG_KEY(ANY_CONFIG_KEY);
+DECLARE_TEMPLATE_CONFIG_KEY(THROUGHPUT_STREAMS);


 }  // namespace TemplateConfigParams
--- a/docs/template_plugin/src/CMakeLists.txt
+++ b/docs/template_plugin/src/CMakeLists.txt
@@ -20,14 +20,25 @@ ie_add_plugin(NAME ${TARGET_NAME}
              VERSION_DEFINES_FOR template_plugin.cpp)

 target_include_directories(${TARGET_NAME} PRIVATE
-    "${CMAKE_CURRENT_SOURCE_DIR}"
+    "${CMAKE_CURRENT_SOURCE_DIR}")
+
+target_include_directories(${TARGET_NAME} PRIVATE
    "${IE_MAIN_TEMPLATE_PLUGIN_SOURCE_DIR}/include")

-target_link_libraries(${TARGET_NAME} PRIVATE IE::inference_engine IE::inference_engine_transformations ${NGRAPH_LIBRARIES} ${INTEL_ITT_LIBS})
+target_link_libraries(${TARGET_NAME} PRIVATE
+    IE::inference_engine
+    IE::inference_engine_transformations
+    ${INTEL_ITT_LIBS}
+    ${NGRAPH_LIBRARIES})
+
+# Link inference backend library to plugin. Here we use ngraph interpreter_backend as example
+target_link_libraries(${TARGET_NAME} PRIVATE
+    ngraph_backend
+    interpreter_backend)

 # ATTENTION: uncomment to register a plugin in the plugins.xml file
 # ie_register_plugins(MAIN_TARGET ${TARGET_NAME}
-                    # POSSIBLE_PLUGINS ${TARGET_NAME})
+#                     POSSIBLE_PLUGINS ${TARGET_NAME})
 # [cmake:plugin]

 # ATTENTION: uncomment to install component
--- a/docs/template_plugin/src/template_async_infer_request.cpp
+++ b/docs/template_plugin/src/template_async_infer_request.cpp
@@ -19,21 +19,28 @@ TemplateAsyncInferRequest::TemplateAsyncInferRequest(
    const InferenceEngine::ITaskExecutor::Ptr& callbackExecutor) :
    AsyncInferRequestThreadSafeDefault(inferRequest, cpuTaskExecutor, callbackExecutor),
    _inferRequest(inferRequest), _waitExecutor(waitExecutor) {
-    _pipeline = {
-        {cpuTaskExecutor, [this] {
-            IE_PROFILING_AUTO_SCOPE(PreprocessingAndStartPipeline)
-            _inferRequest->inferPreprocess();
-            _inferRequest->startPipeline();
-        }},
-        {_waitExecutor, [this] {
-            IE_PROFILING_AUTO_SCOPE(WaitPipeline)
-            _inferRequest->waitPipeline();
-        }},
-        {cpuTaskExecutor, [this] {
-            IE_PROFILING_AUTO_SCOPE(Postprocessing)
-            _inferRequest->inferPostprocess();
-        }}
-    };
+    constexpr const auto remoteDevice = false;
+    // By default single stage pipeline is created.
+    // This stage executes InferRequest::Infer() using cpuTaskExecutor.
+    // But if a remote asynchronous device is used, the pipeline can be split into tasks that are executed by cpuTaskExecutor
+    // and waiting tasks. Waiting tasks can block the execution thread, so they use separate threads from another executor.
+    if (remoteDevice) {
+        _pipeline = {
+            {cpuTaskExecutor, [this] {
+                IE_PROFILING_AUTO_SCOPE(PreprocessingAndStartPipeline)
+                _inferRequest->inferPreprocess();
+                _inferRequest->startPipeline();
+            }},
+            {_waitExecutor, [this] {
+                IE_PROFILING_AUTO_SCOPE(WaitPipeline)
+                _inferRequest->waitPipeline();
+            }},
+            {cpuTaskExecutor, [this] {
+                IE_PROFILING_AUTO_SCOPE(Postprocessing)
+                _inferRequest->inferPostprocess();
+            }}
+        };
+    }
 }
 // ! [async_infer_request:ctor]

--- a/docs/template_plugin/src/template_config.cpp
+++ b/docs/template_plugin/src/template_config.cpp
@@ -9,10 +9,12 @@

 #include <ie_util_internal.hpp>
 #include <ie_plugin_config.hpp>
+#include <cpp_interfaces/interface/ie_internal_plugin_config.hpp>
 #include <file_utils.h>
 #include <cpp_interfaces/exception2status.hpp>

 #include "template_config.hpp"
+#include "template/template_config.hpp"

 using namespace TemplatePlugin;

@@ -20,12 +22,22 @@ Configuration::Configuration() { }

 Configuration::Configuration(const ConfigMap& config, const Configuration & defaultCfg, bool throwOnUnsupported) {
    *this = defaultCfg;
+    // If plugin needs to use InferenceEngine::StreamsExecutor it should be able to process its configuration
+    auto streamExecutorConfigKeys = _streamsExecutorConfig.SupportedKeys();
    for (auto&& c : config) {
        const auto& key = c.first;
        const auto& value = c.second;

-        if (CONFIG_KEY(DEVICE_ID) == key) {
+        if (TEMPLATE_CONFIG_KEY(THROUGHPUT_STREAMS) == key) {
+            _streamsExecutorConfig.SetConfig(CONFIG_KEY(CPU_THROUGHPUT_STREAMS), value);
+        } else if (streamExecutorConfigKeys.end() !=
+            std::find(std::begin(streamExecutorConfigKeys), std::end(streamExecutorConfigKeys), key)) {
+            _streamsExecutorConfig.SetConfig(key, value);
+        } else if (CONFIG_KEY(DEVICE_ID) == key) {
            deviceId = std::stoi(value);
+            if (deviceId > 0) {
+                THROW_IE_EXCEPTION << "Device ID " << deviceId << " is not supported";
+            }
        } else if (CONFIG_KEY(PERF_COUNT) == key) {
            perfCount = (CONFIG_VALUE(YES) == value);
        } else if (throwOnUnsupported) {
@@ -39,6 +51,14 @@ InferenceEngine::Parameter Configuration::Get(const std::string& name) const {
        return {std::to_string(deviceId)};
    } else if (name == CONFIG_KEY(PERF_COUNT)) {
        return {perfCount};
+    } else if (name == TEMPLATE_CONFIG_KEY(THROUGHPUT_STREAMS) || name == CONFIG_KEY(CPU_THROUGHPUT_STREAMS)) {
+        return {std::to_string(_streamsExecutorConfig._streams)};
+    } else if (name == CONFIG_KEY(CPU_BIND_THREAD)) {
+        return const_cast<InferenceEngine::IStreamsExecutor::Config&>(_streamsExecutorConfig).GetConfig(name);
+    } else if (name == CONFIG_KEY(CPU_THREADS_NUM)) {
+        return {std::to_string(_streamsExecutorConfig._threads)};
+    } else if (name == CONFIG_KEY_INTERNAL(CPU_THREADS_PER_STREAM)) {
+        return {std::to_string(_streamsExecutorConfig._threadsPerStream)};
    } else {
        THROW_IE_EXCEPTION << NOT_FOUND_str << ": " << name;
    }
--- a/docs/template_plugin/src/template_config.hpp
+++ b/docs/template_plugin/src/template_config.hpp
@@ -11,6 +11,8 @@

 #include <ie_parameter.hpp>

+#include <threading/ie_istreams_executor.hpp>
+
 namespace TemplatePlugin {

 template<typename T>
@@ -34,6 +36,7 @@ struct Configuration {

    int deviceId                = 0;
    bool perfCount              = true;
+    InferenceEngine::IStreamsExecutor::Config _streamsExecutorConfig;
 };
 // ! [configuration:header]

--- a/docs/template_plugin/src/template_executable_network.cpp
+++ b/docs/template_plugin/src/template_executable_network.cpp
@@ -16,40 +16,31 @@
 #include <threading/ie_executor_manager.hpp>
 #include <details/ie_cnn_network_tools.h>

-#include <ngraph/ngraph.hpp>
-
-#include <transformations/common_optimizations/common_optimizations.hpp>
-
+#include "template/template_config.hpp"
 #include "template_plugin.hpp"
 #include "template_executable_network.hpp"
-#include "template_pattern_transformation.hpp"

 using namespace TemplatePlugin;

 // ! [executable_network:ctor_cnnnetwork]
-TemplatePlugin::ExecutableNetwork::ExecutableNetwork(InferenceEngine::ICNNNetwork&  network,
-                                                     const Configuration&           cfg):
-    _name(network.getName()),
+TemplatePlugin::ExecutableNetwork::ExecutableNetwork(const std::shared_ptr<ngraph::Function>&   function,
+                                                     const Configuration&                       cfg,
+                                                     const Plugin::Ptr&                         plugin) :
+    InferenceEngine::ExecutableNetworkThreadSafeDefault(nullptr, nullptr), // Disable default threads creation
    _cfg(cfg),
-    _waitExecutor(InferenceEngine::ExecutorManager::getInstance()->getExecutor("Template")) {
+    _plugin(plugin),
+    _function(function) {
    // TODO: if your plugin supports device ID (more that single instance of device can be on host machine)
    // you should select proper device based on KEY_DEVICE_ID or automatic behavior
    // In this case, _waitExecutor should also be created per device.
-
    try {
-        if (std::shared_ptr<const ngraph::Function> ngraphFunction = network.getFunction()) {
-            CompileGraph(ngraphFunction);
-        } else {
-            THROW_IE_EXCEPTION << "TEMPLATE plugin can compile only IR v10 networks";
-        }
-    }
-    catch (const InferenceEngineException & e) {
-        throw e;
-    }
-    catch (const std::exception & e) {
+        CompileGraph();
+        InitExecutor();
+    } catch (const InferenceEngineException&) {
+        throw;
+    } catch (const std::exception & e) {
        THROW_IE_EXCEPTION << "Standard exception from compilation library: " << e.what();
-    }
-    catch (...) {
+    } catch (...) {
        THROW_IE_EXCEPTION << "Generic exception is thrown";
    }
 }
@@ -57,53 +48,53 @@ TemplatePlugin::ExecutableNetwork::ExecutableNetwork(InferenceEngine::ICNNNetwor

 // ! [executable_network:ctor_import_stream]
 TemplatePlugin::ExecutableNetwork::ExecutableNetwork(std::istream &                 model,
-                                                     const Configuration&           cfg) :
-                  _cfg(cfg) {
+                                                     const Configuration&           cfg,
+                                                     const Plugin::Ptr&             plugin) :
+    _cfg(cfg),
+    _plugin(plugin) {
    // TODO: since Import network is not a mandatory functionality, this ctor can just be removed
 }
 // ! [executable_network:ctor_import_stream]

 // ! [executable_network:compile_graph]
-void TemplatePlugin::ExecutableNetwork::CompileGraph(const std::shared_ptr<const ngraph::Function> & ngraphFunction) {
+void TemplatePlugin::ExecutableNetwork::CompileGraph() {
    // TODO: perform actual graph compilation taking `_cfg` into account

-    // 1.Copy ngraph::Function first to apply some transformations later in
-    // ExecutableNetwork::CompileGraph, which modify original ngraph::Function
-    const bool shareConsts = false, constFolding = false;
-    std::vector<::ngraph::element::Type> new_types;
-    std::vector<::ngraph::PartialShape> new_shapes;
-
-    for (const auto &parameter : ngraphFunction->get_parameters()) {
-        new_shapes.emplace_back(parameter->get_partial_shape());
-        new_types.emplace_back(parameter->get_element_type());
+    // Generate backend specific blob mappings. For example, Inference Engine does not use the ngraph::Result node's friendly name
+    // as the inference request output name, but the name of the layer before it.
+    for (auto&& result : _function->get_results()) {
+        auto previousOutput = result->get_input_source_output(0);
+        auto outputName = previousOutput.get_node()->get_friendly_name();
+        if (previousOutput.get_node()->get_output_size() > 1) {
+            outputName += '.' + std::to_string(previousOutput.get_index());
+        }
+        _outputIndex.emplace(outputName, _function->get_result_index(result));
+    }
+    for (auto&& parameter : _function->get_parameters()) {
+        _inputIndex.emplace(parameter->get_friendly_name(), _function->get_parameter_index(parameter));
    }

-    auto copyFunction = ngraph::specialize_function(std::const_pointer_cast<ngraph::Function>(ngraphFunction),
-        new_types, new_shapes, std::vector<void *>(new_types.size(), nullptr), constFolding, shareConsts);
-
-    // 2. Perform common optimizations and device-specific transformations
-    ngraph::pass::Manager passManager;
-    // Example: register CommonOptimizations transformation from transformations library
-    passManager.register_pass<ngraph::pass::CommonOptimizations>();
-    // Example: register plugin specific transformation
-    passManager.register_pass<ngraph::pass::DecomposeDivideMatcher>();
-    passManager.register_pass<ngraph::pass::ReluReluFusionMatcher>();
-    // Register any other transformations
-    // ..
-
-    // After `run_passes`, we have the transformed function, where operations match device operations,
-    // and we can create device hardware-dependent graph
-    passManager.run_passes(copyFunction);
-
-    // 3. Iterate over operations and create hardware-specific ngraph
-    for (const auto& op : copyFunction->get_ordered_ops()) {
-        // TODO: map ngraph `op` to device operation
-    }
-
-    // 4. Perform any other steps like allocation and filling device buffers, and so on
+    // Perform any other steps like allocation and filling device buffers, and so on
 }
 // ! [executable_network:compile_graph]

+// ! [executable_network:init_executor]
+void TemplatePlugin::ExecutableNetwork::InitExecutor() {
+    // Default multithreaded configuration is balanced for throughput and latency cases and takes into account
+    // real hardware cores and NUMA nodes.
+    auto streamsExecutorConfig = InferenceEngine::IStreamsExecutor::Config::MakeDefaultMultiThreaded(_cfg._streamsExecutorConfig);
+    streamsExecutorConfig._name = "TemplateStreamsExecutor";
+    // As the Inference Engine CPU Streams Executor creates some additional threads,
+    // it is better to avoid thread recreation, as some OS memory allocators cannot manage such use cases
+    // and memory consumption can be larger than expected.
+    // So Inference Engine provides an executors cache.
+    _taskExecutor = ExecutorManager::getInstance()->getIdleCPUStreamsExecutor(streamsExecutorConfig);
+    // NOTE: callback Executor is not configured. So callback will be called in the thread of the last stage of inference request pipeline
+    // _callbackExecutor = ExecutorManager::getInstance()->getIdleCPUStreamsExecutor({"TemplateCallbackExecutor"});
+}
+// ! [executable_network:init_executor]
+
+
 // ! [executable_network:create_infer_request_impl]
 InferenceEngine::InferRequestInternal::Ptr TemplatePlugin::ExecutableNetwork::CreateInferRequestImpl(InferenceEngine::InputsDataMap networkInputs,
                                                                                                     InferenceEngine::OutputsDataMap networkOutputs) {
@@ -115,7 +106,7 @@ InferenceEngine::InferRequestInternal::Ptr TemplatePlugin::ExecutableNetwork::Cr
 void TemplatePlugin::ExecutableNetwork::CreateInferRequest(IInferRequest::Ptr& asyncRequest) {
    auto internalRequest = CreateInferRequestImpl(_networkInputs, _networkOutputs);
    auto asyncThreadSafeImpl = std::make_shared<TemplateAsyncInferRequest>(std::static_pointer_cast<TemplateInferRequest>(internalRequest),
-                                                                           _taskExecutor, _waitExecutor, _callbackExecutor);
+                                                                           _taskExecutor, _plugin->_waitExecutor, _callbackExecutor);
    asyncRequest.reset(new InferenceEngine::InferRequestBase<TemplateAsyncInferRequest>(asyncThreadSafeImpl),
                       [](InferenceEngine::IInferRequest *p) { p->Release(); });
    asyncThreadSafeImpl->SetPointerToPublicInterface(asyncRequest);
@@ -124,13 +115,7 @@ void TemplatePlugin::ExecutableNetwork::CreateInferRequest(IInferRequest::Ptr& a

 // ! [executable_network:get_config]
 void TemplatePlugin::ExecutableNetwork::GetConfig(const std::string &name, Parameter &result, ResponseDesc *resp) const {
-    // TODO: return more supported values for config keys
-    if (name == CONFIG_KEY(DEVICE_ID) ||
-        name == CONFIG_KEY(PERF_COUNT)) {
-        result = _cfg.Get(name);
-    } else {
-        THROW_IE_EXCEPTION << "Unsupported ExecutableNetwork config key: " << name;
-    }
+    result = _cfg.Get(name);
 }
 // ! [executable_network:get_config]

@@ -144,14 +129,20 @@ void TemplatePlugin::ExecutableNetwork::GetMetric(const std::string &name, Infer
            METRIC_KEY(SUPPORTED_CONFIG_KEYS),
            METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS)});
    } else if (METRIC_KEY(SUPPORTED_CONFIG_KEYS) == name) {
-        result = IE_SET_METRIC(SUPPORTED_CONFIG_KEYS, std::vector<std::string>{
+        std::vector<std::string> configKeys = {
            CONFIG_KEY(DEVICE_ID),
-            CONFIG_KEY(PERF_COUNT)});
+            CONFIG_KEY(PERF_COUNT),
+            TEMPLATE_CONFIG_KEY(THROUGHPUT_STREAMS) };
+        auto streamExecutorConfigKeys = IStreamsExecutor::Config{}.SupportedKeys();
+        for (auto&& configKey : streamExecutorConfigKeys) {
+            configKeys.emplace_back(configKey);
+        }
+        result = IE_SET_METRIC(SUPPORTED_CONFIG_KEYS, configKeys);
    } else if (METRIC_KEY(NETWORK_NAME) == name) {
-        result = IE_SET_METRIC(NETWORK_NAME, _name);
+        auto networkName = _function->get_friendly_name();
+        result = IE_SET_METRIC(NETWORK_NAME, networkName);
    } else if (METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS) == name) {
-        // TODO: fill with actual number
-        unsigned int value = 1;
+        unsigned int value = _cfg._streamsExecutorConfig._streams;
        result = IE_SET_METRIC(OPTIMAL_NUMBER_OF_INFER_REQUESTS, value);
    } else {
        THROW_IE_EXCEPTION << "Unsupported ExecutableNetwork metric: " << name;
--- a/docs/template_plugin/src/template_executable_network.hpp
+++ b/docs/template_plugin/src/template_executable_network.hpp
@@ -27,7 +27,7 @@

 namespace TemplatePlugin {

-class Engine;
+class Plugin;

 /**
 * @class ExecutableNetwork
@@ -36,11 +36,13 @@ class Engine;
 // ! [executable_network:header]
 class ExecutableNetwork : public InferenceEngine::ExecutableNetworkThreadSafeDefault {
 public:
-    ExecutableNetwork(InferenceEngine::ICNNNetwork&  network,
-                      const Configuration&           cfg);
+    ExecutableNetwork(const std::shared_ptr<ngraph::Function>&  function,
+                      const Configuration&                      cfg,
+                      const std::shared_ptr<Plugin>&            plugin);

-    ExecutableNetwork(std::istream &                 model,
-                      const Configuration&           cfg);
+    ExecutableNetwork(std::istream&                  model,
+                      const Configuration&           cfg,
+                      const std::shared_ptr<Plugin>& plugin);

    ~ExecutableNetwork() override = default;

@@ -53,15 +55,18 @@ public:
    void GetMetric(const std::string &name, InferenceEngine::Parameter &result, InferenceEngine::ResponseDesc *resp) const override;
    void GetConfig(const std::string &name, InferenceEngine::Parameter &result, InferenceEngine::ResponseDesc *resp) const override;

-    std::atomic<std::size_t>                    _requestId = {0};
-    std::string                                 _name;
-    Configuration                               _cfg;
-
 private:
-    void CompileGraph(const std::shared_ptr<const ngraph::Function> & ngraphFunction);
+    friend class TemplateInferRequest;

-    std::shared_ptr<Engine>                     _plugin;
-    InferenceEngine::ITaskExecutor::Ptr         _waitExecutor;
+    void CompileGraph();
+    void InitExecutor();
+
+    std::atomic<std::size_t>                    _requestId = {0};
+    Configuration                               _cfg;
+    std::shared_ptr<Plugin>                     _plugin;
+    std::shared_ptr<ngraph::Function>           _function;
+    std::map<std::string, std::size_t>          _inputIndex;
+    std::map<std::string, std::size_t>          _outputIndex;
 };
 // ! [executable_network:header]

--- a/docs/template_plugin/src/template_infer_request.cpp
+++ b/docs/template_plugin/src/template_infer_request.cpp
@@ -18,17 +18,16 @@
 #include <ie_parallel.hpp>
 #include <ie_memcpy.h>
 #include <precision_utils.h>
-#include <template/template_config.hpp>

+#include "template/template_config.hpp"
 #include "template_infer_request.hpp"
 #include "template_executable_network.hpp"
 #include "template_plugin.hpp"

 using namespace TemplatePlugin;
+using namespace InferenceEngine;

 using Time = std::chrono::high_resolution_clock;
-using ns = std::chrono::nanoseconds;
-using fsec = std::chrono::duration<float>;

 // ! [infer_request:ctor]
 TemplateInferRequest::TemplateInferRequest(const InferenceEngine::InputsDataMap&                     networkInputs,
@@ -38,10 +37,9 @@ TemplateInferRequest::TemplateInferRequest(const InferenceEngine::InputsDataMap&
    _executableNetwork(executableNetwork) {
    // TODO: allocate infer request device and host buffers if needed, fill actual list of profiling tasks

-    auto requestID = std::to_string(_executableNetwork->_requestId);
-    _executableNetwork->_requestId++;
+    auto requestID = std::to_string(_executableNetwork->_requestId.fetch_add(1));

-    std::string name = _executableNetwork->_name + "_Req" + requestID;
+    std::string name = _executableNetwork->_function->get_friendly_name() + "_Req" + requestID;
    _profilingTask = { {
        { ProfilingTask("Template" + std::to_string(_executableNetwork->_cfg.deviceId) + "_" + name + "_Preprocess") },
        { ProfilingTask("Template" + std::to_string(_executableNetwork->_cfg.deviceId) + "_" + name + "_Postprocess") },
@@ -49,9 +47,12 @@ TemplateInferRequest::TemplateInferRequest(const InferenceEngine::InputsDataMap&
        { ProfilingTask("Template" + std::to_string(_executableNetwork->_cfg.deviceId) + "_" + name + "_WaitPipline") },
    } };

+    _executable = _executableNetwork->_plugin->_backend->compile(_executableNetwork->_function);
+    _parameters = _executableNetwork->_function->get_parameters();
+    _results = _executableNetwork->_function->get_results();
+
    allocateDeviceBuffers();
-    allocateInputBlobs();
-    allocateOutputBlobs();
+    allocateBlobs();
 }
 // ! [infer_request:ctor]

@@ -62,92 +63,66 @@ TemplateInferRequest::~TemplateInferRequest() {
 // ! [infer_request:dtor]

 void TemplateInferRequest::allocateDeviceBuffers() {
-    // TODO: allocate device buffers if Template device is a remote one
+    // Allocate plugin backend specific memory handles
+    _inputTensors.resize(_networkInputs.size());
+    _outputTensors.resize(_networkOutputs.size());
 }

-void TemplateInferRequest::allocateInputBlobs() {
-    for (auto &networkInput : _networkInputs) {
-        SizeVector dims = networkInput.second->getTensorDesc().getDims();
-        Precision precision = networkInput.second->getTensorDesc().getPrecision();
-        Layout input_layout = networkInput.second->getInputData()->getLayout();
-        Blob::Ptr inputBlob;
-        Blob::Ptr inputBlobNCHW;
+template<typename BlobDataMap, typename GetNetworkPrecisionF>
+static void AllocateImpl(const BlobDataMap& blobDataMap,
+                         BlobMap& blobMap,
+                         BlobMap& networkBlobMap,
+                         GetNetworkPrecisionF&& GetNetworkPrecision) {
+    for (auto&& blobData : blobDataMap) {
+        auto& dims = blobData.second->getTensorDesc().getDims();
+        auto& precision = blobData.second->getTensorDesc().getPrecision();
+        auto layout = blobData.second->getTensorDesc().getLayout();
+        Blob::Ptr blob;
        switch (precision) {
-        case Precision::FP32 :
-            inputBlobNCHW = inputBlob = InferenceEngine::make_shared_blob<float>({ precision, dims, input_layout });
-            if (input_layout == Layout::NHWC) {
-                inputBlobNCHW = InferenceEngine::make_shared_blob<float>({ precision, dims, Layout::NCHW });
-            }
-            break;
-        case Precision::FP16 :
-        case Precision::I16 :
-            inputBlobNCHW = inputBlob = InferenceEngine::make_shared_blob<int16_t>({ precision, dims, input_layout });
-            if (input_layout == Layout::NHWC) {
-                inputBlobNCHW = InferenceEngine::make_shared_blob<int16_t>({ precision, dims, Layout::NCHW });
-            }
-            break;
-        case Precision::U8 :
-            inputBlobNCHW = inputBlob = InferenceEngine::make_shared_blob<uint8_t>({ precision, dims, input_layout });
-            if (input_layout == Layout::NHWC) {
-                inputBlobNCHW = InferenceEngine::make_shared_blob<uint8_t>({ precision, dims, Layout::NCHW });
-            }
-            break;
-        default:
-            THROW_IE_EXCEPTION << "Unsupported network precision: " << precision
-                << precision << "! Supported precisions are: FP32, FP16, I16, U8";
+        case Precision::U8: {
+            blob = InferenceEngine::make_shared_blob<std::uint8_t>({precision, dims, layout});
+        } break;
+        case Precision::FP32 : {
+            blob = InferenceEngine::make_shared_blob<float>({precision, dims, layout});
+        } break;
+        default: THROW_IE_EXCEPTION << "Template Plugin: Unsupported Input/Output Presision";
        }
-        // allocate the input blob
-        inputBlob->allocate();
-        _inputs[networkInput.first] = inputBlob;
-        if (inputBlobNCHW != inputBlob) {
-            inputBlobNCHW->allocate();
+        blob->allocate();
+        blobMap[blobData.first] = blob;
+
+        auto networkPresion = GetNetworkPrecision(blobData.first);
+        Blob::Ptr networkBlob;
+        switch (networkPresion) {
+        case ngraph::element::Type_t::f32 : {
+            if (precision == Precision::FP32) {
+                networkBlob = blob;
+            } else {
+                networkBlob = InferenceEngine::make_shared_blob<float>({Precision::FP32, dims, layout});
+            }
+        } break;
+        default: THROW_IE_EXCEPTION << "Template Plugin: Unsupported network Input/Output Presision";
        }
-        _inputsNCHW[networkInput.first] = inputBlobNCHW;
+        if (blob != networkBlob) {
+            networkBlob->allocate();
+        }
+        networkBlobMap[blobData.first] = networkBlob;
    }
 }

-void TemplateInferRequest::allocateOutputBlobs() {
-    for (auto &networkOutput : _networkOutputs) {
-        SizeVector dims = networkOutput.second->getTensorDesc().getDims();
-        Precision precision = networkOutput.second->getPrecision();
-        Blob::Ptr outputBlob;
-
-        // allocate the output blob
-        Blob::Ptr outputBlobNCHW;
-        switch (precision) {
-        case Precision::FP32 :
-            outputBlobNCHW = outputBlob = InferenceEngine::make_shared_blob<float>({ precision, dims, networkOutput.second->getLayout() });
-            if (networkOutput.second->getLayout() == Layout::NHWC) {
-                outputBlobNCHW = InferenceEngine::make_shared_blob<float>({ precision, dims,  Layout::NCHW });
-            }
-            break;
-        case Precision::FP16 :
-            outputBlobNCHW = outputBlob = InferenceEngine::make_shared_blob<int16_t>({ precision, dims, networkOutput.second->getLayout() });
-            if (networkOutput.second->getLayout() == Layout::NHWC) {
-                outputBlobNCHW = InferenceEngine::make_shared_blob<int16_t>({ precision, dims, Layout::NCHW });
-            }
-            break;
-        default:
-            THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str << "Unsupported output precision: "
-                << precision << "! Supported precisions are: FP32, FP16";
-        }
-        // allocate the output blob
-        outputBlob->allocate();
-        _outputs[networkOutput.first] = outputBlob;
-        if (outputBlobNCHW != outputBlob) {
-            outputBlobNCHW->allocate();
-        }
-        _outputsNCHW[networkOutput.first] = outputBlobNCHW;
-    }
-
-    if (_networkOutputs.empty() || _networkInputs.empty()) {
-        THROW_IE_EXCEPTION << "Internal error: no information about network's output/input";
-    }
+void TemplateInferRequest::allocateBlobs() {
+    auto&& parameters = _executableNetwork->_function->get_parameters();
+    AllocateImpl(_networkInputs, _inputs, _networkInputBlobs, [&] (const std::string& blobName) {
+        return parameters.at(_executableNetwork->_inputIndex.at(blobName))->get_element_type();
+    });
+    auto&& results = _executableNetwork->_function->get_results();
+    AllocateImpl(_networkOutputs, _outputs, _networkOutputBlobs, [&] (const std::string& blobName) {
+        return results.at(_executableNetwork->_outputIndex.at(blobName))->get_element_type();
+    });
 }

 // ! [infer_request:infer_impl]
 void TemplateInferRequest::InferImpl() {
-    // TODO: fill with actual list of pipeline stages, which are executed syncronously for sync infer requests
+    // TODO: fill with actual list of pipeline stages, which are executed synchronously for sync infer requests
    inferPreprocess();
    startPipeline();
    waitPipeline();
@@ -155,50 +130,109 @@ void TemplateInferRequest::InferImpl() {
 }
 // ! [infer_request:infer_impl]

-// ! [infer_request:infer_preprocess]
-void TemplateInferRequest::inferPreprocess() {
-    auto prev = Time::now();
+template<typename SrcT, typename DstT>
+static void blobCopy(const Blob::Ptr& src, const Blob::Ptr& dst) {
+    std::copy_n(InferenceEngine::as<InferenceEngine::MemoryBlob>(src)->rmap().as<const SrcT*>(),
+                src->size(),
+                InferenceEngine::as<InferenceEngine::MemoryBlob>(dst)->wmap().as<DstT*>());
+}

-    // execute input pre-processing.
-    InferRequestInternal::execDataPreprocessing(_inputs);
-
-    for (auto &input : InferRequestInternal::_inputs) {
-        auto& src = input.second;
-        auto& dst = _inputsNCHW[input.first];
-        if (src != dst) {
-            if (src->getTensorDesc().getPrecision() == dst->getTensorDesc().getPrecision()
-                && src->getTensorDesc().getDims() == dst->getTensorDesc().getDims()
-                && src->getTensorDesc().getLayout() == dst->getTensorDesc().getLayout()) {
-                _inputsNCHW[input.first] = input.second;
-            } else {  // Convert Layout to NCHW
-                InferenceEngine::blob_copy(src, dst);
+static void blobCopy(const Blob::Ptr& src, const Blob::Ptr& dst) {
+    switch (src->getTensorDesc().getPrecision()) {
+        case Precision::U8 : {
+            switch (dst->getTensorDesc().getPrecision()) {
+                case Precision::U8 : break;
+                case Precision::FP32 : {
+                    blobCopy<std::uint8_t, float>(src, dst);
+                } break;
+                default : {
+                    THROW_IE_EXCEPTION << "Unsupported precision conversion from "
+                        << src->getTensorDesc().getPrecision() <<" to " << dst->getTensorDesc().getPrecision();
+                }
            }
+        } break;
+        case Precision::FP32 : {
+            switch (dst->getTensorDesc().getPrecision()) {
+                case Precision::FP32 : break;
+                case Precision::U8 : {
+                    blobCopy<float, std::uint8_t>(src, dst);
+                } break;
+                default : {
+                    THROW_IE_EXCEPTION << "Unsupported precision conversion from "
+                        << src->getTensorDesc().getPrecision() <<" to " << dst->getTensorDesc().getPrecision();
+                }
+            }
+        } break;
+        default : {
+            THROW_IE_EXCEPTION << "Unsupported precision conversion from " << src->getTensorDesc().getPrecision();
        }
    }
+}

-    // TODO: Preprocessing on inputs if needed: work _inputsNCHW
-
-    _inputPreprocessTime = static_cast<double>(std::chrono::duration_cast<ns>(Time::now() - prev).count());
+// ! [infer_request:infer_preprocess]
+void TemplateInferRequest::inferPreprocess() {
+    IE_PROFILING_AUTO_SCOPE_TASK(_profilingTask[Preprocess]);
+    auto start = Time::now();
+    // NOTE: After the InferRequestInternal::execDataPreprocessing call,
+    //       an input can point to a different memory region than the one allocated in the constructor.
+    InferRequestInternal::execDataPreprocessing(_inputs);
+    for (auto&& input : _inputs) {
+        auto inputBlob = input.second;
+        auto networkInput = _networkInputBlobs[input.first];
+        if (inputBlob->getTensorDesc().getPrecision() == networkInput->getTensorDesc().getPrecision()) {
+            networkInput = inputBlob;
+        } else {
+            blobCopy(inputBlob, networkInput);
+        }
+        auto index = _executableNetwork->_inputIndex[input.first];
+        const auto& parameter = _parameters[index];
+        const auto& parameterShape = parameter->get_shape();
+        const auto& parameterType = parameter->get_element_type();
+        _inputTensors[index] = _executableNetwork->_plugin->_backend->create_tensor(parameterType, parameterShape,
+            InferenceEngine::as<InferenceEngine::MemoryBlob>(networkInput)->rmap().as<void*>());
+    }
+    for (auto&& output : _outputs) {
+        auto outputBlob = output.second;
+        auto networkOutput = _networkOutputBlobs[output.first];
+        auto index = _executableNetwork->_outputIndex[output.first];
+        if (outputBlob->getTensorDesc().getPrecision() == networkOutput->getTensorDesc().getPrecision()) {
+            networkOutput = outputBlob;
+        }
+        const auto& result = _results[index];
+        const auto& resultShape = result->get_shape();
+        const auto& resultType = result->get_element_type();
+        _outputTensors[index] = _executableNetwork->_plugin->_backend->create_tensor(resultType, resultShape,
+            InferenceEngine::as<InferenceEngine::MemoryBlob>(networkOutput)->wmap().as<void*>());
+    }
+    _durations[Preprocess] = Time::now() - start;
 }
 // ! [infer_request:infer_preprocess]

 void TemplateInferRequest::startPipeline() {
    IE_PROFILING_AUTO_SCOPE_TASK(_profilingTask[StartPipeline])
-    // TODO: Start pipeline and fill _inputTransferTime, _executeTime, _outputTransferTime
+    auto start = Time::now();
+    _executable->call(_outputTensors, _inputTensors);
+    _durations[StartPipeline] = Time::now() - start;
 }

 void TemplateInferRequest::waitPipeline() {
    IE_PROFILING_AUTO_SCOPE_TASK(_profilingTask[WaitPipeline])
-    auto prev = Time::now();
-    // TODO: Wait pipeline using driver API or other synronizations methods
-    _inputPreprocessTime = static_cast<double>(std::chrono::duration_cast<ns>(Time::now() - prev).count());
+    auto start = Time::now();
+    // TODO: Wait for the pipeline using the driver API or other synchronization methods
+    _durations[WaitPipeline] = Time::now() - start;
 }

 void TemplateInferRequest::inferPostprocess() {
-    IE_PROFILING_AUTO_SCOPE_TASK(_profilingTask[Postprocess])
-    auto prev = Time::now();
-    // TODO: perform post-processing and convert to NHWC layout
-    _outputPostProcessTime = static_cast<double>(std::chrono::duration_cast<ns>(Time::now() - prev).count());
+    IE_PROFILING_AUTO_SCOPE_TASK(_profilingTask[Postprocess]);
+    auto start = Time::now();
+    for (auto&& output : _outputs) {
+        auto outputBlob = output.second;
+        auto networkOutput = _networkOutputBlobs[output.first];
+        if (outputBlob->getTensorDesc().getPrecision() != networkOutput->getTensorDesc().getPrecision()) {
+            blobCopy(networkOutput, outputBlob);
+        }
+    }
+    _durations[Postprocess] = Time::now() - start;
 }

 // ! [infer_request:get_performance_counts]
@@ -206,18 +240,19 @@ void TemplateInferRequest::GetPerformanceCounts(std::map<std::string, InferenceE
    InferenceEngineProfileInfo info;
    info.execution_index = 0;
    info.status = InferenceEngineProfileInfo::EXECUTED;
-    info.cpu_uSec = info.realTime_uSec = _inputPreprocessTime / 1000;
+    info.cpu_uSec = info.realTime_uSec = _durations[Preprocess].count();
    perfMap["1. input preprocessing"] = info;
    info.cpu_uSec = 0;
-    info.realTime_uSec = _inputTransferTime / 1000;
+    info.realTime_uSec = 0;
    perfMap["2. input transfer to a device"] = info;
    info.cpu_uSec = 0;
-    info.realTime_uSec = _executeTime / 1000;
+    info.status = InferenceEngineProfileInfo::EXECUTED;
+    info.cpu_uSec = info.realTime_uSec = _durations[StartPipeline].count();
    perfMap["3. execution time"] = info;
    info.cpu_uSec = 0;
-    info.realTime_uSec = _outputTransferTime / 1000;
+    info.realTime_uSec = 0;
    perfMap["4. output transfer from a device"] = info;
-    info.cpu_uSec = info.realTime_uSec = _outputPostProcessTime / 1000;
+    info.cpu_uSec = info.realTime_uSec = _durations[Postprocess].count();
    perfMap["5. output postprocessing"] = info;
 }
 // ! [infer_request:get_performance_counts]
--- a/docs/template_plugin/src/template_infer_request.hpp
+++ b/docs/template_plugin/src/template_infer_request.hpp
@@ -17,8 +17,13 @@
 #include <cpp_interfaces/impl/ie_executable_network_internal.hpp>
 #include <threading/ie_itask_executor.hpp>

+#include <ngraph/runtime/tensor.hpp>
+#include <ngraph/runtime/tensor.hpp>
+#include <executable.hpp>
+
 #include "template_config.hpp"

+
 namespace TemplatePlugin {

 class ExecutableNetwork;
@@ -46,8 +51,7 @@ public:

 private:
    void allocateDeviceBuffers();
-    void allocateInputBlobs();
-    void allocateOutputBlobs();
+    void allocateBlobs();

    enum {
        Preprocess,
@@ -57,17 +61,18 @@ private:
        numOfStages
    };

-    std::array<InferenceEngine::ProfilingTask, numOfStages> _profilingTask;
+    std::array<InferenceEngine::ProfilingTask, numOfStages>             _profilingTask;
+    // for performance counters
+    std::array<std::chrono::duration<float, std::micro>, numOfStages>   _durations;

-    InferenceEngine::BlobMap                                _inputsNCHW;
-    InferenceEngine::BlobMap                                _outputsNCHW;
+    InferenceEngine::BlobMap                                _networkInputBlobs;
+    InferenceEngine::BlobMap                                _networkOutputBlobs;
+    ngraph::ParameterVector                                 _parameters;
+    ngraph::ResultVector                                    _results;

-    // for performance counts
-    double                                                  _inputPreprocessTime   = 0.0;
-    double                                                  _inputTransferTime     = 0.0;
-    double                                                  _executeTime           = 0.0;
-    double                                                  _outputTransferTime    = 0.0;
-    double                                                  _outputPostProcessTime = 0.0;
+    std::vector<std::shared_ptr<ngraph::runtime::Tensor>>   _inputTensors;
+    std::vector<std::shared_ptr<ngraph::runtime::Tensor>>   _outputTensors;
+    std::shared_ptr<ngraph::runtime::Executable>            _executable;
 };
 // ! [infer_request:header]

--- a/docs/template_plugin/src/template_plugin.cpp
+++ b/docs/template_plugin/src/template_plugin.cpp
@@ -24,11 +24,17 @@
 #include <ie_input_info.hpp>
 #include <ie_layouts.h>
 #include <hetero/hetero_plugin_config.hpp>
-#include <template/template_config.hpp>
-
+#include <backend.hpp>
+#include <ngraph/specialize_function.hpp>
+#include <ngraph/pass/manager.hpp>
+#include <ngraph/opsets/opset.hpp>
+#include <transformations/common_optimizations/common_optimizations.hpp>
+#include <transformations/rt_info/fused_names_attribute.hpp>
+#include "template/template_config.hpp"
 #include "template_plugin.hpp"
 #include "template_executable_network.hpp"
 #include "template_infer_request.hpp"
+#include "template_pattern_transformation.hpp"

 using namespace TemplatePlugin;

@@ -36,9 +42,61 @@ using namespace TemplatePlugin;
 Plugin::Plugin() {
    // TODO: fill with actual device name
    _pluginName = "TEMPLATE";
+    ngraph::runtime::Backend::set_backend_shared_library_search_directory("");
+    _backend = ngraph::runtime::Backend::create("INTERPRETER");
+    _waitExecutor = ExecutorManager::getInstance()->getIdleCPUStreamsExecutor({"TemplateWaitExecutor"});
 }
 // ! [plugin:ctor]

+// ! [plugin:dtor]
+Plugin::~Plugin() {
+    // The plugin should remove its executors from the executor cache to avoid growth of the number of threads in the whole application
+    ExecutorManager::getInstance()->clear("TemplateStreamsExecutor");
+    ExecutorManager::getInstance()->clear("TemplateWaitExecutor");
+    // NOTE: Uncomment this if Inference Engine Executor cache is used to create callback executor
+    // ExecutorManager::getInstance()->clear("TemplateCallbackExecutor");
+}
+// ! [plugin:dtor]
+
+// ! [plugin:transform]
+std::shared_ptr<ngraph::Function> Plugin::Transform(const std::shared_ptr<const ngraph::Function>& function) {
+    // 1. Copy the ngraph::Function first, since the transformations applied below modify the ngraph::Function they are run on
+    const bool shareConsts = false, constFolding = false;
+    std::vector<::ngraph::element::Type> new_types;
+    std::vector<::ngraph::PartialShape> new_shapes;
+
+    for (const auto &parameter : function->get_parameters()) {
+        new_shapes.emplace_back(parameter->get_partial_shape());
+        new_types.emplace_back(parameter->get_element_type());
+    }
+
+    auto copyFunction = ngraph::specialize_function(std::const_pointer_cast<ngraph::Function>(function),
+        new_types, new_shapes, std::vector<void *>(new_types.size(), nullptr), constFolding, shareConsts);
+
+    copyFunction->set_friendly_name(function->get_friendly_name());
+
+    // 2. Perform common optimizations and device-specific transformations
+    ngraph::pass::Manager passManager;
+    // Example: register CommonOptimizations transformation from transformations library
+    passManager.register_pass<ngraph::pass::CommonOptimizations>();
+    // Example: register plugin specific transformation
+    passManager.register_pass<ngraph::pass::DecomposeDivideMatcher>();
+    passManager.register_pass<ngraph::pass::ReluReluFusionMatcher>();
+    // Register any other transformations
+    // ..
+
+    // After `run_passes`, we have the transformed function, where operations match device operations,
+    // and we can create device hardware-dependent graph
+    passManager.run_passes(copyFunction);
+
+    // 3. Iterate over operations and create hardware-specific ngraph
+    for (const auto& op : copyFunction->get_ordered_ops()) {
+        // TODO: map ngraph `op` to device operation
+    }
+    return copyFunction;
+}
+// ! [plugin:transform]
+
 // ! [plugin:load_exe_network_impl]
 InferenceEngine::ExecutableNetworkInternal::Ptr Plugin::LoadExeNetworkImpl(const InferenceEngine::ICNNNetwork & network,
                                                                           const ConfigMap &config) {
@@ -72,9 +130,12 @@ InferenceEngine::ExecutableNetworkInternal::Ptr Plugin::LoadExeNetworkImpl(const
        }
    }

-    auto clonedNetwork = cloneNet(network);
+    auto function = network.getFunction();
+    if (function == nullptr) {
+        THROW_IE_EXCEPTION << "TEMPLATE plugin can compile only IR v10 networks";
+    }

-    return std::make_shared<ExecutableNetwork>(*clonedNetwork, cfg);
+    return std::make_shared<ExecutableNetwork>(Transform(function), cfg, std::static_pointer_cast<Plugin>(shared_from_this()));
 }
 // ! [plugin:load_exe_network_impl]

@@ -90,7 +151,7 @@ InferenceEngine::ExecutableNetwork Plugin::ImportNetworkImpl(std::istream& model
    auto cfg = Configuration(config, exportedCfg);

    IExecutableNetwork::Ptr executableNetwork;
-    auto exec_network_impl = std::make_shared<ExecutableNetwork>(model, cfg);
+    auto exec_network_impl = std::make_shared<ExecutableNetwork>(model, cfg, std::static_pointer_cast<Plugin>(shared_from_this()));
    executableNetwork.reset(new ExecutableNetworkBase<ExecutableNetworkInternal>(exec_network_impl),
                            [](InferenceEngine::details::IRelease *p) {p->Release(); });

@@ -101,19 +162,42 @@ InferenceEngine::ExecutableNetwork Plugin::ImportNetworkImpl(std::istream& model
 // ! [plugin:query_network]
 void Plugin::QueryNetwork(const ICNNNetwork &network, const ConfigMap& config, QueryNetworkResult &res) const {
    Configuration cfg{config, _cfg, false};
-    res.rc = StatusCode::OK;
-
-    if (std::shared_ptr<const ngraph::Function> ngraphFunction = network.getFunction()) {
-        auto ops = ngraphFunction->get_ordered_ops();
-        for (auto&& op : ops) {
-            // TODO: investigate if an op is actually supported by Template device
-            bool supported = true;
-            if (supported) {
-                res.supportedLayersMap.insert({ op->get_friendly_name(), GetName() });
+    auto function = network.getFunction();
+    if (function == nullptr) {
+         THROW_IE_EXCEPTION << "Template Plugin supports only ngraph cnn network representation";
+    }
+    // First, store the friendly names of the operations in the original (input) function
+    std::unordered_set<std::string> originalOps;
+    for (auto&& node : function->get_ops()) {
+        originalOps.emplace(node->get_friendly_name());
+    }
+    // Apply all transformations in the same way as it is done in LoadExeNetworkImpl
+    auto transformedFunction = Transform(function);
+    // The same input node can be transformed into both supported and unsupported backend nodes,
+    // so we need to keep both the supported and the unsupported node sets
+    std::unordered_set<std::string> supported;
+    std::unordered_set<std::string> unsupported;
+    auto opset = ngraph::get_opset4();
+    for (auto&& node : transformedFunction->get_ops()) {
+        if (!ngraph::op::is_constant(node) && !ngraph::op::is_parameter(node) && !ngraph::op::is_output(node)) {
+            // Extract transformation history from transformed node as list of nodes
+            for (auto&& fusedLayerName : ngraph::getFusedNamesVector(node)) {
+                // Filter just nodes from original operation set
+                if (contains(originalOps, fusedLayerName)) {
+                    if (opset.contains_type_insensitive(fusedLayerName)) {
+                        supported.emplace(fusedLayerName);
+                    } else {
+                        unsupported.emplace(fusedLayerName);
+                    }
+                }
            }
        }
-    } else {
-        THROW_IE_EXCEPTION << "TEMPLATE plugin can query only IR v10 networks";
+    }
+    // The result set should contain only nodes from the supported set that are not also in the unsupported set
+    for (auto&& layerName : supported) {
+        if (!contains(unsupported, layerName)) {
+            res.supportedLayersMap.emplace(layerName, GetName());
+        }
    }
 }
 // ! [plugin:query_network]
@@ -148,10 +232,17 @@ InferenceEngine::Parameter Plugin::GetMetric(const std::string& name, const std:
            METRIC_KEY(RANGE_FOR_ASYNC_INFER_REQUESTS) };
        IE_SET_METRIC_RETURN(SUPPORTED_METRICS, supportedMetrics);
    } else if (METRIC_KEY(SUPPORTED_CONFIG_KEYS) == name) {
-        std::vector<std::string> confiKeys = {
+        std::vector<std::string> configKeys = {
            CONFIG_KEY(DEVICE_ID),
-            CONFIG_KEY(PERF_COUNT) };
-        IE_SET_METRIC_RETURN(SUPPORTED_CONFIG_KEYS, confiKeys);
+            CONFIG_KEY(PERF_COUNT),
+            TEMPLATE_CONFIG_KEY(THROUGHPUT_STREAMS)};
+        auto streamExecutorConfigKeys = IStreamsExecutor::Config{}.SupportedKeys();
+        for (auto&& configKey : streamExecutorConfigKeys) {
+            if (configKey != InferenceEngine::PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS) {
+                configKeys.emplace_back(configKey);
+            }
+        }
+        IE_SET_METRIC_RETURN(SUPPORTED_CONFIG_KEYS, configKeys);
    } else if (METRIC_KEY(AVAILABLE_DEVICES) == name) {
        // TODO: fill list of available devices
        std::vector<std::string> availableDevices = { "" };
@@ -161,7 +252,7 @@ InferenceEngine::Parameter Plugin::GetMetric(const std::string& name, const std:
        IE_SET_METRIC_RETURN(FULL_DEVICE_NAME, name);
    } else if (METRIC_KEY(OPTIMIZATION_CAPABILITIES) == name) {
        // TODO: fill actual list of supported capabilities: e.g. Template device supports only FP32
-        std::vector<std::string> capabilities = { METRIC_VALUE(FP32), TEMPLATE_METRIC_VALUE(HARDWARE_CONVOLUTION) };
+        std::vector<std::string> capabilities = { METRIC_VALUE(FP32) /*, TEMPLATE_METRIC_VALUE(HARDWARE_CONVOLUTION)*/ };
        IE_SET_METRIC_RETURN(OPTIMIZATION_CAPABILITIES, capabilities);
    } else if (METRIC_KEY(RANGE_FOR_ASYNC_INFER_REQUESTS) == name) {
        // TODO: fill with actual values
--- a/docs/template_plugin/src/template_plugin.hpp
+++ b/docs/template_plugin/src/template_plugin.hpp
@@ -17,6 +17,8 @@
 #include "template_executable_network.hpp"
 #include "template_config.hpp"

+#include "backend.hpp"
+
 //! [plugin:header]
 namespace TemplatePlugin {

@@ -25,7 +27,7 @@ public:
    using Ptr = std::shared_ptr<Plugin>;

    Plugin();
-    ~Plugin() override = default;
+    ~Plugin() override;

    void SetConfig(const std::map<std::string, std::string> &config) override;
    void QueryNetwork(const InferenceEngine::ICNNNetwork &network,
@@ -40,7 +42,14 @@ public:
    InferenceEngine::ExecutableNetwork ImportNetworkImpl(std::istream& model, const std::map<std::string, std::string>& config) override;

 private:
-    Configuration                    _cfg;
+    friend class ExecutableNetwork;
+    friend class TemplateInferRequest;
+
+    static std::shared_ptr<ngraph::Function> Transform(const std::shared_ptr<const ngraph::Function>& function);
+
+    Configuration                               _cfg;
+    std::shared_ptr<ngraph::runtime::Backend>   _backend;
+    InferenceEngine::ITaskExecutor::Ptr         _waitExecutor;
 };

 }  // namespace TemplatePlugin
--- a/docs/template_plugin/tests/functional/CMakeLists.txt
+++ b/docs/template_plugin/tests/functional/CMakeLists.txt
@@ -16,3 +16,5 @@ addIeTargetTest(
        LABELS
            TEMPLATE
 )
+
+target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include)
--- a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/config.cpp
+++ b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/config.cpp
@@ -5,6 +5,7 @@
 #include "multi-device/multi_device_config.hpp"

 #include "behavior/config.hpp"
+#include <template/template_config.hpp>

 using namespace BehaviorTestsDefinitions;
 namespace {
@@ -14,14 +15,20 @@ namespace {
    };

    const std::vector<std::map<std::string, std::string>> configs = {
-            {}
+            {{TEMPLATE_CONFIG_KEY(THROUGHPUT_STREAMS), InferenceEngine::PluginConfigParams::CPU_THROUGHPUT_AUTO}},
+            {{TEMPLATE_CONFIG_KEY(THROUGHPUT_STREAMS), InferenceEngine::PluginConfigParams::CPU_THROUGHPUT_NUMA}},
+            {{TEMPLATE_CONFIG_KEY(THROUGHPUT_STREAMS), "8"}},
+    };
+
+    const std::vector<std::map<std::string, std::string>> inconfigs = {
+            {{TEMPLATE_CONFIG_KEY(THROUGHPUT_STREAMS), "OFF"}},
    };

    INSTANTIATE_TEST_CASE_P(smoke_BehaviorTests, IncorrectConfigTests,
                            ::testing::Combine(
                                    ::testing::ValuesIn(netPrecisions),
                                    ::testing::Values("TEMPLATE"),
-                                    ::testing::ValuesIn(configs)),
+                                    ::testing::ValuesIn(inconfigs)),
                            IncorrectConfigTests::getTestCaseName);


@@ -29,7 +36,7 @@ namespace {
                            ::testing::Combine(
                                    ::testing::ValuesIn(netPrecisions),
                                    ::testing::Values("TEMPLATE"),
-                                    ::testing::ValuesIn(configs)),
+                                    ::testing::ValuesIn(inconfigs)),
                            IncorrectConfigAPITests::getTestCaseName);


--- a/docs/template_plugin/tests/functional/shared_tests_instances/single_layer_tests/convolution.cpp
+++ b/docs/template_plugin/tests/functional/shared_tests_instances/single_layer_tests/convolution.cpp
@@ -0,0 +1,115 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+
+#include "single_layer_tests/convolution.hpp"
+#include "common_test_utils/test_constants.hpp"
+
+using namespace LayerTestsDefinitions;
+
+namespace {
+
+const std::vector<InferenceEngine::Precision> netPrecisions = {
+        InferenceEngine::Precision::FP32,
+};
+
+/* ============= 2D Convolution ============= */
+const std::vector<std::vector<size_t >> kernels = {{3, 3},
+                                                          {3, 5}};
+const std::vector<std::vector<size_t >> strides = {{1, 1},
+                                                          {1, 3}};
+const std::vector<std::vector<ptrdiff_t>> padBegins = {{0, 0},
+                                                       {0, 3}};
+const std::vector<std::vector<ptrdiff_t>> padEnds = {{0, 0},
+                                                     {0, 3}};
+const std::vector<std::vector<size_t >> dilations = {{1, 1},
+                                                            {3, 1}};
+const std::vector<size_t> numOutChannels = {1, 5};
+const std::vector<ngraph::op::PadType> padTypes = {
+        ngraph::op::PadType::EXPLICIT,
+        ngraph::op::PadType::VALID
+};
+
+const auto conv2DParams_ExplicitPadding = ::testing::Combine(
+        ::testing::ValuesIn(kernels),
+        ::testing::ValuesIn(strides),
+        ::testing::ValuesIn(padBegins),
+        ::testing::ValuesIn(padEnds),
+        ::testing::ValuesIn(dilations),
+        ::testing::ValuesIn(numOutChannels),
+        ::testing::Values(ngraph::op::PadType::EXPLICIT)
+);
+const auto conv2DParams_AutoPadValid = ::testing::Combine(
+        ::testing::ValuesIn(kernels),
+        ::testing::ValuesIn(strides),
+        ::testing::Values(std::vector<ptrdiff_t>({0, 0})),
+        ::testing::Values(std::vector<ptrdiff_t>({0, 0})),
+        ::testing::ValuesIn(dilations),
+        ::testing::ValuesIn(numOutChannels),
+        ::testing::Values(ngraph::op::PadType::VALID)
+);
+
+INSTANTIATE_TEST_CASE_P(Convolution2D_ExplicitPadding, ConvolutionLayerTest,
+                        ::testing::Combine(
+                                conv2DParams_ExplicitPadding,
+                                ::testing::ValuesIn(netPrecisions),
+                                ::testing::Values(std::vector<size_t >({1, 3, 30, 30})),
+                                ::testing::Values("TEMPLATE")),
+                        ConvolutionLayerTest::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(Convolution2D_AutoPadValid, ConvolutionLayerTest,
+                        ::testing::Combine(
+                                conv2DParams_AutoPadValid,
+                                ::testing::ValuesIn(netPrecisions),
+                                ::testing::Values(std::vector<size_t >({1, 3, 30, 30})),
+                                ::testing::Values("TEMPLATE")),
+                        ConvolutionLayerTest::getTestCaseName);
+/* ============= 3D Convolution ============= */
+const std::vector<std::vector<size_t >> kernels3d = {{3, 3, 3},
+                                                            {3, 5, 3}};
+const std::vector<std::vector<ptrdiff_t>> paddings3d = {{0, 0, 0},
+                                                        {0, 2, 0}};
+
+const std::vector<std::vector<size_t >> strides3d = {{1, 1, 1},
+                                                            {1, 2, 1}};
+const std::vector<std::vector<size_t >> dilations3d = {{1, 1, 1},
+                                                              {1, 2, 1}};
+
+const auto conv3DParams_ExplicitPadding = ::testing::Combine(
+        ::testing::ValuesIn(kernels3d),
+        ::testing::ValuesIn(strides3d),
+        ::testing::ValuesIn(paddings3d),
+        ::testing::ValuesIn(paddings3d),
+        ::testing::ValuesIn(dilations3d),
+        ::testing::Values(5),
+        ::testing::Values(ngraph::op::PadType::EXPLICIT)
+);
+const auto conv3DParams_AutoPadValid = ::testing::Combine(
+        ::testing::ValuesIn(kernels3d),
+        ::testing::ValuesIn(strides3d),
+        ::testing::Values(std::vector<ptrdiff_t>({0, 0, 0})),
+        ::testing::Values(std::vector<ptrdiff_t>({0, 0, 0})),
+        ::testing::ValuesIn(dilations3d),
+        ::testing::Values(5),
+        ::testing::Values(ngraph::op::PadType::VALID)
+);
+
+INSTANTIATE_TEST_CASE_P(Convolution3D_ExplicitPadding, ConvolutionLayerTest,
+                        ::testing::Combine(
+                                conv3DParams_ExplicitPadding,
+                                ::testing::ValuesIn(netPrecisions),
+                                ::testing::Values(std::vector<size_t >({1, 3, 10, 10, 10})),
+                                ::testing::Values("TEMPLATE")),
+                        ConvolutionLayerTest::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(Convolution3D_AutoPadValid, ConvolutionLayerTest,
+                        ::testing::Combine(
+                                conv3DParams_AutoPadValid,
+                                ::testing::ValuesIn(netPrecisions),
+                                ::testing::Values(std::vector<size_t >({1, 3, 10, 10, 10})),
+                                ::testing::Values("TEMPLATE")),
+                        ConvolutionLayerTest::getTestCaseName);
+
+}  // namespace
--- a/docs/template_plugin/tests/functional/shared_tests_instances/single_layer_tests/reshape.cpp
+++ b/docs/template_plugin/tests/functional/shared_tests_instances/single_layer_tests/reshape.cpp
@@ -0,0 +1,36 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+
+#include "single_layer_tests/reshape.hpp"
+#include "common_test_utils/test_constants.hpp"
+
+using namespace LayerTestsDefinitions;
+
+namespace {
+const std::vector<InferenceEngine::Precision> netPrecisions = {
+        InferenceEngine::Precision::FP32,
+};
+
+INSTANTIATE_TEST_CASE_P(ReshapeCheckDynBatch, ReshapeLayerTest,
+        ::testing::Combine(
+                ::testing::Values(true),
+                ::testing::ValuesIn(netPrecisions),
+                ::testing::Values(std::vector<size_t>({30, 30, 30, 30})),
+                ::testing::Values(std::vector<size_t>({30, 30, 30, 30})),
+                ::testing::Values("TEMPLATE"),
+                ::testing::Values(std::map<std::string, std::string>({}))),
+                ReshapeLayerTest::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(ReshapeCheck, ReshapeLayerTest,
+        ::testing::Combine(
+                ::testing::Values(true),
+                ::testing::ValuesIn(netPrecisions),
+                ::testing::Values(std::vector<size_t>({10, 10, 10, 10})),
+                ::testing::Values(std::vector<size_t>({10, 0, 100})),
+                ::testing::Values("TEMPLATE"),
+                ::testing::Values(std::map<std::string, std::string>({}))),
+                ReshapeLayerTest::getTestCaseName);
+}  // namespace
--- a/docs/template_plugin/tests/functional/shared_tests_instances/single_layer_tests/softmax.cpp
+++ b/docs/template_plugin/tests/functional/shared_tests_instances/single_layer_tests/softmax.cpp
@@ -0,0 +1,72 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+
+#include "single_layer_tests/softmax.hpp"
+#include "common_test_utils/test_constants.hpp"
+
+using namespace LayerTestsDefinitions;
+
+namespace {
+
+const std::vector<InferenceEngine::Precision> netPrecisions = {
+    InferenceEngine::Precision::FP32,
+};
+
+const std::vector<InferenceEngine::Layout> inputLayouts2D = {
+    InferenceEngine::Layout::NC,
+};
+
+const std::vector<InferenceEngine::SizeVector> inputShapes2D = {
+    InferenceEngine::SizeVector {1, 100},
+    InferenceEngine::SizeVector {100, 1},
+    InferenceEngine::SizeVector {10, 10},
+};
+
+const std::vector<size_t> axis2D = {
+    0, 1
+};
+
+const auto params2D = testing::Combine(
+    testing::ValuesIn(netPrecisions),
+    testing::ValuesIn(inputLayouts2D),
+    testing::ValuesIn(inputShapes2D),
+    testing::ValuesIn(axis2D),
+    testing::Values("TEMPLATE"),
+    testing::Values(std::map<std::string, std::string>())
+);
+
+INSTANTIATE_TEST_CASE_P(
+        SoftMax2D,
+        SoftMaxLayerTest,
+        params2D,
+        SoftMaxLayerTest::getTestCaseName
+);
+
+const std::vector<InferenceEngine::SizeVector> inputShapes4D = {
+    InferenceEngine::SizeVector {1, 100, 1, 1},
+    InferenceEngine::SizeVector {1, 3, 4, 3},
+    InferenceEngine::SizeVector {2, 3, 4, 5},
+};
+
+const std::vector<size_t> axis4D = {0, 1, 2, 3};
+
+const auto params4D = testing::Combine(
+    testing::ValuesIn(netPrecisions),
+    testing::Values(InferenceEngine::Layout::NCHW),
+    testing::ValuesIn(inputShapes4D),
+    testing::ValuesIn(axis4D),
+    testing::Values("TEMPLATE"),
+    testing::Values(std::map<std::string, std::string>())
+);
+
+INSTANTIATE_TEST_CASE_P(
+        SoftMax4D,
+        SoftMaxLayerTest,
+        params4D,
+        SoftMaxLayerTest::getTestCaseName
+);
+
+}  // namespace
--- a/docs/template_plugin/tests/functional/skip_tests_config.cpp
+++ b/docs/template_plugin/tests/functional/skip_tests_config.cpp
@@ -9,5 +9,9 @@

 std::vector<std::string> disabledTestPatterns() {
    return {
+        ".*ExclusiveAsyncRequests.*",
+        ".*reusableCPUStreamsExecutor.*",
+        ".*registerPlugin.*",
+        ".*IEClassGetAvailableDevices.*"
    };
 }
--- a/docs/template_plugin/tests/functional/transformations/template_transformations_test.cpp
+++ b/docs/template_plugin/tests/functional/transformations/template_transformations_test.cpp
@@ -18,7 +18,7 @@
 using namespace testing;

 // ! [transformation:test]
-TEST(TransformationTests, TemplateTest) {
+TEST(TransformationTests, DISABLED_TemplateTest) {
    std::shared_ptr<ngraph::Function> f, f_ref;
    // f - ngraph::Function for applying transformation
    // f_ref - ngraph::Function that is expected after applying transformation