Merge remote-tracking branch 'upstream/master' into add_mxnet_operations

yekruglov 2021-12-20 12:11:14 +03:00
commit fe4e714c76
545 changed files with 6722 additions and 2553 deletions

View File

@ -241,7 +241,7 @@ jobs:
. $(SETUPVARS) -pyver 3.8 && python3 -m pytest -s $(INSTALL_DIR)/tests/mo/unit_tests --junitxml=TEST-ModelOptimizer.xml
displayName: 'Model Optimizer UT'
continueOnError: false
enabled: false
enabled: true
- script: . $(SETUPVARS) && $(INSTALL_TEST_DIR)/ov_core_unit_tests --gtest_print_time=1 --gtest_filter=-*IE_GPU* --gtest_output=xml:TEST-NGraphUT.xml
workingDirectory: $(INSTALL_TEST_DIR)
@ -334,7 +334,7 @@ jobs:
displayName: 'Samples Smoke Tests'
continueOnError: false
condition: eq(variables['CMAKE_BUILD_SHARED_LIBS'], 'ON')
enabled: false
enabled: true
- script: |
export DATA_PATH=$(MODELS_PATH)
@ -353,7 +353,7 @@ jobs:
workingDirectory: $(LAYER_TESTS_DIR)
displayName: 'Layer Tests'
continueOnError: false
enabled: false
enabled: true
- task: PublishTestResults@2
condition: always()

View File

@ -35,7 +35,7 @@ jobs:
- checkout: none
- script: git -C ~/work/openvino checkout -m --recurse-submodules $(Build.SourceVersion)
- script: git -C ~/work/openvino checkout -m $(Build.SourceVersion) && git -C ~/work/openvino submodule update --init --recursive
displayName: checkout
# Should be after 'Install dependencies' because Git lfs is not installed
@ -71,7 +71,7 @@ jobs:
./buildreleasenolto.sh
libinference_engine_preproc.so
MKLDNNPlugin
clDNNPlugin
ov_intel_gpu_plugin
clDNN_unit_tests64
gpuFuncTests
displayName: Build Lin

.gitmodules (vendored, 2 lines changed)
View File

@ -57,7 +57,7 @@
path = thirdparty/onednn_gpu
url = https://github.com/oneapi-src/oneDNN.git
[submodule "tools/pot/thirdparty/open_model_zoo"]
path = tools/pot/thirdparty/open_model_zoo
path = thirdparty/open_model_zoo
url = https://github.com/openvinotoolkit/open_model_zoo.git
[submodule "thirdparty/json/nlohmann_json"]
path = thirdparty/json/nlohmann_json

View File

@ -66,7 +66,7 @@ Jenkinsfile @openvinotoolkit/openvino-admins
/src/inference/include/ie/gna/ @openvinotoolkit/openvino-ie-gna-maintainers
# IE MULTI:
/inference-engine/src/multi_device/ @openvinotoolkit/openvino-ie-multi-maintainers
/src/plugins/auto/ @openvinotoolkit/openvino-ie-multi-maintainers
/src/inference/include/ie/multi-device/ @openvinotoolkit/openvino-ie-multi-maintainers
# IE Tests:

View File

@ -79,8 +79,20 @@ function(_ie_add_api_validator_post_build_step)
_ie_add_api_validator_post_build_step_recursive(TARGET ${API_VALIDATOR_TARGET})
# remove targets which were tested before
foreach(item IN LISTS VALIDATED_LIBRARIES)
foreach(target IN LISTS API_VALIDATOR_TARGETS)
list(FIND VALIDATED_LIBRARIES ${target} index)
if (NOT index EQUAL -1)
list(APPEND VALIDATED_TARGETS ${target})
endif()
if(TARGET "${target}")
get_target_property(orig_target ${target} ALIASED_TARGET)
list(FIND VALIDATED_LIBRARIES ${orig_target} index)
if (NOT index EQUAL -1)
list(APPEND VALIDATED_TARGETS ${target})
endif()
endif()
endforeach()
foreach(item IN LISTS VALIDATED_TARGETS)
list(REMOVE_ITEM API_VALIDATOR_TARGETS ${item})
endforeach()

View File

@ -3,7 +3,8 @@
#
set(FRONTEND_INSTALL_INCLUDE "runtime/include/")
set(FRONTEND_NAME_SUFFIX "_ov_frontend")
set(FRONTEND_NAME_PREFIX "ov_")
set(FRONTEND_NAME_SUFFIX "_frontend")
set(FRONTEND_NAMES "" CACHE INTERNAL "")
@ -20,7 +21,7 @@ function(ov_target_link_frontends TARGET_NAME)
endif()
foreach(name IN LISTS FRONTEND_NAMES)
set(frontend_target_name "${name}${FRONTEND_NAME_SUFFIX}")
set(frontend_target_name "${FRONTEND_NAME_PREFIX}${name}${FRONTEND_NAME_SUFFIX}")
target_link_libraries(${TARGET_NAME} PRIVATE ${frontend_target_name})
endforeach()
endfunction()
@ -99,7 +100,7 @@ macro(ov_add_frontend)
endif()
endforeach()
set(TARGET_NAME "${OV_FRONTEND_NAME}${FRONTEND_NAME_SUFFIX}")
set(TARGET_NAME "${FRONTEND_NAME_PREFIX}${OV_FRONTEND_NAME}${FRONTEND_NAME_SUFFIX}")
list(APPEND FRONTEND_NAMES ${OV_FRONTEND_NAME})
set(FRONTEND_NAMES "${FRONTEND_NAMES}" CACHE INTERNAL "" FORCE)

View File

@ -117,20 +117,20 @@ function(ie_add_plugin)
# fake dependencies to build in the following order:
# IE -> IE readers -> IE inference plugins -> IE-based apps
if(BUILD_SHARED_LIBS)
if(TARGET ir_ov_frontend)
add_dependencies(${IE_PLUGIN_NAME} ir_ov_frontend)
if(TARGET ov_ir_frontend)
add_dependencies(${IE_PLUGIN_NAME} ov_ir_frontend)
endif()
if(TARGET inference_engine_ir_v7_reader)
add_dependencies(${IE_PLUGIN_NAME} inference_engine_ir_v7_reader)
endif()
if(TARGET onnx_ov_frontend)
add_dependencies(${IE_PLUGIN_NAME} onnx_ov_frontend)
if(TARGET ov_onnx_frontend)
add_dependencies(${IE_PLUGIN_NAME} ov_onnx_frontend)
endif()
if(TARGET paddlepaddle_ov_frontend)
add_dependencies(${IE_PLUGIN_NAME} paddlepaddle_ov_frontend)
if(TARGET ov_paddlepaddle_frontend)
add_dependencies(${IE_PLUGIN_NAME} ov_paddlepaddle_frontend)
endif()
if(TARGET tensorflow_ov_frontend)
add_dependencies(${IE_PLUGIN_NAME} tensorflow_ov_frontend)
if(TARGET ov_tensorflow_frontend)
add_dependencies(${IE_PLUGIN_NAME} ov_tensorflow_frontend)
endif()
endif()

View File

@ -28,11 +28,11 @@
#
# ngraph::common - nGraph frontend common
#
# ngraph_onnx_ov_frontend_FOUND - True if the system has onnx_ov_frontend library
# ngraph::onnx_ov_frontend - ONNX FrontEnd target (optional)
# ngraph_ov_onnx_frontend_FOUND - True if the system has ov_onnx_frontend library
# ngraph::ov_onnx_frontend - ONNX FrontEnd target (optional)
#
# ngraph_paddlepaddle_frontend_FOUND - True if the system has PDPD frontend
# ngraph::paddlepaddle_ov_frontend - nGraph PDPD frontend (optional)
# ngraph::ov_paddlepaddle_frontend - nGraph PDPD frontend (optional)
#
@PACKAGE_INIT@
@ -58,38 +58,38 @@ if(TARGET openvino::frontend::common AND NOT TARGET ngraph::frontend_common)
INTERFACE_LINK_LIBRARIES openvino::frontend::common)
endif()
if(TARGET openvino::frontend::onnx AND NOT TARGET ngraph::onnx_ov_frontend)
add_library(ngraph::onnx_ov_frontend INTERFACE IMPORTED)
set_target_properties(ngraph::onnx_ov_frontend PROPERTIES
if(TARGET openvino::frontend::onnx AND NOT TARGET ngraph::ov_onnx_frontend)
add_library(ngraph::ov_onnx_frontend INTERFACE IMPORTED)
set_target_properties(ngraph::ov_onnx_frontend PROPERTIES
INTERFACE_LINK_LIBRARIES openvino::frontend::onnx)
endif()
if(TARGET openvino::frontend::paddlepaddle AND NOT TARGET ngraph::paddlepaddle_ov_frontend)
add_library(ngraph::paddlepaddle_ov_frontend INTERFACE IMPORTED)
set_target_properties(ngraph::paddlepaddle_ov_frontend PROPERTIES
if(TARGET openvino::frontend::paddlepaddle AND NOT TARGET ngraph::ov_paddlepaddle_frontend)
add_library(ngraph::ov_paddlepaddle_frontend INTERFACE IMPORTED)
set_target_properties(ngraph::ov_paddlepaddle_frontend PROPERTIES
INTERFACE_LINK_LIBRARIES openvino::frontend::paddlepaddle)
endif()
if(TARGET openvino::frontend::tensorflow AND NOT TARGET ngraph::tensorflow_ov_frontend)
add_library(ngraph::tensorflow_ov_frontend INTERFACE IMPORTED)
set_target_properties(ngraph::tensorflow_ov_frontend PROPERTIES
if(TARGET openvino::frontend::tensorflow AND NOT TARGET ngraph::ov_tensorflow_frontend)
add_library(ngraph::ov_tensorflow_frontend INTERFACE IMPORTED)
set_target_properties(ngraph::ov_tensorflow_frontend PROPERTIES
INTERFACE_LINK_LIBRARIES openvino::frontend::tensorflow)
endif()
set(ngraph_ngraph_FOUND ON)
set(NGRAPH_LIBRARIES ngraph::ngraph)
set(ngraph_onnx_ov_frontend_FOUND ${OpenVINO_Frontend_ONNX_FOUND})
set(ngraph_ov_onnx_frontend_FOUND ${OpenVINO_Frontend_ONNX_FOUND})
set(ngraph_onnx_importer_FOUND ${OpenVINO_Frontend_ONNX_FOUND})
if(ngraph_onnx_importer_FOUND)
set(ONNX_IMPORTER_LIBRARIES ngraph::onnx_ov_frontend)
set(ONNX_IMPORTER_LIBRARIES ngraph::ov_onnx_frontend)
# ngraph::onnx_importer target and variables are deprecated
# but need to create a dummy target for BW compatibility
if(NOT TARGET ngraph::onnx_importer)
add_library(ngraph::onnx_importer INTERFACE IMPORTED)
set_target_properties(ngraph::onnx_importer PROPERTIES
INTERFACE_LINK_LIBRARIES ngraph::onnx_ov_frontend)
INTERFACE_LINK_LIBRARIES ngraph::ov_onnx_frontend)
endif()
endif()

View File

@ -2,12 +2,9 @@
# SPDX-License-Identifier: Apache-2.0
#! [complex:transformation]
import logging as log
import numpy as np
from mo.front.common.replacement import FrontReplacementSubgraph
from mo.graph.graph import Graph
from openvino.tools.mo.front.common.replacement import FrontReplacementSubgraph
from openvino.tools.mo.graph.graph import Graph
class Complex(FrontReplacementSubgraph):
@ -41,4 +38,3 @@ class Complex(FrontReplacementSubgraph):
# change the connection so now all consumers of "complex_node" get data from input node of strided slice nodes
complex_node.out_port(0).get_connection().set_source(input_node_output_port)
#! [complex:transformation]

View File

@ -4,11 +4,11 @@
#! [complex_abs:transformation]
import numpy as np
from extensions.ops.elementwise import Pow
from extensions.ops.ReduceOps import ReduceSum
from mo.front.common.replacement import FrontReplacementOp
from mo.graph.graph import Graph, Node
from mo.ops.const import Const
from openvino.tools.mo.ops.elementwise import Pow
from openvino.tools.mo.ops.ReduceOps import ReduceSum
from openvino.tools.mo.front.common.replacement import FrontReplacementOp
from openvino.tools.mo.graph.graph import Graph, Node
from openvino.tools.mo.ops.const import Const
class ComplexAbs(FrontReplacementOp):

View File

@ -3,8 +3,7 @@
# ! [fft_ext:extractor]
from ...ops.FFT import FFT
from mo.front.extractor import FrontExtractorOp
from mo.utils.error import Error
from openvino.tools.mo.front.extractor import FrontExtractorOp
class FFT2DFrontExtractor(FrontExtractorOp):

View File

@ -2,9 +2,9 @@
# SPDX-License-Identifier: Apache-2.0
#! [fft:operation]
from mo.front.common.partial_infer.elemental import copy_shape_infer
from mo.graph.graph import Node, Graph
from mo.ops.op import Op
from openvino.tools.mo.front.common.partial_infer.elemental import copy_shape_infer
from openvino.tools.mo.graph.graph import Graph
from openvino.tools.mo.ops.op import Op
class FFT(Op):

View File

@ -40,13 +40,13 @@ This library contains the classes to:
Starting from the 2022.1 release, OpenVINO Runtime introduced the concept of frontend plugins. Such plugins are loaded by OpenVINO Runtime automatically at run time, depending on the model file format (a usage sketch follows the list below):
* Linux* OS:
- `libir_ov_frontend.so` to read a network from IR
- `libpaddlepaddle_ov_frontend.so` to read a network from PaddlePaddle model format
- `libonnx_ov_frontend.so` to read a network from ONNX model format
- `libov_ir_frontend.so` to read a network from IR
- `libov_paddlepaddle_frontend.so` to read a network from PaddlePaddle model format
- `libov_onnx_frontend.so` to read a network from ONNX model format
* Windows* OS:
- `ir_ov_frontend.dll` to read a network from IR
- `paddlepaddle_ov_frontend.dll` to read a network from PaddlePaddle model format
- `onnx_ov_frontend.dll` to read a network from ONNX model format
- `ov_ir_frontend.dll` to read a network from IR
- `ov_paddlepaddle_frontend.dll` to read a network from PaddlePaddle model format
- `ov_onnx_frontend.dll` to read a network from ONNX model format
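A minimal usage sketch (not part of this commit), assuming the OpenVINO 2022.1 C++ API (`ov::Core::read_model`); the file name `model.onnx` is only an illustrative placeholder:

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    // Reading an .onnx file makes the runtime locate and load the ONNX frontend
    // plugin (libov_onnx_frontend.so on Linux, ov_onnx_frontend.dll on Windows);
    // the application does not select a frontend explicitly.
    auto model = core.read_model("model.onnx");
    return model ? 0 : 1;
}
```

Passing an IR `.xml` file or a PaddlePaddle model to the same call dispatches to `ov_ir_frontend` or `ov_paddlepaddle_frontend`, respectively.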
### Device-Specific Plugin Libraries

View File

@ -62,7 +62,7 @@ The example below demonstrates how to unregister an operator from the destructor
## Requirements for Building with CMake
A program that uses the `register_operator` functionality requires the `openvino::core` and `openvino::frontend::onnx` libraries in addition to the OpenVINO Inference Runtime.
The `onnx_ov_frontend` is a component of the `OpenVINO` package, so `find_package(OpenVINO REQUIRED COMPONENTS ONNX)` can find both.
The `ov_onnx_frontend` is a component of the `OpenVINO` package, so `find_package(OpenVINO REQUIRED COMPONENTS ONNX)` can find both.
Those libraries need to be passed to the `target_link_libraries` command in the CMakeLists.txt file.
See CMakeLists.txt below for reference:

View File

@ -45,13 +45,13 @@ This library contains the classes to:
Starting from the 2022.1 release, OpenVINO Runtime introduced the concept of frontend plugins. Such plugins are loaded by OpenVINO Runtime automatically at run time, depending on the model file format:
* Unix* OS:
- `libir_ov_frontend.so` to read a network from IR
- `libpaddlepaddle_ov_frontend.so` to read a network from PaddlePaddle model format
- `libonnx_ov_frontend.so` to read a network from ONNX model format
- `libov_ir_frontend.so` to read a network from IR
- `libov_paddlepaddle_frontend.so` to read a network from PaddlePaddle model format
- `libov_onnx_frontend.so` to read a network from ONNX model format
* Windows* OS:
- `ir_ov_frontend.dll` to read a network from IR
- `paddlepaddle_ov_frontend.dll` to read a network from PaddlePaddle model format
- `onnx_ov_frontend.dll` to read a network from ONNX model format
- `ov_ir_frontend.dll` to read a network from IR
- `ov_paddlepaddle_frontend.dll` to read a network from PaddlePaddle model format
- `ov_onnx_frontend.dll` to read a network from ONNX model format
### Device-specific Plugin Libraries ###

View File

@ -639,9 +639,9 @@ graph. Consider the extractor for the TensorFlow\* operation `Const` (refer to t
`extensions/front/tf/const_ext.py`):
```py
from mo.front.extractor import FrontExtractorOp
from mo.front.tf.extractors.utils import tf_dtype_extractor, tf_tensor_shape, tf_tensor_content
from mo.ops.const import Const
from openvino.tools.mo.front.extractor import FrontExtractorOp
from openvino.tools.mo.front.tf.extractors.utils import tf_dtype_extractor, tf_tensor_shape, tf_tensor_content
from openvino.tools.mo.ops.const import Const
class ConstExtractor(FrontExtractorOp):
@ -679,9 +679,9 @@ Consider another example with an extractor of ONNX\* operation `Constant` (refer
from onnx import numpy_helper
from onnx.numpy_helper import to_array
from mo.front.extractor import FrontExtractorOp
from mo.front.onnx.extractors.utils import onnx_attr
from mo.ops.const import Const
from openvino.tools.mo.front.extractor import FrontExtractorOp
from openvino.tools.mo.front.onnx.extractors.utils import onnx_attr
from openvino.tools.mo.ops.const import Const
class ConstantExtractor(FrontExtractorOp):
@ -814,11 +814,11 @@ fusing of the sub-graph defining the [Mish](../../../ops/activation/Mish_4.md) a
operation:
```py
from extensions.front.Softplus_fusion import SoftplusFusion
from extensions.ops.activation_ops import Mish
from mo.front.common.replacement import FrontReplacementSubgraph
from mo.front.subgraph_matcher import SubgraphMatch
from mo.graph.graph import Graph, rename_nodes
from openvino.tools.mo.front.Softplus_fusion import SoftplusFusion
from openvino.tools.mo.ops.activation_ops import Mish
from openvino.tools.mo.front.common.replacement import FrontReplacementSubgraph
from openvino.tools.mo.front.subgraph_matcher import SubgraphMatch
from openvino.tools.mo.graph.graph import Graph, rename_nodes
class MishFusion(FrontReplacementSubgraph):
@ -886,12 +886,12 @@ transformation.
Consider an example transformation from the file `extensions/front/Pack.py`, which replaces the operation `Pack` from
the TensorFlow\* framework:
```py
from mo.front.common.partial_infer.utils import int64_array
from mo.front.common.replacement import FrontReplacementOp
from mo.front.tf.graph_utils import create_op_with_const_inputs
from mo.graph.graph import Node, Graph, rename_nodes
from mo.ops.concat import Concat
from mo.ops.unsqueeze import Unsqueeze
from openvino.tools.mo.front.common.partial_infer.utils import int64_array
from openvino.tools.mo.front.common.replacement import FrontReplacementOp
from openvino.tools.mo.front.tf.graph_utils import create_op_with_const_inputs
from openvino.tools.mo.graph.graph import Node, Graph, rename_nodes
from openvino.tools.mo.ops.concat import Concat
from openvino.tools.mo.ops.unsqueeze import Unsqueeze
class Pack(FrontReplacementOp):
@ -932,11 +932,11 @@ specification.
```py
import logging as log
from mo.front.common.partial_infer.utils import int64_array
from mo.front.common.replacement import FrontReplacementPattern
from mo.graph.graph import Graph
from mo.ops.const import Const
from mo.utils.error import Error
from openvino.tools.mo.front.common.partial_infer.utils import int64_array
from openvino.tools.mo.front.common.replacement import FrontReplacementPattern
from openvino.tools.mo.graph.graph import Graph
from openvino.tools.mo.ops.const import Const
from openvino.tools.mo.utils.error import Error
class SqueezeNormalize(FrontReplacementPattern):
@ -1200,13 +1200,13 @@ The example of the configuration file for this type of transformation is `extens
and the corresponding transformation file is `./extensions/front/YOLO.py`:
```py
from extensions.front.no_op_eraser import NoOpEraser
from extensions.front.standalone_const_eraser import StandaloneConstEraser
from extensions.ops.regionyolo import RegionYoloOp
from mo.front.tf.replacement import FrontReplacementFromConfigFileGeneral
from mo.graph.graph import Node, Graph
from mo.ops.result import Result
from mo.utils.error import Error
from openvino.tools.mo.front.no_op_eraser import NoOpEraser
from openvino.tools.mo.front.standalone_const_eraser import StandaloneConstEraser
from openvino.tools.mo.ops.regionyolo import RegionYoloOp
from openvino.tools.mo.front.tf.replacement import FrontReplacementFromConfigFileGeneral
from openvino.tools.mo.graph.graph import Node, Graph
from openvino.tools.mo.ops.result import Result
from openvino.tools.mo.utils.error import Error
class YoloRegionAddon(FrontReplacementFromConfigFileGeneral):

View File

@ -20,9 +20,9 @@ assume that we have already created the `CustomOp` class (inherited from `Op` cl
for this MXNet custom operation as described in the [Customize_Model_Optimizer](Customize_Model_Optimizer.md).
```py
from extension.ops.custom_op import CustomOp # implementation of the MO operation class
from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
from mo.front.extractor import MXNetCustomFrontExtractorOp
from openvino.tools.mo.ops.custom_op import CustomOp # implementation of the MO operation class
from openvino.tools.mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
from openvino.tools.mo.front.extractor import MXNetCustomFrontExtractorOp
class CustomProposalFrontExtractor(MXNetCustomFrontExtractorOp): # inherit from specific base class
op = 'MyCustomOp' # the value corresponding to the `op_type` value of the MXNet operation

View File

@ -40,8 +40,8 @@ operation `ProposalOp` which corresponds to `Proposal` operation described in th
document. Refer to the source code below for a detailed explanation of the extractor.
```py
from extensions.ops.proposal import ProposalOp
from mo.front.extractor import CaffePythonFrontExtractorOp
from openvino.tools.mo.ops.proposal import ProposalOp
from openvino.tools.mo.front.extractor import CaffePythonFrontExtractorOp
class ProposalPythonFrontExtractor(CaffePythonFrontExtractorOp):

View File

@ -46,7 +46,7 @@ if(OpenCV_FOUND)
endif()
if(ENABLE_OV_ONNX_FRONTEND)
target_link_libraries(${TARGET_NAME} PRIVATE onnx_ov_frontend)
target_link_libraries(${TARGET_NAME} PRIVATE ov_onnx_frontend)
endif()
if(NOT MSVC)

View File

@ -1134,8 +1134,38 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG
if (!isSuitableParent1 && !isSuitableParent2)
continue;
auto mergedConv = isSuitableParent1 ? parent1 : parent2;
auto peerNode = isSuitableParent1 ? parent2 : parent1;
std::shared_ptr<MKLDNNNode> mergedConv;
std::shared_ptr<MKLDNNNode> peerNode;
if (isSuitableParent1 && isSuitableParent2) {
// the operation that is not merged (peerNode) has to be in low precision
const auto isBranchQuantized = [](const MKLDNNNodePtr& branchParent) {
const auto& fused = branchParent->getFusedWith();
const auto branchPrecision = fused.empty() ?
branchParent->getOriginalOutputPrecisionAtPort(0) :
fused[fused.size() - 1]->getOriginalOutputPrecisionAtPort(0);
return (branchPrecision == Precision::I8) || (branchPrecision == Precision::U8);
};
const auto isBranch1Quantized = isBranchQuantized(graphNode->getParentEdgesAtPort(0)[0]->getParent());
const auto isBranch2Quantized = isBranchQuantized(graphNode->getParentEdgesAtPort(1)[0]->getParent());
if (isBranch1Quantized || isBranch2Quantized) {
// INT8
const auto parent1CanBeMerged = parent1->getChildEdges().size() == 1ul;
// if both branches are quantized, then parent1 is selected (result is not changed)
mergedConv = isBranch2Quantized && parent1CanBeMerged ? parent1 : parent2;
peerNode = isBranch2Quantized && parent1CanBeMerged ? parent2 : parent1;
} else {
// original FP32
mergedConv = isSuitableParent1 ? parent1 : parent2;
peerNode = isSuitableParent1 ? parent2 : parent1;
}
} else {
mergedConv = isSuitableParent1 ? parent1 : parent2;
peerNode = isSuitableParent1 ? parent2 : parent1;
}
if (isSuitableParent1 && isSuitableParent2) {
if ((peerNode->getType() == Convolution || peerNode->getType() == BinaryConvolution) &&
mergedConv->getChildEdges().size() != 1) {

View File

@ -1102,7 +1102,7 @@ Layout MKLDNNNode::getWeightsLayoutByDims(SizeVector dims, bool isGrouped) {
}
}
void MKLDNNNode::appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, int align) {
void MKLDNNNode::appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims) {
IE_THROW() << "Fusing of " << this->getType() << " operation is not implemented";
}

View File

@ -602,7 +602,7 @@ protected:
* Seed node should call this routine and pass its post operations list as a parameter.
* @param ops List of fused post operations
*/
virtual void appendPostOps(mkldnn::post_ops& ops, const VectorDims& postOpDims, int align = -1);
virtual void appendPostOps(mkldnn::post_ops& ops, const VectorDims& postOpDims);
virtual void appendBinPostOps(mkldnn::post_ops& ops, const VectorDims& postOpDims, std::vector<MKLDNNMemoryPtr>& binaryPostOpsMem);
virtual std::shared_ptr<mkldnn::primitive_attr> initPrimitiveAttr() { return nullptr; }

View File

@ -1132,8 +1132,7 @@ void MKLDNNBinaryConvolutionNode::setPostOps(mkldnn::primitive_attr &attr) {
ops.append_sum(1.0);
} else {
// TODO [DS]: change to shape from memory
constexpr int align = 16;
eltwiseNode->appendPostOps(ops, getOutputShapeAtPort(0).getStaticDims(), align);
eltwiseNode->appendPostOps(ops, getOutputShapeAtPort(0).getStaticDims());
}
continue;
}

View File

@ -352,8 +352,7 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, const Vecto
ops.append_sum(1.0, MKLDNNExtensionUtils::IEPrecisionToDataType(eltwisePrecision));
} else {
if (useLegacyPostOps || eltwiseNode->getMKLDNNAlgorithm() != mkldnn::algorithm::undef) {
constexpr int align = 16;
eltwiseNode->appendPostOps(ops, dims, align);
eltwiseNode->appendPostOps(ops, dims);
} else {
eltwiseNode->appendBinPostOps(ops, getBinPostOpShape(), binaryPostOpsArgs);
}

View File

@ -365,9 +365,8 @@ void MKLDNNDeconvolutionNode::setPostOps(mkldnn::primitive_attr &attr, const Vec
for (auto &node : fusedWith) {
if (auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get())) {
// TODO [DS]: change to shape from memory
constexpr int align = 16;
// use legacy depthwise since backprop convolution does not support binary post ops
eltwiseNode->appendPostOps(ops, dims, align);
eltwiseNode->appendPostOps(ops, dims);
continue;
}
if (auto* fakeQuantizeNode = dynamic_cast<MKLDNNFakeQuantizeNode *>(node.get())) {

View File

@ -1744,7 +1744,7 @@ void MKLDNNEltwiseNode::fuseInto(MKLDNNNodePtr& parentNode) {
MKLDNNNode::fuseInto(parentNode);
}
void MKLDNNEltwiseNode::appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, int align) {
void MKLDNNEltwiseNode::appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims) {
const std::string errorPrefix = "Appending Eltwise node with name '" + getName() + "' ";
if (getMKLDNNAlgorithm() != mkldnn::algorithm::undef) {
@ -1775,11 +1775,11 @@ void MKLDNNEltwiseNode::appendPostOps(mkldnn::post_ops& ops, const VectorDims &p
}
} else {
const size_t chIdx = postOpDims.size() > 1 ? getFusingAxis() : 0;
constexpr int align = 16; // always align for legacy scale/shift post ops
scalesBuffer = makeAlignedBuffer(postOpDims[chIdx], scales, align);
if (getAlgorithm() != EltwisePrelu) {
shiftsBuffer = makeAlignedBuffer(postOpDims[chIdx], shifts, align);
}
/* @todo legacy depthwise post ops are kept for now
* for performance reasons
*/

View File

@ -75,7 +75,7 @@ public:
bool created() const override;
bool canBeInPlace() const override;
bool canFuse(const MKLDNNNodePtr& node) const override;
void appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, int align = -1) override;
void appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims) override;
void appendBinPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, std::vector<MKLDNNMemoryPtr>& binaryPostOpsMem) override;
void fuseInto(MKLDNNNodePtr& parentNode) override;
InferenceEngine::Precision getRuntimePrecision() const override;

View File

@ -1706,8 +1706,13 @@ void MKLDNNFakeQuantizeNode::initializePostOpData(const VectorDims &dims, const
isPostOpDataInitialized = true;
}
void MKLDNNFakeQuantizeNode::appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, int align) {
initializePostOpData(postOpDims, align);
void MKLDNNFakeQuantizeNode::appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims) {
// MKLDNN quantization_injectors assume that quantization data memory is always aligned to 16
// (the length of an AVX512 vector register), which is also enough for the AVX2 and SSE42 implementations.
// Otherwise it can lead to buffer over-reads and performance penalties due to denormals.
const size_t bufferAlignment = 16;
initializePostOpData(postOpDims, bufferAlignment);
if (getAlgorithm() == FQBinarization) {
ops.append_binarization(mkldnn::algorithm::binarization_depthwise, (const float*)&binarizationThresholds[0], (const float*)&binarizationOutputMask[0]);

View File

@ -120,10 +120,7 @@ public:
InferenceEngine::Precision getInputPrecision() const { return inputPrecision; }
InferenceEngine::Precision getOutputPrecision() const { return outputPrecision; }
// MKLDNN quantization_injectors assumes that quantization data memory is always aligned on 16
// by length of AVX512 vector register which is also enough for AVX2 and SSE42 implementations.
// Otherwise it can lead to buffer over-read and performance penalties due to denormals.
void appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims = {}, int align = 16) override;
void appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims = {}) override;
void appendBinPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, std::vector<MKLDNNMemoryPtr>& binaryPostOpsMem) override;
static bool isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept;

View File

@ -198,9 +198,8 @@ void MKLDNNFullyConnectedNode::setPostOps(mkldnn::primitive_attr &attr, bool ini
if (auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get())) {
// TODO [DS]: change to shape from memory
constexpr int align = -1;
if (eltwiseNode->getMKLDNNAlgorithm() != mkldnn::algorithm::undef) {
eltwiseNode->appendPostOps(ops, getOutputShapeAtPort(0).getStaticDims(), align);
eltwiseNode->appendPostOps(ops, getOutputShapeAtPort(0).getStaticDims());
} else {
eltwiseNode->appendBinPostOps(ops, getBinPostOpShape(), binaryPostOpsArgs);
}

View File

@ -109,7 +109,7 @@ protected:
uni_vmovdqu(b, a); // b = a
uni_vmovdqu(c, a); // c = a
uni_vpcmpeqd(b, b, zero); // if (a == 0) b = 1 else b = 0
uni_vpand(c, mask); // c = a & 01111111100000000000000000000000
uni_vpand(c, c, mask); // c = a & 01111111100000000000000000000000
uni_vpcmpeqd(c, c, zero); // if (c == 0) c = 1 else c = 0
uni_vtestps(b, c); // if ((!b & c) == 0) CF = 1 else CF = 0
}

View File

@ -2102,8 +2102,7 @@ void MKLDNNInterpolateNode::setPostOps(mkldnn::primitive_attr &attr, const Vecto
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
if (eltwiseNode) {
constexpr int align = 16;
eltwiseNode->appendPostOps(ops, dims, align);
eltwiseNode->appendPostOps(ops, dims);
continue;
}

View File

@ -891,8 +891,7 @@ void MKLDNNMVNNode::setPostOps(mkldnn::primitive_attr &attr, bool initWeights) {
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
if (eltwiseNode) {
constexpr int align = 16;
eltwiseNode->appendPostOps(ops, postOpDims, align);
eltwiseNode->appendPostOps(ops, postOpDims);
continue;
}
IE_THROW() << "Fusing of " << NameFromType(node->getType()) << " operation to " << NameFromType(this->getType()) << " node is not implemented";

View File

@ -813,8 +813,7 @@ void MKLDNNNormalizeL2Node::setPostOps(mkldnn::primitive_attr& kernel_attrs, con
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
if (eltwiseNode) {
constexpr int align = 16;
eltwiseNode->appendPostOps(ops, dims, align);
eltwiseNode->appendPostOps(ops, dims);
continue;
}

View File

@ -2779,8 +2779,7 @@ void MKLDNNReduceNode::setPostOps(mkldnn::primitive_attr &attr, const VectorDims
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
if (eltwiseNode) {
constexpr int align = 16;
eltwiseNode->appendPostOps(ops, postOpDims, align);
eltwiseNode->appendPostOps(ops, postOpDims);
continue;
}
IE_THROW() << "Fusing of " << NameFromType(node->getType()) << " operation to " << NameFromType(this->getType()) << " node is not implemented";

View File

@ -54,6 +54,11 @@ private:
const mv_blob_header& blobHdr,
std::vector<char>& blob);
void serializeParamsAndResults(
const Model& model,
const mv_blob_header& blobHdr,
std::vector<char>& blob);
ElfN_Ehdr createElfHeader();
void getMetaData(

View File

@ -62,4 +62,16 @@ VPU_PACKED(mv_stage_header {
uint32_t numShaves;
};)
VPU_PACKED(network_info_header {
uint32_t parameters_size;
uint32_t results_size;
};)
VPU_PACKED(network_params_header {
uint32_t name_lenght;
uint32_t shape_size;
uint32_t element_type_bytesize;
uint32_t output_tensor_names_size;
};)
} // namespace vpu

View File

@ -26,6 +26,9 @@ public:
const ie::InputsDataMap& getNetworkInputs() const { return _networkInputs; }
const ie::OutputsDataMap& getNetworkOutputs() const { return _networkOutputs; }
const std::vector<std::shared_ptr<const ov::Node>>& getNetworkParemeters() const { return _parameters; }
const std::vector<std::shared_ptr<const ov::Node>>& getNetworkResults() const { return _results; }
uint32_t getStageCount() const { return _blobHeader.stages_count; }
uint32_t getMagicNumber() const { return _blobHeader.magic_number; }
@ -36,6 +39,8 @@ public:
uint32_t getNumberOfShaves() const { return _blobHeader.number_of_shaves; }
uint32_t getNumberOfSlices() const { return _blobHeader.number_of_cmx_slices; }
uint32_t getFileSize() const { return _blobHeader.file_size; }
const DataInfo& getInputInfo() const { return _inputInfo; }
const DataInfo& getOutputInfo() const { return _outputInfo; }
@ -49,6 +54,9 @@ private:
ie::InputsDataMap _networkInputs;
ie::OutputsDataMap _networkOutputs;
std::vector<std::shared_ptr<const ov::Node>> _parameters = {};
std::vector<std::shared_ptr<const ov::Node>> _results = {};
DataInfo _inputInfo;
DataInfo _outputInfo;
};

View File

@ -89,6 +89,7 @@ std::set<std::string> getSupportedLayers(const ie::CNNNetwork& network, const Pl
const uint32_t BLOB_MAGIC_NUMBER = 9709;
const uint32_t BLOB_VERSION_MAJOR = 6;
// Must be changed when possible
const uint32_t BLOB_VERSION_MINOR = 0;
} // namespace vpu

View File

@ -15,6 +15,9 @@
#include <description_buffer.hpp>
#include <xml_parse_utils.h>
#include <ngraph/ops.hpp>
#include <transformations/utils/utils.hpp>
#include <climits>
#include <cstring>
#include <string>
@ -161,6 +164,118 @@ void BackEnd::serializeConstShapes(const Model& model, const mv_blob_header& blo
}
}
void BackEnd::serializeParamsAndResults(const Model& model, const mv_blob_header& blobHdr,
std::vector<char>& blob) {
const auto networkParams = model->attrs().getOrDefault<ov::ParameterVector>("networkParameters");
const auto networkResults = model->attrs().getOrDefault<ov::ResultVector>("networkResults");
auto getNetworkParameterHeader = [](const std::shared_ptr<ov::Node>& node) {
network_params_header nph;
nph.element_type_bytesize = sizeof(node->get_element_type().operator ov::element::Type_t());
nph.name_lenght = node->get_friendly_name().size();
nph.shape_size = node->get_shape().size();
nph.output_tensor_names_size = node->get_output_tensor(0).get_names().size();
return nph;
};
uint32_t networkInfoOffset = blob.size();
auto serializeParameters = [&blob, &networkInfoOffset,
&getNetworkParameterHeader](
const std::shared_ptr<ov::Node>& node) {
BlobSerializer headerSerializer;
BlobSerializer shapeSerializer;
BlobSerializer elementTypeSerializer;
BlobSerializer tensorNamesSerializer;
BlobSerializer inputNameForResultSerializer;
const auto nph = getNetworkParameterHeader(node);
const bool isResult = ov::is_type<ov::op::v0::Result>(node);
int totalNetworkInfoOffset =
networkInfoOffset + sizeof(nph) + nph.name_lenght +
nph.element_type_bytesize +
sizeof(size_t) * (nph.output_tensor_names_size + nph.shape_size);
for (const auto& name : node->get_output_tensor(0).get_names()) {
totalNetworkInfoOffset += sizeof(size_t) + name.size();
}
if (isResult) {
totalNetworkInfoOffset +=
sizeof(size_t) +
ngraph::op::util::create_ie_output_name(node->input_value(0)).size();
}
blob.resize(totalNetworkInfoOffset);
headerSerializer.append(nph);
std::copy_n(headerSerializer.data(), sizeof(nph),
blob.data() + networkInfoOffset);
networkInfoOffset += sizeof(nph);
const auto nodeName = node->get_friendly_name();
VPU_THROW_UNLESS(
node->get_output_partial_shape(0).rank().is_static(),
"Serialization of shapes with dynamic rank is not supported");
const auto nodeShape = node->get_output_partial_shape(0).get_shape();
const auto nodeElType =
node->get_element_type().operator ov::element::Type_t();
std::copy_n(nodeName.data(), nodeName.size(),
blob.data() + networkInfoOffset);
networkInfoOffset += nph.name_lenght;
for (const auto shapeIdx : nodeShape) {
shapeSerializer.append(shapeIdx);
}
std::copy_n(shapeSerializer.data(),
shapeSerializer.size(), blob.data() + networkInfoOffset);
networkInfoOffset += shapeSerializer.size();
elementTypeSerializer.append(nodeElType);
std::copy_n(elementTypeSerializer.data(), nph.element_type_bytesize,
blob.data() + networkInfoOffset);
networkInfoOffset += nph.element_type_bytesize;
for (const auto& name : node->get_output_tensor(0).get_names()) {
tensorNamesSerializer.append(name.size());
for (const auto ch : name) {
tensorNamesSerializer.append(ch);
}
}
std::copy_n(tensorNamesSerializer.data(), tensorNamesSerializer.size(),
blob.data() + networkInfoOffset);
networkInfoOffset += tensorNamesSerializer.size();
if (isResult) {
const auto inputNameForResult =
ngraph::op::util::create_ie_output_name(node->input_value(0));
inputNameForResultSerializer.append(inputNameForResult.size());
for (const auto ch : inputNameForResult) {
inputNameForResultSerializer.append(ch);
}
std::copy_n(inputNameForResultSerializer.data(),
inputNameForResultSerializer.size(),
blob.data() + networkInfoOffset);
networkInfoOffset += inputNameForResultSerializer.size();
}
};
BlobSerializer networkInfoSerializer;
network_info_header nih;
nih.parameters_size = networkParams.size();
nih.results_size = networkResults.size();
blob.resize(networkInfoOffset + sizeof(nih));
networkInfoSerializer.append(nih);
std::copy_n(networkInfoSerializer.data(), sizeof(nih), blob.data() + networkInfoOffset);
networkInfoOffset += sizeof(nih);
for (const auto& param : networkParams) {
serializeParameters(param);
}
for (const auto& result : networkResults) {
serializeParameters(result);
}
}
void BackEnd::serialize(
const Model& model,
std::vector<char>& blob,
@ -271,6 +386,12 @@ void BackEnd::serialize(
serializeConstData(model, blobHdr, blob);
serializeConstShapes(model, blobHdr, blob);
const auto networkParams = model->attrs().getOrDefault<ov::ParameterVector>("networkParameters");
const auto networkResults = model->attrs().getOrDefault<ov::ResultVector>("networkResults");
// Skip the constant-network case (nothing to serialize without parameters/results)
if (!networkParams.empty() && !networkResults.empty()) {
serializeParamsAndResults(model, blobHdr, blob);
}
blobHeader.first = blob.data();
blobHeader.second = sizeof(ElfN_Ehdr) + sizeof(mv_blob_header);

View File

@ -10,7 +10,7 @@
#include <string>
#include <ie_input_info.hpp>
#include <ie_ngraph_utils.hpp>
#include <vpu/graph_transformer.hpp>
#include <vpu/backend/blob_format.hpp>
#include <vpu/model/data.hpp>
@ -116,6 +116,64 @@ void BlobReader::parse(const std::vector<char>& blob) {
_networkOutputs[processedOutput.getName()] = std::make_shared<ie::Data>(processedOutput);
}
}
if (blob.size() != _blobHeader.file_size) {
auto networkInfoOffset = _blobHeader.file_size;
const auto nih = readFromBlob<network_info_header>(blob, networkInfoOffset);
auto extractParameter = [&blob, &networkInfoOffset](bool isResult) {
const auto nph = readFromBlob<network_params_header>(blob, networkInfoOffset);
std::string parameterFriendlyName(nph.name_lenght, '0');
for (auto idx = 0; idx < nph.name_lenght; ++idx) {
parameterFriendlyName[idx] = readFromBlob<char>(blob, networkInfoOffset);
}
ov::Shape parameterShape(nph.shape_size);
for (auto idx = 0; idx < nph.shape_size; ++idx) {
parameterShape[idx] = readFromBlob<size_t>(blob, networkInfoOffset);
}
ov::element::Type_t parameterType = readFromBlob<ov::element::Type_t>(blob, networkInfoOffset);
std::shared_ptr<ov::Node> parameter =
std::make_shared<ov::op::v0::Parameter>(parameterType,
parameterShape);
std::unordered_set<std::string> tensorNames;
for (auto idx = 0; idx < nph.output_tensor_names_size; ++idx) {
const auto nameLenght = readFromBlob<size_t>(blob, networkInfoOffset);
std::string tensorName;
for (auto nameSymbolIdx = 0; nameSymbolIdx < nameLenght; ++nameSymbolIdx) {
tensorName += readFromBlob<char>(blob, networkInfoOffset);
}
tensorNames.insert(tensorName);
}
if (isResult) {
auto fakeParameter = parameter;
parameter = std::make_shared<ov::op::v0::Result>(parameter);
const auto inputNameLenght = readFromBlob<size_t>(blob, networkInfoOffset);
std::string inputName;
for (auto nameSymbolIdx = 0; nameSymbolIdx < inputNameLenght; ++nameSymbolIdx) {
inputName += readFromBlob<char>(blob, networkInfoOffset);
}
fakeParameter->set_friendly_name(inputName);
parameter = parameter->copy_with_new_inputs({fakeParameter});
}
parameter->set_friendly_name(parameterFriendlyName);
parameter->output(0).get_tensor().set_names(tensorNames);
return parameter;
};
for (auto paramIdx = 0; paramIdx < nih.parameters_size; ++paramIdx) {
_parameters.emplace_back(extractParameter(false));
}
for (auto paramIdx = 0; paramIdx < nih.results_size; ++paramIdx) {
_results.emplace_back(extractParameter(true));
}
}
}
} // namespace vpu

View File

@ -492,7 +492,14 @@ ModelPtr FrontEnd::runCommonPasses(ie::CNNNetwork network,
model->attrs().set<int>("index", g_counter.fetch_add(1));
model->attrs().set<Resources>("resources", env.resources);
// Pass information about the network parameters/results so that it can be
// restored when the compiled blob is imported
if (network.getFunction() != nullptr) {
model->attrs().set<ov::ParameterVector>(
"networkParameters", network.getFunction()->get_parameters());
model->attrs().set<ov::ResultVector>(
"networkResults", network.getFunction()->get_results());
}
//
// Update IE Network
//

View File

@ -164,11 +164,20 @@ void ExecutableNetwork::Import(std::istream& strm, std::vector<DevicePtr> &devic
this->_networkInputs = blobReader.getNetworkInputs();
this->_networkOutputs = blobReader.getNetworkOutputs();
std::size_t numStages = blobReader.getStageCount();
auto blobHeader = blobReader.getHeader();
if (blobSize == blobReader.getFileSize()) {
_log->warning(
"Older version of blob. Unable to get information about network "
"parameters/results. Please recompile blob");
}
this->setInputs(blobReader.getNetworkParemeters());
this->setOutputs(blobReader.getNetworkResults());
_inputInfo = blobReader.getInputInfo();
_outputInfo = blobReader.getOutputInfo();
std::size_t numStages = blobReader.getStageCount();
auto blobHeader = blobReader.getHeader();
openDevice(devicePool);
_executor->allocateGraph(_device, _graphDesc, _graphBlob, blobHeader, numStages, networkName, _actualNumExecutors);
_graphMetaData.stagesMeta.resize(numStages);

View File

@ -0,0 +1,37 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "primitive.hpp"
namespace cldnn {
/// @addtogroup cpp_api C++ API
/// @{
/// @addtogroup cpp_topology Network Topology
/// @{
/// @addtogroup cpp_primitives Primitives
/// @{
/// @brief Slice primitive.
/// @details Extracts a sub-tensor from the input as defined by the start/end/step (and optional axes) inputs.
struct slice : public primitive_base<slice> {
CLDNN_DECLARE_PRIMITIVE(slice)
/// @brief Constructs slice primitive.
/// @param id This primitive id.
/// @param inputs List of primitive ids.
slice(const primitive_id& id,
const std::vector<primitive_id>& inputs,
const tensor output_shape,
const primitive_id& ext_prim_id = "",
const padding& output_padding = padding())
: primitive_base{id, inputs, ext_prim_id, output_padding},
output_shape {output_shape}
{}
tensor output_shape;
};
/// @}
/// @}
/// @}
} // namespace cldnn

View File

@ -24,6 +24,7 @@ private:
debug_configuration();
public:
static const char *prefix;
int help; // Print help messages
int verbose; // Verbose execution
int print_multi_kernel_perf; // Print execution time of each kernel in multi-kernel primitive
int disable_usm; // Disable usm usage
@ -34,6 +35,7 @@ public:
std::string dump_layers; // Dump intermediate buffers of specified layers only, separated by space
std::string dry_run_path; // Dry run and serialize execution graph into the specified path
int dump_layers_dst_only; // Dump only output of layers
int dump_layers_limit_batch; // Limit the size of batch to dump
int base_batch_for_memory_estimation; // Base batch size to be used in memory estimation
static const debug_configuration *get_instance();
};

View File

@ -55,6 +55,7 @@ enum class KernelType {
DEPTH_TO_SPACE,
BATCH_TO_SPACE,
SHUFFLE_CHANNELS,
SLICE,
STRIDED_SLICE,
REVERSE_SEQUENCE,
BINARY_CONVOLUTION,

View File

@ -0,0 +1,111 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include"slice_kernel_ref.h"
#include <kernel_selector_utils.h>
#include <vector>
namespace {
void addJitConstantsForAttribute(kernel_selector::JitConstants &jit,
const std::string &name, const std::vector<std::int32_t> &attribute) {
using namespace kernel_selector;
jit.AddConstant(MakeJitConstant(name + "_BATCH", attribute[0]));
jit.AddConstant(MakeJitConstant(name + "_FEATURE", attribute[1]));
if (attribute.size() == 5) { // BFZYX
jit.AddConstant(MakeJitConstant(name + "_Z", attribute[2]));
jit.AddConstant(MakeJitConstant(name + "_Y", attribute[3]));
jit.AddConstant(MakeJitConstant(name + "_X", attribute[4]));
} else { // BFYX
jit.AddConstant(MakeJitConstant(name + "_Y", attribute[2]));
jit.AddConstant(MakeJitConstant(name + "_X", attribute[3]));
}
}
} // anonymous namespace
namespace kernel_selector {
KernelsData SliceKernelRef::GetKernelsData(const Params &params,
const optional_params &options) const {
if (!Validate(params, options)) {
return {};
}
KernelData kernel_data = KernelData::Default<slice_params>(params);
slice_params &new_params =
dynamic_cast<slice_params&>(*kernel_data.params.get());
auto dispatch_data = SetDefault(new_params, options);
auto entry_point = GetEntryPoint(kernelName, new_params.layerID, params, options);
auto slice_specific_jit = GetJitConstants(new_params);
auto jit = CreateJit(kernelName, slice_specific_jit, entry_point);
FillCLKernelData(kernel_data.kernels[0], dispatch_data, params.engineInfo,
kernelName, jit, entry_point);
return {kernel_data};
}
KernelsPriority SliceKernelRef::GetKernelsPriority(const Params&/*params*/,
const optional_params&/*options*/) const {
return DONT_USE_IF_HAVE_SOMETHING_ELSE;
}
ParamsKey SliceKernelRef::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableInputDataType(Datatype::F16);
k.EnableInputDataType(Datatype::F32);
k.EnableInputDataType(Datatype::INT32);
k.EnableInputDataType(Datatype::INT64);
k.EnableOutputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::INT32);
k.EnableOutputDataType(Datatype::INT64);
k.EnableInputLayout(DataLayout::bfyx);
k.EnableInputLayout(DataLayout::bfzyx);
k.EnableOutputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::bfzyx);
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableBatching();
return k;
}
bool SliceKernelRef::Validate(const Params &p, const optional_params &o) const {
if (p.GetType() != KernelType::SLICE || o.GetType() != KernelType::SLICE) {
return false;
}
const slice_params &params = dynamic_cast<const slice_params&>(p);
if (params.inputs.empty())
return false;
if (params.output.Dimentions() > 5 || params.inputs[0].Dimentions() > 5)
return false;
return true;
}
JitConstants SliceKernelRef::GetJitConstants(const slice_params &params) const {
JitConstants jit = MakeBaseParamsJitConstants(params);
addJitConstantsForAttribute(jit, "SLICE_BEGIN", params.start);
addJitConstantsForAttribute(jit, "SLICE_END", params.end);
addJitConstantsForAttribute(jit, "SLICE_STEP", params.step);
return jit;
}
CommonDispatchData SliceKernelRef::SetDefault(const slice_params &params,
const optional_params&) const {
CommonDispatchData dispatchData;
dispatchData.gws = { params.output.Batch().v, params.output.Feature().v,
params.output.Z().v * params.output.Y().v * params.output.X().v };
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws,
params.engineInfo);
return dispatchData;
}
} // namespace kernel_selector

View File

@ -0,0 +1,42 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "kernel_base_opencl.h"
#include <vector>
namespace kernel_selector {
struct slice_params: public base_params {
slice_params() : base_params(KernelType::SLICE) {}
std::vector<std::int32_t> start;
std::vector<std::int32_t> end;
std::vector<std::int32_t> step;
};
struct slice_optional_params : optional_params {
slice_optional_params() : optional_params(KernelType::SLICE) {}
};
class SliceKernelRef: public KernelBaseOpenCL {
public:
SliceKernelRef() :
KernelBaseOpenCL { "slice_ref" } {
}
KernelsData GetKernelsData(const Params &params,
const optional_params &options) const override;
KernelsPriority GetKernelsPriority(const Params &params,
const optional_params &options) const override;
ParamsKey GetSupportedKey() const override;
bool Validate(const Params &p, const optional_params &o) const override;
private:
JitConstants GetJitConstants(const slice_params &params) const;
CommonDispatchData SetDefault(const slice_params &params,
const optional_params&) const;
};
} // namespace kernel_selector

View File

@ -0,0 +1,18 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "slice_kernel_selector.h"
#include "slice_kernel_ref.h"
namespace kernel_selector {
slice_kernel_selector::slice_kernel_selector() {
Attach<SliceKernelRef>();
}
KernelsData slice_kernel_selector::GetBestKernels(const Params &params,
const optional_params &options) const {
return GetNaiveBestKernel(params, options, KernelType::SLICE);
}
} // namespace kernel_selector

View File

@ -0,0 +1,23 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <kernel_selector.h>
namespace kernel_selector {
class slice_kernel_selector : public kernel_selector_base {
public:
static slice_kernel_selector& Instance() {
static slice_kernel_selector instance_;
return instance_;
}
slice_kernel_selector();
KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
};
} // namespace kernel_selector

View File

@ -0,0 +1,36 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/fetch_data.cl"
KERNEL(slice_ref)(const __global INPUT0_TYPE* input, __global OUTPUT_TYPE* output)
{
const uint batch = get_global_id(0);
const uint feature = get_global_id(1);
#if INPUT0_DIMS <= 4
const uint xy = get_global_id(2);
const uint y = xy / OUTPUT_SIZE_X;
const uint x = xy % OUTPUT_SIZE_X;
const uint output_index = OUTPUT_GET_INDEX(batch, feature, y, x);
const uint input_index = INPUT0_GET_INDEX(
SLICE_BEGIN_BATCH + batch * SLICE_STEP_BATCH,
SLICE_BEGIN_FEATURE + feature * SLICE_STEP_FEATURE,
SLICE_BEGIN_Y + y * SLICE_STEP_Y,
SLICE_BEGIN_X + x * SLICE_STEP_X);
#elif INPUT0_DIMS == 5
const uint xyz = get_global_id(2);
const uint yx = xyz % (OUTPUT_SIZE_X * OUTPUT_SIZE_Y);
const uint z = xyz / (OUTPUT_SIZE_X * OUTPUT_SIZE_Y);
const uint y = yx / OUTPUT_SIZE_X;
const uint x = yx % OUTPUT_SIZE_X;
const uint output_index = OUTPUT_GET_INDEX(batch, feature, z, y, x);
const uint input_index = INPUT0_GET_INDEX(
SLICE_BEGIN_BATCH + batch * SLICE_STEP_BATCH,
SLICE_BEGIN_FEATURE + feature * SLICE_STEP_FEATURE,
SLICE_BEGIN_Z + z * SLICE_STEP_Z,
SLICE_BEGIN_Y + y * SLICE_STEP_Y,
SLICE_BEGIN_X + x * SLICE_STEP_X);
#endif
output[output_index] = ACTIVATION(input[input_index], ACTIVATION_PARAMS);
}

View File

@ -3,7 +3,9 @@
//
#include "intel_gpu/runtime/debug_configuration.hpp"
#include <algorithm>
#include <iostream>
#include <iomanip>
#include <memory>
#include <vector>
#include <sstream>
@ -97,10 +99,39 @@ void get_common_debug_env_var(const std::string &var, T &val) {
return get_debug_env_var(var, val, allowed_option_prefixes);
}
static void print_help_messages() {
std::vector<std::pair<std::string, std::string>> message_list;
message_list.emplace_back("OV_GPU_Help", "Print help messages");
message_list.emplace_back("OV_GPU_Verbose", "Verbose execution");
message_list.emplace_back("OV_GPU_PrintMultiKernelPerf", "Print execution time of each kernel in multi-kernel primitimive");
message_list.emplace_back("OV_GPU_DisableUsm", "Disable usm usage");
message_list.emplace_back("OV_GPU_DisableOnednn", "Disable onednn for discrete GPU (no effect for integrated GPU)");
message_list.emplace_back("OV_GPU_DumpGraphs", "Dump optimized graph");
message_list.emplace_back("OV_GPU_DumpSources", "Dump opencl sources");
message_list.emplace_back("OV_GPU_DumpLayersPath", "Enable dumping intermediate buffers and set the dest path");
message_list.emplace_back("OV_GPU_DumpLayers", "Dump intermediate buffers of specified layers only, separated by space");
message_list.emplace_back("OV_GPU_DumpLayersDstOnly", "Dump only output of layers");
message_list.emplace_back("OV_GPU_DumpLayersLimitBatch", "Limit the size of batch to dump");
message_list.emplace_back("OV_GPU_DryRunPath", "Dry run and serialize execution graph into the specified path");
message_list.emplace_back("OV_GPU_BaseBatchForMemEstimation", "Base batch size to be used in memory estimation");
auto max_name_length_item = std::max_element(message_list.begin(), message_list.end(),
[](std::pair<std::string, std::string>& a, std::pair<std::string, std::string>& b){
return a.first.size() < b.first.size();
});
int name_width = static_cast<int>(max_name_length_item->first.size()) + 2;
GPU_DEBUG_COUT << "Supported environment variables for debugging" << std::endl;
for (auto& p : message_list) {
GPU_DEBUG_COUT << " - " << std::left << std::setw(name_width) << p.first + ": " << p.second << std::endl;
}
}
#endif
debug_configuration::debug_configuration()
: verbose(0)
: help(0)
, verbose(0)
, print_multi_kernel_perf(0)
, disable_usm(0)
, dump_graphs(std::string())
@ -110,8 +141,10 @@ debug_configuration::debug_configuration()
, dump_layers_dst_only(0)
, dry_run_path(std::string())
, disable_onednn(0)
, dump_layers_limit_batch(std::numeric_limits<int>::max())
, base_batch_for_memory_estimation(-1) {
#ifdef GPU_DEBUG_CONFIG
get_gpu_debug_env_var("Help", help);
get_common_debug_env_var("Verbose", verbose);
get_gpu_debug_env_var("PrintMultiKernelPerf", print_multi_kernel_perf);
get_gpu_debug_env_var("DisableUsm", disable_usm);
@ -120,10 +153,16 @@ debug_configuration::debug_configuration()
get_gpu_debug_env_var("DumpLayersPath", dump_layers_path);
get_gpu_debug_env_var("DumpLayers", dump_layers);
get_gpu_debug_env_var("DumpLayersDstOnly", dump_layers_dst_only);
get_gpu_debug_env_var("DumpLayersLimitBatch", dump_layers_limit_batch);
get_gpu_debug_env_var("DisableOnednn", disable_onednn);
get_gpu_debug_env_var("DryRunPath", dry_run_path);
get_gpu_debug_env_var("BaseBatchForMemEstimation", base_batch_for_memory_estimation);
if (help > 0) {
print_help_messages();
exit(0);
}
if (dump_layers.length() > 0)
dump_layers = " " + dump_layers + " "; // Insert delimiter for easier parsing when used
#endif

View File

@ -7,12 +7,34 @@
#include "pass_manager.h"
#include "program_node.h"
#ifdef ENABLE_ONEDNN_FOR_GPU
#include "fully_connected_inst.h"
#include <impls/onednn/utils.hpp>
#endif
using namespace cldnn;
void add_onednn_optimization_attributes::run(program& p) {
#ifdef ENABLE_ONEDNN_FOR_GPU
for (auto& node : p.get_processing_order()) {
if (node->get_preferred_impl_type() == impl_types::onednn) {
if (node->is_type<fully_connected>()) {
auto fc_prim = node->as<fully_connected>().get_primitive();
// Reshape fused ops tensors for OneDNN FC if needed
if (fc_prim->input_size == 3) {
for (auto& fused_prim : node->get_fused_primitives()) {
auto fused_node = fused_prim.node;
if (fused_node->is_type<eltwise>()) {
auto& dependency = node->get_dependency(fused_prim.dep_start_idx);
auto original_layout = dependency.get_output_layout();
onednn::combine_bf_with_first_spatial_dim(original_layout);
dependency.set_output_layout(original_layout, false);
}
}
}
}
node->init_onednn_primitive_attributes();
}
}

View File

@ -436,10 +436,5 @@ void graph_initializations::run(program& p) {
}
set_outputs(p);
p.get_processing_order().calc_processing_order(p);
for (auto& node : p.get_processing_order()) {
if (!node->is_type<data>())
node->get_output_layout();
}
}
} // namespace cldnn

View File

@ -65,6 +65,7 @@ void register_implementations() {
REGISTER_OCL(softmax);
REGISTER_OCL(space_to_batch);
REGISTER_OCL(space_to_depth);
REGISTER_OCL(slice);
REGISTER_OCL(strided_slice);
REGISTER_OCL(tile);
REGISTER_OCL(lstm_dynamic_input);

View File

@ -53,6 +53,7 @@
#include "intel_gpu/primitives/scatter_nd_update.hpp"
#include "intel_gpu/primitives/select.hpp"
#include "intel_gpu/primitives/shuffle_channels.hpp"
#include "intel_gpu/primitives/slice.hpp"
#include "intel_gpu/primitives/softmax.hpp"
#include "intel_gpu/primitives/space_to_batch.hpp"
#include "intel_gpu/primitives/strided_slice.hpp"
@ -73,7 +74,7 @@ void register_implementations();
namespace detail {
#define REGISTER_OCL(prim) \
#define REGISTER_OCL(prim) \
struct attach_##prim##_impl { \
attach_##prim##_impl(); \
}
@ -130,6 +131,7 @@ REGISTER_OCL(scatter_elements_update);
REGISTER_OCL(scatter_nd_update);
REGISTER_OCL(select);
REGISTER_OCL(shuffle_channels);
REGISTER_OCL(slice);
REGISTER_OCL(softmax);
REGISTER_OCL(space_to_batch);
REGISTER_OCL(space_to_depth);

View File

@ -0,0 +1,138 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <slice_inst.h>
#include <slice/slice_kernel_ref.h>
#include <data_inst.h>
#include <intel_gpu/runtime/error_handler.hpp>
#include <impls/implementation_map.hpp>
#include <slice/slice_kernel_selector.h>
#include "primitive_base.hpp"
#include <vector>
#include <algorithm>
#include <cstddef>
namespace cldnn {
namespace ocl {
namespace {
template<typename T, class = typename std::enable_if<std::is_integral<T>::value>::type>
std::vector<std::int32_t> extractIntegerData(const data_node& node, const stream& stream) {
mem_lock<T> lock{node.get_attached_memory_ptr(), stream};
T* data = lock.data();
std::vector<std::int32_t> integer_data;
integer_data.reserve(node.get_output_layout().count());
std::copy(data, data + node.get_output_layout().count(), std::back_inserter(integer_data));
return integer_data;
}
std::vector<std::int32_t> extractIntegerData(const data_node& node, const stream& stream) {
switch (node.get_output_layout().data_type) {
case data_types::u8:
return extractIntegerData<std::uint8_t>(node, stream);
case data_types::i8:
return extractIntegerData<std::int8_t>(node, stream);
case data_types::i32:
return extractIntegerData<std::int32_t>(node, stream);
case data_types::i64:
return extractIntegerData<std::int64_t>(node, stream);
default:
CLDNN_ERROR_DATA_TYPES_MISMATCH(node.id(), "Slice parameter",
node.get_output_layout().data_type, "Any integral type",
data_types::i32, "Slice parameters should be of integral type.");
}
return {};
}
std::vector<std::int32_t> extractShape(kernel_selector::Tensor::DataTensor& tensor) {
auto logical_dims = tensor.LogicalDims();
// LogicalDims method returns dims in reversed order
return {logical_dims.rbegin(), logical_dims.rend()};
}
} // namespace
struct slice_impl : typed_primitive_impl_ocl<slice> {
using parent = typed_primitive_impl_ocl<slice>;
using parent::parent;
enum InputIndices {
kData,
kStart,
kEnd,
kStep,
kAxes,
kInputsNum
};
std::unique_ptr<primitive_impl> clone() const override {
return make_unique<slice_impl>(*this);
}
static primitive_impl* create(const slice_node& arg) {
auto params = get_default_params<kernel_selector::slice_params>(
arg);
auto op_params = get_default_optional_params<
kernel_selector::slice_optional_params>(
arg.get_program());
const auto& inputs = arg.get_dependencies();
const stream& stream = arg.get_program().get_stream();
auto start_elts = extractIntegerData(inputs[InputIndices::kStart]->as<data>(), stream);
auto end_elts = extractIntegerData(inputs[InputIndices::kEnd]->as<data>(), stream);
auto step_elts = extractIntegerData(inputs[InputIndices::kStep]->as<data>(), stream);
auto data_shape = extractShape(params.inputs[0]);
std::vector<std::int32_t> axes(data_shape.size());
if (inputs.size() == InputIndices::kInputsNum)
axes = std::move(extractIntegerData(inputs[InputIndices::kAxes]->as<data>(), stream));
else
std::iota(axes.begin(), axes.end(), 0);
std::vector<std::int32_t> selected_start(data_shape.size(), 0);
std::vector<std::int32_t> selected_step(data_shape.size(), 1);
std::vector<std::int32_t> selected_end(data_shape);
for (int axe = 0; axe < axes.size(); axe++) {
auto transformed_axe = axes[axe] < 0 ? data_shape.size() + axes[axe] : axes[axe];
auto start = start_elts[axe];
auto end = end_elts[axe];
auto dim_size = data_shape[transformed_axe];
selected_start[transformed_axe] = std::max(std::min(start < 0 ? dim_size + start : start, dim_size - 1), 0);
selected_end[transformed_axe] = std::max(std::min(end < 0 ? dim_size + end : end, dim_size - 1), 0);
selected_step[transformed_axe] = step_elts[axe];
}
params.start = std::move(selected_start);
params.end = std::move(selected_end);
params.step = std::move(selected_step);
auto &kernel_selector =
kernel_selector::slice_kernel_selector::Instance();
auto best_kernels = kernel_selector.GetBestKernels(params, op_params);
CLDNN_ERROR_BOOL(arg.id(), "Best_kernel.empty()", best_kernels.empty(),
"Cannot find a proper kernel with this arguments");
return new slice_impl(arg, best_kernels[0]);
}
};
namespace detail {
attach_slice_impl::attach_slice_impl() {
implementation_map<slice>::add(impl_types::ocl, slice_impl::create, {
std::make_tuple(data_types::f16, format::bfyx),
std::make_tuple(data_types::f32, format::bfyx),
std::make_tuple(data_types::u8, format::bfyx),
std::make_tuple(data_types::i8, format::bfyx),
std::make_tuple(data_types::i32, format::bfyx),
std::make_tuple(data_types::i64, format::bfyx),
std::make_tuple(data_types::f16, format::bfzyx),
std::make_tuple(data_types::f32, format::bfzyx),
std::make_tuple(data_types::u8, format::bfzyx),
std::make_tuple(data_types::i8, format::bfzyx),
std::make_tuple(data_types::i32, format::bfzyx),
std::make_tuple(data_types::i64, format::bfzyx),
});
}
} // namespace detail
} // namespace ocl
} // namespace cldnn
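As a reading aid, here is a standalone sketch (an assumption for illustration only, not part of the plugin sources) of the start/end/step normalization that slice_impl::create above applies before handing the parameters to the kernel selector: negative axes and coordinates are wrapped by the dimension size, coordinates are clamped to [0, dim - 1], and untouched axes keep the defaults start = 0, end = dim, step = 1.

```
#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative sketch of the coordinate normalization in slice_impl::create above.
struct normalized_slice {
    std::vector<std::int32_t> start, end, step;
};

normalized_slice normalize(const std::vector<std::int32_t>& shape,
                           const std::vector<std::int32_t>& axes,
                           const std::vector<std::int32_t>& starts,
                           const std::vector<std::int32_t>& ends,
                           const std::vector<std::int32_t>& steps) {
    // Defaults: start = 0, end = full extent, step = 1 on every axis.
    normalized_slice out{std::vector<std::int32_t>(shape.size(), 0),
                         shape,
                         std::vector<std::int32_t>(shape.size(), 1)};
    const auto rank = static_cast<std::int32_t>(shape.size());
    for (std::size_t i = 0; i < axes.size(); ++i) {
        const std::int32_t axis = axes[i] < 0 ? rank + axes[i] : axes[i];
        const std::int32_t dim = shape[axis];
        // Wrap negative coordinates by the dimension size, then clamp to [0, dim - 1].
        auto norm = [dim](std::int32_t v) {
            v = v < 0 ? dim + v : v;
            if (v < 0) v = 0;
            if (v > dim - 1) v = dim - 1;
            return v;
        };
        out.start[axis] = norm(starts[i]);
        out.end[axis] = norm(ends[i]);
        out.step[axis] = steps[i];
    }
    return out;
}

// Example: dim = 12, start = -1, end = -100  ->  start = 11, end = 0,
// which the kernel then walks with the (negative) step supplied by the user.
```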

View File

@ -128,20 +128,6 @@ public:
static primitive_impl* create(const fully_connected_node& arg) {
auto& engine = arg.get_program().get_engine();
auto desc = get_fully_connected_descriptor(arg);
auto prim = arg.get_primitive();
if (prim->input_size == 3) {
for (auto& fused_node : arg.get_fused_primitives()) {
auto node = fused_node.node;
if (node->is_type<eltwise>()) {
auto& dependency = arg.get_dependency(fused_node.dep_start_idx);
auto original_layout = dependency.get_output_layout();
onednn::combine_bf_with_first_spatial_dim(original_layout);
dependency.set_output_layout(original_layout, false);
}
}
}
auto attr = arg.get_onednn_primitive_attributes();
dnnl::primitive_desc prim_desc{&desc->data, attr.get(), engine.get_onednn_engine(), nullptr};

View File

@ -0,0 +1,38 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <intel_gpu/primitives/slice.hpp>
#include "primitive_inst.h"
#include <intel_gpu/runtime/error_handler.hpp>
namespace cldnn {
template <>
struct typed_program_node<slice> : public typed_program_node_base<slice> {
using parent = typed_program_node_base<slice>;
public:
using parent::parent;
program_node& input(std::size_t index = 0) const { return get_dependency(index); }
};
using slice_node = typed_program_node<slice>;
template <>
class typed_primitive_inst<slice> : public typed_primitive_inst_base<slice> {
using parent = typed_primitive_inst_base<slice>;
public:
static layout calc_output_layout(slice_node const& node);
static std::string to_string(slice_node const& node);
public:
typed_primitive_inst(network& network, slice_node const& desc);
};
using slice_inst = typed_primitive_inst<slice>;
} // namespace cldnn

View File

@ -110,8 +110,18 @@ template <class T>
static void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
auto&& size = mem->get_layout().size;
file_stream << "shape: " << size.to_string() << " ";
file_stream << "(count: " << size.count() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")" << std::endl;
GPU_DEBUG_GET_INSTANCE(debug_config);
auto batch_size = std::max(std::min(debug_config->dump_layers_limit_batch, size.batch[0]), 1);
tensor tmp_size(size);
tmp_size.batch[0] = batch_size;
if (tmp_size == size) {
file_stream << "shape: " << size.to_string() << " ";
file_stream << "(count: " << size.count() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")" << std::endl;
} else {
file_stream << "shape: " << tmp_size.to_string() << " ";
file_stream << "(count: " << tmp_size.count() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format)
<< ", original shape: " << size.to_string() << ")" << std::endl;
}
mem_lock<T, mem_lock_type::read> lock(mem, stream);
auto mem_ptr = lock.data();
@ -119,7 +129,7 @@ static void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
std::stringstream buffer;
for (cldnn::tensor::value_type g = 0; g < size.group[0]; ++g) {
for (cldnn::tensor::value_type b = 0; b < size.batch[0]; ++b) {
for (cldnn::tensor::value_type b = 0; b < batch_size; ++b) {
for (cldnn::tensor::value_type f = 0; f < size.feature[0]; ++f) {
for (cldnn::tensor::value_type w = 0; w < size.spatial[3]; ++w) {
for (cldnn::tensor::value_type z = 0; z < size.spatial[2]; ++z) {

View File

@ -1225,8 +1225,13 @@ program::primitives_info program::get_current_stage_info() const {
void program::save_pass_info(std::string pass_name) {
// TODO: Directory path here can be probably changed to some bool flag
if (!options.get<build_option_type::graph_dumps_dir>()->directory_path.empty())
if (!options.get<build_option_type::graph_dumps_dir>()->directory_path.empty()) {
for (auto& node : this->get_processing_order()) {
if (!node->is_type<data>())
node->get_output_layout();
}
optimizer_passes_info.emplace_back(pass_name, get_current_stage_info());
}
}
void program::add_optimized_primitive_info(primitive_id optimized_primitive_id,

View File

@ -0,0 +1,40 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <slice_inst.h>
#include "primitive_type_base.h"
#include <sstream>
#include <json_object.h>
namespace cldnn {
primitive_type_id slice::type_id() {
static primitive_type_base<slice> instance;
return &instance;
}
slice_inst::typed_primitive_inst(network& network, slice_node const& node)
: parent(network, node) {}
layout slice_inst::calc_output_layout(slice_node const& node) {
auto primitive = node.get_primitive();
auto input_layout = node.input(0).get_output_layout();
return {input_layout.data_type, input_layout.format, primitive->output_shape};
}
std::string slice_inst::to_string(slice_node const& node) {
auto node_info = node.desc_to_json();
json_composite slice_info;
slice_info.add("input id", node.input().id());
slice_info.add("begin_param id", node.get_dependency(1).id());
slice_info.add("end_param id", node.get_dependency(2).id());
slice_info.add("step_param id", node.get_dependency(3).id());
slice_info.add("axis_param id", node.get_dependency(4).id());
node_info->add("slice info", slice_info);
std::stringstream primitive_description;
node_info->dump(primitive_description);
return primitive_description.str();
}
} // namespace cldnn

View File

@ -608,6 +608,7 @@ public:
#define CASE_FC_U8S8_3D_1 {2, 32, 1, 3}, {2, 32, 1, 16}, {16, 3, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_U8S8_3D_2 {1, 1, 1, 3}, {1, 1, 1, 32}, {32, 3, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_U8S8_3D_3 {2, 3, 1, 1}, {2, 3, 1, 15}, {15, 1, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_U8S8_3D_4 {1, 512, 1, 1024}, {1, 384, 1, 1024}, {1024, 1024, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
#define CASE_NORMALIZE_I8_1 {1, 2, 3, 3}, data_types::u8, format::bfyx, data_types::f32, format::bfyx
@ -9258,7 +9259,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, gather_elements_activation_scale_eltwise,
}));
#ifdef ENABLE_ONEDNN_FOR_GPU
class ConvFusingTestOneDNN : public WeightsPrimitiveFusingTest<bc_test_params> {
class WeightsPrimitiveFusingTestOneDNN : public WeightsPrimitiveFusingTest<bc_test_params> {
public:
void execute(bc_test_params& p) {
// Onednn post operation has issue in a machine that does not support imad.
@ -9299,7 +9300,7 @@ public:
}
};
class conv_int8_eltwise_onednn : public ConvFusingTestOneDNN {};
class conv_int8_eltwise_onednn : public WeightsPrimitiveFusingTestOneDNN {};
TEST_P(conv_int8_eltwise_onednn, u8_eltwise_sum_out) {
auto p = GetParam();
@ -9364,7 +9365,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_eltwise_onednn,
bc_test_params{CASE_CONV3D_S8S8_5, 3, 4},
}));
class conv_fp32_activation_abs_onednn : public ConvFusingTestOneDNN {};
class conv_fp32_activation_abs_onednn : public WeightsPrimitiveFusingTestOneDNN {};
TEST_P(conv_fp32_activation_abs_onednn, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
@ -9387,7 +9388,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_activation_abs_onednn,
bc_test_params{CASE_CONV_FP16_4, 2, 3},
}));
class conv_fp32_activation_mish_onednn : public ConvFusingTestOneDNN {};
class conv_fp32_activation_mish_onednn : public WeightsPrimitiveFusingTestOneDNN {};
TEST_P(conv_fp32_activation_mish_onednn, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
@ -9410,7 +9411,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_activation_mish_onednn,
bc_test_params{CASE_CONV_FP16_4, 2, 3},
}));
class conv_fp32_activation_swish_onednn : public ConvFusingTestOneDNN {};
class conv_fp32_activation_swish_onednn : public WeightsPrimitiveFusingTestOneDNN {};
TEST_P(conv_fp32_activation_swish_onednn, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
@ -9433,7 +9434,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_activation_swish_onednn,
bc_test_params{CASE_CONV_FP16_4, 2, 3},
}));
class conv_fp32_activation_hswish_onednn : public ConvFusingTestOneDNN {};
class conv_fp32_activation_hswish_onednn : public WeightsPrimitiveFusingTestOneDNN {};
TEST_P(conv_fp32_activation_hswish_onednn, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
@ -9456,7 +9457,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_activation_hswish_onednn,
bc_test_params{CASE_CONV_FP16_4, 2, 3},
}));
class conv_fp32_activation_exp_onednn : public ConvFusingTestOneDNN {};
class conv_fp32_activation_exp_onednn : public WeightsPrimitiveFusingTestOneDNN {};
TEST_P(conv_fp32_activation_exp_onednn, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
@ -9479,7 +9480,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_activation_exp_onednn,
bc_test_params{CASE_CONV_FP16_4, 2, 3},
}));
class conv_int8_quantize_u8_onednn : public ConvFusingTestOneDNN {};
class conv_int8_quantize_u8_onednn : public WeightsPrimitiveFusingTestOneDNN {};
TEST_P(conv_int8_quantize_u8_onednn, per_channel) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
@ -9526,7 +9527,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_quantize_u8_onednn,
bc_test_params{CASE_CONV_S8S8_3, 2, 3},
}));
class conv_int8_activation_eltwise_quantize_onednn : public ConvFusingTestOneDNN {};
class conv_int8_activation_eltwise_quantize_onednn : public WeightsPrimitiveFusingTestOneDNN {};
TEST_P(conv_int8_activation_eltwise_quantize_onednn, bsv32_fsv32) {
auto p = GetParam();
layout eltwise_layout = get_output_layout(p);
@ -9578,7 +9579,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_activation_eltwise_quantize_oned
bc_test_params{CASE_CONV_S8S8_15, 2, 5},
}));
class conv_int8_scale_shift_swish_onednn : public ConvFusingTestOneDNN {};
class conv_int8_scale_shift_swish_onednn : public WeightsPrimitiveFusingTestOneDNN {};
TEST_P(conv_int8_scale_shift_swish_onednn, bsv32_fsv32) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
@ -9617,7 +9618,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_scale_shift_swish_onednn,
bc_test_params{CASE_CONV_S8S8_15, 2, 7},
}));
class conv_int8_eltwise_scale_onednn : public ConvFusingTestOneDNN {};
class conv_int8_eltwise_scale_onednn : public WeightsPrimitiveFusingTestOneDNN {};
TEST_P(conv_int8_eltwise_scale_onednn, u8_eltwise_prod_out_reuse) {
auto p = GetParam();
@ -9667,7 +9668,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_eltwise_scale_onednn,
// Limitations: no
// DNNL_VERBOSE log without optimization: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_linear:1:-128
// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_linear:12.75:-0.5
class post_ops_optimizations_onednn_eltw_linear_eltw_linear : public ConvFusingTestOneDNN {};
class post_ops_optimizations_onednn_eltw_linear_eltw_linear : public WeightsPrimitiveFusingTestOneDNN {};
TEST_P(post_ops_optimizations_onednn_eltw_linear_eltw_linear, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
@ -9718,7 +9719,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_eltw_linear_
// Limitations: beta = 0 in eltw_linear
// DNNL_VERBOSE log without optimization: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_round+eltwise_linear:2.00784+eltwise_clip:0:512
// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_round:0:0:2.00784+eltwise_clip:0:512
class post_ops_optimizations_onednn_eltw_non_linear_eltw_linear : public ConvFusingTestOneDNN {};
class post_ops_optimizations_onednn_eltw_non_linear_eltw_linear : public WeightsPrimitiveFusingTestOneDNN {};
TEST_P(post_ops_optimizations_onednn_eltw_non_linear_eltw_linear, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
@ -9769,7 +9770,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_eltw_non_lin
// Limitations: alpha = 1 and scale = 1 in eltw_linear; binary_add is a constant compile-time buffer
// DNNL_VERBOSE log without optimization: attr-oscale:2 attr-post-ops:binary_add:f32:2+eltwise_linear:1:-127+eltwise_clip:-127:127
// DNNL_VERBOSE log with optimization: attr-oscale:2 attr-post-ops:binary_add:f32:2+eltwise_clip:-127:127
class post_ops_optimizations_onednn_binary_add_eltw_linear : public ConvFusingTestOneDNN {};
class post_ops_optimizations_onednn_binary_add_eltw_linear : public WeightsPrimitiveFusingTestOneDNN {};
TEST_P(post_ops_optimizations_onednn_binary_add_eltw_linear, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
@ -9820,7 +9821,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_binary_add_e
// Limitations: beta = 0 in eltw_linear; binary_mul is a constant compile-time buffer
// DNNL_VERBOSE log without optimization: attr-oscale:2 attr-post-ops:binary_mul:f32:2+eltwise_linear:2.01575+eltwise_clip:0:512
// DNNL_VERBOSE log with optimization: attr-oscale:2 attr-post-ops:binary_mul:f32:2+eltwise_clip:0:512
class post_ops_optimizations_onednn_binary_mul_eltw_linear : public ConvFusingTestOneDNN {};
class post_ops_optimizations_onednn_binary_mul_eltw_linear : public WeightsPrimitiveFusingTestOneDNN {};
TEST_P(post_ops_optimizations_onednn_binary_mul_eltw_linear, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
@ -9871,7 +9872,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_binary_mul_e
// Limitations: beta = 0 in eltw_linear
// DNNL_VERBOSE log without optimization: attr-oscale:2 attr-post-ops:eltwise_linear:2.01575+eltwise_clip:0:512
// DNNL_VERBOSE log with optimization: attr-oscale:2 attr-post-ops:eltwise_clip:0:512
class post_ops_optimizations_onednn_oscale_eltw_linear : public ConvFusingTestOneDNN {};
class post_ops_optimizations_onednn_oscale_eltw_linear : public WeightsPrimitiveFusingTestOneDNN {};
TEST_P(post_ops_optimizations_onednn_oscale_eltw_linear, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
@ -9920,7 +9921,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_oscale_eltw_
// Limitations: beta = 0 in eltw_linear
// DNNL_VERBOSE log without optimization: attr-post-ops:eltwise_relu+sum:1:0:u8+eltwise_linear:12.7+eltwise_clip:0:127
// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_relu:0:0:12.7+sum:12.7:0:u8+eltwise_clip:0:127
class post_ops_optimizations_onednn_eltw_any_sum_eltw_linear : public ConvFusingTestOneDNN {};
class post_ops_optimizations_onednn_eltw_any_sum_eltw_linear : public WeightsPrimitiveFusingTestOneDNN {};
TEST_P(post_ops_optimizations_onednn_eltw_any_sum_eltw_linear, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
@ -9970,7 +9971,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_eltw_any_sum
// Input range uses in 2 cases: not per-tensor output range or out_lo > out_hi
// Here's out_lo > out_hi and no optimizations
// DNNL_VERBOSE log: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_round+eltwise_linear:-1:127
class post_ops_optimizations_input_range : public ConvFusingTestOneDNN {};
class post_ops_optimizations_input_range : public WeightsPrimitiveFusingTestOneDNN {};
TEST_P(post_ops_optimizations_input_range, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
@ -10015,6 +10016,33 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_input_range,
bc_test_params{CASE_CONV_S8S8_14, 2, 3},
bc_test_params{CASE_CONV_S8S8_15, 2, 3},
}));
class fc_int8_inputs_fused_fp32_sum : public WeightsPrimitiveFusingTestOneDNN {};
TEST_P(fc_int8_inputs_fused_fp32_sum, basic) {
auto p = GetParam();
auto shift_layout = layout{ p.default_type, p.default_format, tensor{1, 1, 1, p.kernel.batch[0]} };
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_fc_weights_layout(p))),
data("bias", get_mem(get_fc_bias_layout(p))),
data("shift_data", get_mem(shift_layout, 1)),
fully_connected("fc_prim", "input", "weights", "bias", cldnn::data_types::f32, "", padding(), get_fc_output_dim_size(p)),
eltwise("shift", {"fc_prim", "shift_data"}, eltwise_mode::sum, cldnn::data_types::f32),
crop("crop", "shift", get_output_layout(p).size, {0, 0, 0, 0}),
reorder("reorder_bfyx", "crop", p.default_format, data_types::f32)
);
tolerance = 1e-5f;
execute(p);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_inputs_fused_fp32_sum, ::testing::ValuesIn(std::vector<bc_test_params>{
// OneDNN has an issue with small shapes - ticket 7064
// bc_test_params{ CASE_FC_U8S8_3D_1, 2, 4 },
// bc_test_params{ CASE_FC_U8S8_3D_2, 2, 4 },
bc_test_params{ CASE_FC_U8S8_3D_4, 2, 4 },
}));
#endif

View File

@ -0,0 +1,144 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "test_utils.h"
#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/slice.hpp>
#include <intel_gpu/primitives/data.hpp>
#include <random>
#include <algorithm>
#include <vector>
using namespace cldnn;
using namespace ::tests;
namespace {
template<typename T>
class SliceTest : public ::testing::Test {
public:
static std::vector<T> GenInput(int size) {
std::vector<T> result;
for (int i = 0; i < size; i++)
result.push_back(i);
return result;
}
void TearDown() override {
assert(input_shape_.size() == 4 || input_shape_.size() == 5);
format input_format = input_shape_.size() == 4 ? format::bfyx : format::bfzyx;
layout data_layout ( input_type_, input_format, tensor{input_shape_} );
std::vector<T> input_vals = GenInput(data_layout.get_linear_size());
memory::ptr input = engine_.allocate_memory(data_layout);
set_values(input, input_vals);
topology topology;
topology.add(input_layout("input", input->get_layout()));
topology.add(data("start", start_));
topology.add(data("stop", stop_));
topology.add(data("step", step_));
std::vector<primitive_id> inputs {"input", "start", "stop", "step"};
if (axes_) {
topology.add(data("axes", axes_));
inputs.push_back("axes");
}
topology.add(slice("slice", inputs, tensor{output_shape_}));
network network(engine_, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "slice");
auto output = outputs.at("slice").get_memory();
cldnn::mem_lock<T> output_ptr(output, get_test_stream());
ASSERT_EQ(output_ptr.size(), expected_output_.size());
for (size_t i = 0; i < output_ptr.size(); ++i)
EXPECT_TRUE(are_equal(expected_output_[i], output_ptr[i], 2e-3));
}
data_types DataType() const;
protected:
engine& engine_ = get_test_engine();
std::vector<std::int32_t> input_shape_;
data_types input_type_ {DataType()};
memory::ptr start_;
memory::ptr stop_;
memory::ptr step_;
memory::ptr axes_;
std::vector<std::int32_t> output_shape_;
std::vector<T> expected_output_;
};
template<>
data_types SliceTest<float>::DataType() const {return data_types::f32;}
template<>
data_types SliceTest<int>::DataType() const { return data_types::i32; }
template<>
data_types SliceTest<long long>::DataType() const { return data_types::i64; }
using testing::Types;
typedef Types<float, int, long long> DataTypes;
TYPED_TEST_SUITE(SliceTest, DataTypes);
TYPED_TEST(SliceTest, bfyx_positive_step) {
this->input_shape_ = { 1, 2, 100, 12 };
this->start_ = this->engine_.allocate_memory({ data_types::i64, format::bfyx, { 4, 1, 1, 1 } });
set_values<int64_t>(this->start_, {0, 1, 0, 1});
this->stop_ = this->engine_.allocate_memory({ data_types::i64, format::bfyx, { 4, 1, 1, 1 } });
set_values<int64_t>(this->stop_, { 1, 2, 5, 100 });
this->step_ = this->engine_.allocate_memory({ data_types::i64, format::bfyx, { 4, 1, 1, 1 } });
set_values<int64_t>(this->step_, { 1, 1, 1, 10 });
this->output_shape_ = { 1, 1, 5, 10 };
this->expected_output_ = {
1201, 1211, 1221, 1231, 1241, 1301, 1311, 1321, 1331, 1341,
1401, 1411, 1421, 1431, 1441, 1501, 1511, 1521, 1531, 1541,
1601, 1611, 1621, 1631, 1641, 1701, 1711, 1721, 1731, 1741,
1801, 1811, 1821, 1831, 1841, 1901, 1911, 1921, 1931, 1941,
2001, 2011, 2021, 2031, 2041, 2101, 2111, 2121, 2131, 2141
};
}
TYPED_TEST(SliceTest, bfyx_negative_step) {
this->input_shape_ = { 1, 2, 100, 12 };
this->start_ = this->engine_.allocate_memory({ data_types::i64, format::bfyx, { 4, 1, 1, 1 } });
set_values<int64_t>(this->start_, { 1, 2, 5, 100 });
this->stop_ = this->engine_.allocate_memory({ data_types::i64, format::bfyx, { 4, 1, 1, 1 } });
set_values<int64_t>(this->stop_, {0, 1, 0, 1});
this->step_ = this->engine_.allocate_memory({ data_types::i64, format::bfyx, { 4, 1, 1, 1 } });
set_values<int64_t>(this->step_, { -1, -1, -1, -10 });
this->output_shape_ = { 1, 1, 5, 10 };
this->expected_output_ = {
1799, 1789, 1779, 1769, 1759, 1699, 1689, 1679, 1669, 1659,
1599, 1589, 1579, 1569, 1559, 1499, 1489, 1479, 1469, 1459,
1399, 1389, 1379, 1369, 1359, 1299, 1289, 1279, 1269, 1259,
1199, 1189, 1179, 1169, 1159, 1099, 1089, 1079, 1069, 1059,
999, 989, 979, 969, 959, 899, 889, 879, 869, 859
};
}
TYPED_TEST(SliceTest, bfzyx) {
this->input_shape_ = { 2, 3, 10, 12, 5 };
this->start_ = this->engine_.allocate_memory({ data_types::i64, format::bfzyx, { 5, 1, 1, 1 } });
set_values<int64_t>(this->start_, { 0, 0, 0, 0, 0 });
this->stop_ = this->engine_.allocate_memory({ data_types::i64, format::bfzyx, { 5, 1, 1, 1 } });
set_values<int64_t>(this->stop_, {1, 2, 2, 2, 2});
this->step_ = this->engine_.allocate_memory({ data_types::i64, format::bfzyx, { 5, 1, 1, 1 } });
set_values<int64_t>(this->step_, { 1, 1, 1, 1, 1 });
this->output_shape_ = { 1, 2, 2, 2, 2 };
this->expected_output_ = {
0, 1, 10, 11, 120, 121, 130, 131,
600, 601, 610, 611, 720, 721, 730, 731
};
}
} // anonymous namespace

@ -1 +1 @@
Subproject commit 5adbcb757c77f1bf0cd11ad58dd92e93ea2e3561
Subproject commit acee807d84944008df6741677ab52e01d790d58a

View File

@ -79,11 +79,14 @@ Options:
-h, --help Print a usage message
-m "<path>" Required. Path to an .xml/.onnx/.prototxt file with a trained model or to a .blob files with a trained compiled model.
-i "<path>" Optional. Path to a folder with images and/or binaries or to specific image or binary file.
In case of networks with dynamic shapes and several inputs, provide the same number of files for each input (except when a single file is used for every input):
"input1:1.jpg input2:1.bin", "input1:1.bin,2.bin input2:3.bin input3:4.bin,5.bin".
You can also pass specific keys for inputs: "random" - for filling the input with random data, "image_info" - for filling the input with the image size.
-d "<device>" Optional. Specify a target device to infer on (the list of available devices is shown below). Default value is CPU.
Use "-d HETERO:<comma-separated_devices_list>" format to specify HETERO plugin.
Use "-d MULTI:<comma-separated_devices_list>" format to specify MULTI plugin.
Use "-d GPU.X" format to specify device id for GPU devices.
The application looks for a suitable plugin for the specified device.
The application looks for a suitable plugin for the specified device.
-l "<absolute_path>" Required for CPU custom layers. Absolute path to a shared library with the kernels implementations.
Or
-c "<absolute_path>" Required for GPU custom kernels. Absolute path to an .xml file with the kernels description.
@ -99,11 +102,23 @@ Options:
-stream_output Optional. Print progress as a plain text. When specified, an interactive progress bar is replaced with a multiline output.
-t Optional. Time, in seconds, to execute topology.
-progress Optional. Show progress bar (can affect performance measurement). Default values is "false".
-shape Optional. Set shape for input. For example, "input1[1,3,224,224],input2[1,4]" or "[1,3,224,224]" in case of one input size.
-shape Optional. Set shape for network input. For example, "input1[1,3,224,224],input2[1,4]" or "[1,3,224,224]" in case of one input size.
This parameter affects the model input shape and can be dynamic. For dynamic dimensions use symbol `?` or '-1'. Ex. [?,3,?,?].
For bounded dimensions specify range 'min..max'. Ex. [1..10,3,?,?].
-data_shape Required for networks with dynamic shapes. Set shape for input blobs.
In case of one input size: "[1,3,224,224]" or "input1[1,3,224,224],input2[1,4]".
In case of several input sizes provide the same number of shapes for
each input (except when a single shape is used for every input): "[1,3,128,128][3,3,128,128][1,3,320,320]",
"input1[1,1,128,128][1,1,256,256],input2[80,1]" or "input1[1,192][1,384],input2[1,192][1,384],input3[1,192][1,384],input4[1,192][1,384]".
If network shapes are all static, specifying the option will cause an exception.
-layout Optional. Prompts how network layouts should be treated by application. For example, "input1[NCHW],input2[NC]" or "[NCHW]" in case of one input size.
-cache_dir "<path>" Optional. Enables caching of loaded models to specified directory.
-load_from_file Optional. Loads model from file directly without ReadNetwork.
-latency_percentile Optional. Defines the percentile to be reported in latency metric. The valid range is [1, 100]. The default value is 50 (median).
-inference_only Optional. Measure only the inference stage. Default option for static models.
Dynamic models are measured in full mode, which includes the inputs setup stage;
inference-only mode is available for them only with a single input data shape.
To enable full mode for static models, pass "false" to this argument, e.g. -inference_only=false.
CPU-specific performance options:
-nstreams "<integer>" Optional. Number of streams to use for inference on the CPU, GPU or MYRIAD devices
@ -117,16 +132,19 @@ Options:
-enforcebf16="<true/false>" Optional. By default floating point operations execution in bfloat16 precision are enforced if supported by platform.
-pin "YES"/"HYBRID_AWARE"/"NUMA"/"NO"
Optional. Explicit inference threads binding options (leave empty to let the OpenVINO to make a choice):
enabling threads->cores pinning ("YES", which is already default for a conventional CPU),
letting the runtime to decide on the threads->different core types ("HYBRID_AWARE", which is default on the hybrid CPUs)
threads->(NUMA)nodes ("NUMA") or
completely disable ("NO") CPU inference threads pinning.
enabling threads->cores pinning ("YES", which is already default for a conventional CPU),
letting the runtime to decide on the threads->different core types ("HYBRID_AWARE", which is default on the hybrid CPUs)
threads->(NUMA)nodes ("NUMA") or completely disable ("NO") CPU inference threads pinning.
-ip "U8"/"FP16"/"FP32" Optional. Specifies precision for all input layers of the network.
-op "U8"/"FP16"/"FP32" Optional. Specifies precision for all output layers of the network.
-iop Optional. Specifies precision for input and output layers by name. Example: -iop "input:FP16, output:FP16". Notice that quotes are required. Overwrites precision from ip and op options for specified layers.
-iop Optional. Specifies precision for input and output layers by name. Example: -iop "input:FP16, output:FP16". Notice that quotes are required.
Overwrites precision from ip and op options for specified layers.
Statistics dumping options:
-report_type "<type>" Optional. Enable collecting statistics report. "no_counters" report contains configuration options specified, resulting FPS and latency. "average_counters" report extends "no_counters" report and additionally includes average PM counters values for each layer from the network. "detailed_counters" report extends "average_counters" report and additionally includes per-layer PM counters and latency for each executed infer request.
-report_type "<type>" Optional. Enable collecting statistics report. "no_counters" report contains configuration options specified, resulting FPS and latency.
"average_counters" report extends "no_counters" report and additionally includes average PM counters values for each layer from the network.
"detailed_counters" report extends "average_counters" report and additionally includes per-layer PM counters
and latency for each executed infer request.
-report_folder Optional. Path to a folder where statistics report is stored.
-exec_graph_path Optional. Path to a file where to store executable graph information serialized.
-pc Optional. Report performance counters.
@ -181,33 +199,55 @@ This section provides step-by-step instructions on how to run the Benchmark Tool
The application outputs the number of executed iterations, total duration of execution, latency, and throughput.
Additionally, if you set the `-report_type` parameter, the application outputs statistics report. If you set the `-pc` parameter, the application outputs performance counters. If you set `-exec_graph_path`, the application reports executable graph information serialized. All measurements including per-layer PM counters are reported in milliseconds.
Below are fragments of sample output for CPU and GPU devices:
Below are fragments of sample output for static and dynamic networks:
* For CPU:
* For static network:
```
[Step 8/9] Measuring performance (Start inference asynchronously, 60000 ms duration, 4 inference requests in parallel using 4 streams)
Progress: [....................] 100.00% done
[Step 9/9] Dumping statistics report
[ INFO ] Statistics collecting was not requested. No reports are dumped.
Progress: [....................] 100.00% done
Count: 4612 iterations
Duration: 60110.04 ms
Latency: 50.99 ms
Throughput: 76.73 FPS
```
* For GPU:
```
[Step 10/11] Measuring performance (Start inference asynchronously, 5 inference requests using 4 streams for CPU, limits: 120000 ms duration)
Progress: [....................] 100% done
[Step 10/11] Measuring performance (Start inference asynchronously, 4 inference requests using 4 streams for CPU, limits: 60000 ms duration)
[ INFO ] BENCHMARK IS IN INFERENCE ONLY MODE.
[ INFO ] Input blobs will be filled once before performance measurements.
[ INFO ] First inference took 26.26 ms
Progress: [................... ] 99% done
[Step 11/11] Dumping statistics report
Count: 102515 iterations
Duration: 120007.38 ms
Latency: 5.84 ms
Throughput: 854.24 FP
[ INFO ] Count: 6640 iterations
[ INFO ] Duration: 60039.70 ms
[ INFO ] Latency:
[ INFO ] Median: 35.36 ms
[ INFO ] Avg: 36.12 ms
[ INFO ] Min: 18.55 ms
[ INFO ] Max: 88.96 ms
[ INFO ] Throughput: 110.59 FPS
```
* For dynamic network:
```
[Step 10/11] Measuring performance (Start inference asynchronously, 4 inference requests using 4 streams for CPU, limits: 60000 ms duration)
[ INFO ] BENCHMARK IS IN FULL MODE.
[ INFO ] Inputs setup stage will be included in performance measurements.
[ INFO ] First inference took 26.80 ms
Progress: [................... ] 99% done
[Step 11/11] Dumping statistics report
[ INFO ] Count: 5199 iterations
[ INFO ] Duration: 60043.34 ms
[ INFO ] Latency:
[ INFO ] Median: 41.58 ms
[ INFO ] Avg: 46.07 ms
[ INFO ] Min: 8.44 ms
[ INFO ] Max: 115.65 ms
[ INFO ] Latency for each data shape group:
[ INFO ] 1. data : [1, 3, 224, 224]
[ INFO ] Median: 38.37 ms
[ INFO ] Avg: 30.29 ms
[ INFO ] Min: 8.44 ms
[ INFO ] Max: 61.30 ms
[ INFO ] 2. data : [1, 3, 448, 448]
[ INFO ] Median: 68.21 ms
[ INFO ] Avg: 61.85 ms
[ INFO ] Min: 29.58 ms
[ INFO ] Max: 115.65 ms
[ INFO ] Throughput: 86.59 FPS
```
## See Also

View File

@ -19,7 +19,12 @@ static const char help_message[] = "Print a usage message";
/// @brief message for images argument
static const char input_message[] =
"Optional. Path to a folder with images and/or binaries or to specific image or binary file.";
"Optional. Path to a folder with images and/or binaries or to specific image or binary file.\n"
" In case of dynamic shapes networks with several inputs provide the same number"
" of files for each input (except cases with single file for any input):"
"\"input1:1.jpg input2:1.bin\", \"input1:1.bin,2.bin input2:3.bin input3:4.bin,5.bin \"."
" Also you can pass specific keys for inputs: \"random\" - for fillling input with random data,"
" \"image_info\" - for filling input with image size.";
/// @brief message for model argument
static const char model_message[] =
@ -136,6 +141,9 @@ static const char progress_message[] =
// @brief message for performance counters option
static const char pc_message[] = "Optional. Report performance counters.";
// @brief message for performance counters for sequence option
static const char pcseq_message[] = "Optional. Report latencies for each shape in -data_shape sequence.";
#ifdef HAVE_DEVICE_MEM_SUPPORT
// @brief message for switching memory allocation type option
static const char use_device_mem_message[] =
@ -155,9 +163,19 @@ static const char dump_config_message[] =
#endif
static const char shape_message[] =
"Optional. Set shape for input. For example, \"input1[1,3,224,224],input2[1,4]\" or "
"\"[1,3,224,224]\""
" in case of one input size.";
"Optional. Set shape for network input. For example, \"input1[1,3,224,224],input2[1,4]\" or \"[1,3,224,224]\""
" in case of one input size. This parameter affect model input shape and can be dynamic."
" For dynamic dimensions use symbol `?` or '-1'. Ex. [?,3,?,?]."
" For bounded dimensions specify range 'min..max'. Ex. [1..10,3,?,?].";
static const char data_shape_message[] =
" Required for networks with dynamic shapes. Set shape for input blobs."
" In case of one input size: \"[1,3,224,224]\" or \"input1[1,3,224,224],input2[1,4]\"."
" In case of several input sizes provide the same number for each input (except cases with single shape for any "
"input):"
" \"[1,3,128,128][3,3,128,128][1,3,320,320]\", \"input1[1,1,128,128][1,1,256,256],input2[80,1]\""
" or \"input1[1,192][1,384],input2[1,192][1,384],input3[1,192][1,384],input4[1,192][1,384]\"."
" If network shapes are all static specifying the option will cause an exception.";
static const char layout_message[] =
"Optional. Prompts how network layouts should be treated by application. "
@ -196,6 +214,13 @@ static constexpr char input_image_mean_message[] =
"Values to be provided in the [R, G, B] format. Can be defined for desired input of the model,\n"
"Example: -imean data[255,255,255],info[255,255,255]\n";
static constexpr char inference_only_message[] =
"Optional. Measure only inference stage. Default option for static models. Dynamic models"
" are measured in full mode which includes inputs setup stage,"
" inference only mode available for them with single input data shape only."
" To enable full mode for static models pass \"false\" value to this argument:"
" ex. \"-inference_only=false\".\n";
/// @brief Define flag for showing help message <br>
DEFINE_bool(h, false, help_message);
@ -276,6 +301,9 @@ DEFINE_bool(progress, false, progress_message);
/// @brief Define flag for showing performance counters <br>
DEFINE_bool(pc, false, pc_message);
/// @brief Define flag for showing performance sequence counters <br>
DEFINE_bool(pcseq, false, pcseq_message);
#ifdef HAVE_DEVICE_MEM_SUPPORT
/// @brief Define flag for switching between host and device memory allocation for input and output buffers
DEFINE_bool(use_device_mem, false, use_device_mem_message);
@ -292,6 +320,9 @@ DEFINE_string(dump_config, "", dump_config_message);
/// @brief Define flag for input shape <br>
DEFINE_string(shape, "", shape_message);
/// @brief Define flag for input blob shape <br>
DEFINE_string(data_shape, "", data_shape_message);
/// @brief Define flag for layout shape <br>
DEFINE_string(layout, "", layout_message);
@ -322,6 +353,9 @@ DEFINE_string(iscale, "", input_image_scale_message);
/// @brief Define flag for using input image mean <br>
DEFINE_string(imean, "", input_image_mean_message);
/// @brief Define flag for inference only mode <br>
DEFINE_bool(inference_only, true, inference_only_message);
/**
* @brief This function show a help message
*/
@ -346,8 +380,9 @@ static void showUsage() {
std::cout << " -t " << execution_time_message << std::endl;
std::cout << " -progress " << progress_message << std::endl;
std::cout << " -shape " << shape_message << std::endl;
std::cout << " -data_shape " << data_shape_message << std::endl;
std::cout << " -layout " << layout_message << std::endl;
std::cout << " -cache_dir \"<path>\" " << cache_dir_message << std::endl;
std::cout << " -cache_dir \"<path>\" " << cache_dir_message << std::endl;
std::cout << " -load_from_file " << load_from_file_message << std::endl;
std::cout << " -latency_percentile " << infer_latency_percentile_message << std::endl;
std::cout << std::endl << " device-specific performance options:" << std::endl;
@ -363,6 +398,7 @@ static void showUsage() {
std::cout << " -report_folder " << report_folder_message << std::endl;
std::cout << " -exec_graph_path " << exec_graph_path_message << std::endl;
std::cout << " -pc " << pc_message << std::endl;
std::cout << " -pcseq " << pcseq_message << std::endl;
#ifdef USE_OPENCV
std::cout << " -dump_config " << dump_config_message << std::endl;
std::cout << " -load_config " << load_config_message << std::endl;
@ -373,4 +409,5 @@ static void showUsage() {
std::cout << " -iop \"<value>\" " << iop_message << std::endl;
std::cout << " -iscale " << input_image_scale_message << std::endl;
std::cout << " -imean " << input_image_mean_message << std::endl;
std::cout << " -inference_only " << inference_only_message << std::endl;
}

View File

@ -18,13 +18,12 @@
// clang-format off
#include "inference_engine.hpp"
#include "remote_blobs_filling.hpp"
#include "statistics_report.hpp"
#include "utils.hpp"
// clang-format on
typedef std::chrono::high_resolution_clock Time;
typedef std::chrono::nanoseconds ns;
typedef std::function<void(size_t id, const double latency)> QueueCallbackFunction;
typedef std::function<void(size_t id, size_t group_id, const double latency)> QueueCallbackFunction;
/// @brief Wrapper class for InferenceEngine::InferRequest. Handles asynchronous callbacks and calculates execution
/// time.
@ -37,10 +36,12 @@ public:
explicit InferReqWrap(InferenceEngine::ExecutableNetwork& net, size_t id, QueueCallbackFunction callbackQueue)
: _request(net.CreateInferRequest()),
_id(id),
_callbackQueue(callbackQueue) {
_lat_group_id(0),
_callbackQueue(callbackQueue),
outputClBuffer() {
_request.SetCompletionCallback([&]() {
_endTime = Time::now();
_callbackQueue(_id, getExecutionTimeInMilliseconds());
_callbackQueue(_id, _lat_group_id, getExecutionTimeInMilliseconds());
});
}
@ -57,7 +58,7 @@ public:
_startTime = Time::now();
_request.Infer();
_endTime = Time::now();
_callbackQueue(_id, getExecutionTimeInMilliseconds());
_callbackQueue(_id, _lat_group_id, getExecutionTimeInMilliseconds());
}
std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> getPerformanceCounts() {
@ -77,26 +78,48 @@ public:
return static_cast<double>(execTime.count()) * 0.000001;
}
void setLatencyGroupId(size_t id) {
_lat_group_id = id;
}
// In case of using GPU memory we need to allocate a CL buffer for
// output blobs. By encapsulating the CL buffer inside InferReqWrap
// we control the number of output buffers and access to them.
std::map<std::string, ::gpu::BufferType>& getOutputClBuffer() {
return outputClBuffer;
}
private:
InferenceEngine::InferRequest _request;
Time::time_point _startTime;
Time::time_point _endTime;
size_t _id;
size_t _lat_group_id;
QueueCallbackFunction _callbackQueue;
std::map<std::string, ::gpu::BufferType> outputClBuffer;
};
class InferRequestsQueue final {
public:
InferRequestsQueue(InferenceEngine::ExecutableNetwork& net, size_t nireq) {
InferRequestsQueue(InferenceEngine::ExecutableNetwork& net,
size_t nireq,
size_t lat_group_n,
bool enable_lat_groups)
: enable_lat_groups(enable_lat_groups) {
for (size_t id = 0; id < nireq; id++) {
requests.push_back(std::make_shared<InferReqWrap>(
net,
id,
std::bind(&InferRequestsQueue::putIdleRequest, this, std::placeholders::_1, std::placeholders::_2)));
requests.push_back(std::make_shared<InferReqWrap>(net,
id,
std::bind(&InferRequestsQueue::putIdleRequest,
this,
std::placeholders::_1,
std::placeholders::_2,
std::placeholders::_3)));
_idleIds.push(id);
}
_latency_groups.resize(lat_group_n);
resetTimes();
}
~InferRequestsQueue() {
// Inference Request guarantee that it will wait for all asynchronous internal tasks in destructor
// So it should be released before any context that the request can use inside internal asynchronous tasks
@ -111,15 +134,21 @@ public:
_startTime = Time::time_point::max();
_endTime = Time::time_point::min();
_latencies.clear();
for (auto& group : _latency_groups) {
group.clear();
}
}
double getDurationInMilliseconds() {
return std::chrono::duration_cast<ns>(_endTime - _startTime).count() * 0.000001;
}
void putIdleRequest(size_t id, const double latency) {
void putIdleRequest(size_t id, size_t lat_group_id, const double latency) {
std::unique_lock<std::mutex> lock(_mutex);
_latencies.push_back(latency);
if (enable_lat_groups) {
_latency_groups[lat_group_id].push_back(latency);
}
_idleIds.push(id);
_endTime = std::max(Time::now(), _endTime);
_cv.notify_one();
@ -147,6 +176,10 @@ public:
return _latencies;
}
std::vector<std::vector<double>> getLatencyGroups() {
return _latency_groups;
}
std::vector<InferReqWrap::Ptr> requests;
private:
@ -156,4 +189,6 @@ private:
Time::time_point _startTime;
Time::time_point _endTime;
std::vector<double> _latencies;
std::vector<std::vector<double>> _latency_groups;
bool enable_lat_groups;
};
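Below is a toy sketch (illustration only, not benchmark_app code) of the per-group latency accounting introduced above: each request is tagged with a latency group id corresponding to one -data_shape entry, putIdleRequest appends the measured latency to that group, and the report (enabled by -pcseq) can then print per-group statistics such as the median for each data shape group.

```
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

// Toy median: middle element of the sorted sample (illustration only).
static double median(std::vector<double> v) {
    if (v.empty()) return 0.0;
    std::sort(v.begin(), v.end());
    return v[v.size() / 2];
}

int main() {
    // One bucket per -data_shape group; the shapes below are hypothetical.
    std::vector<std::vector<double>> latency_groups(2);
    latency_groups[0] = {38.4, 30.1, 8.4};    // e.g. data : [1, 3, 224, 224]
    latency_groups[1] = {68.2, 61.9, 115.7};  // e.g. data : [1, 3, 448, 448]
    for (std::size_t g = 0; g < latency_groups.size(); ++g)
        std::printf("group %zu: median %.2f ms over %zu runs\n",
                    g, median(latency_groups[g]), latency_groups[g].size());
    return 0;
}
```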

View File

@ -3,7 +3,10 @@
//
#include <algorithm>
#include <fstream>
#include <iomanip>
#include <memory>
#include <random>
#include <string>
#include <utility>
#include <vector>
@ -13,6 +16,8 @@
#include "format_reader_ptr.h"
#include "inputs_filling.hpp"
#include "shared_blob_allocator.hpp"
#include "utils.hpp"
// clang-format on
using namespace InferenceEngine;
@ -42,207 +47,464 @@ std::vector<std::string> filterFilesByExtensions(const std::vector<std::string>&
return filtered;
}
template <typename T>
void fillBlobImage(Blob::Ptr& inputBlob,
const std::vector<std::string>& filePaths,
const size_t& batchSize,
const benchmark_app::InputInfo& app_info,
const size_t& requestId,
const size_t& inputId,
const size_t& inputSize) {
MemoryBlob::Ptr minput = as<MemoryBlob>(inputBlob);
if (!minput) {
IE_THROW() << "We expect inputBlob to be inherited from MemoryBlob in "
"fillBlobImage, "
<< "but by fact we were not able to cast inputBlob to MemoryBlob";
}
// locked memory holder should be alive all time while access to its buffer
// happens
auto minputHolder = minput->wmap();
auto inputBlobData = minputHolder.as<T*>();
/** Collect images data ptrs **/
std::vector<std::shared_ptr<uint8_t>> vreader;
vreader.reserve(batchSize);
for (size_t i = 0ULL, inputIndex = requestId * batchSize * inputSize + inputId; i < batchSize;
i++, inputIndex += inputSize) {
inputIndex %= filePaths.size();
slog::info << "Prepare image " << filePaths[inputIndex] << slog::endl;
FormatReader::ReaderPtr reader(filePaths[inputIndex].c_str());
if (reader.get() == nullptr) {
slog::warn << "Image " << filePaths[inputIndex] << " cannot be read!" << slog::endl << slog::endl;
continue;
}
/** Getting image data **/
std::shared_ptr<uint8_t> imageData(reader->getData(app_info.width(), app_info.height()));
if (imageData) {
vreader.push_back(imageData);
}
}
/** Fill input tensor with images. First b channel, then g and r channels **/
const size_t numChannels = app_info.channels();
const size_t width = app_info.width();
const size_t height = app_info.height();
/** Iterate over all input images **/
for (size_t imageId = 0; imageId < vreader.size(); ++imageId) {
/** Iterate over all width **/
for (size_t w = 0; w < app_info.width(); ++w) {
/** Iterate over all height **/
for (size_t h = 0; h < app_info.height(); ++h) {
/** Iterate over all channels **/
for (size_t ch = 0; ch < numChannels; ++ch) {
/** [images stride + channels stride + pixel id ] all in
* bytes **/
size_t offset = imageId * numChannels * width * height +
(((app_info.layout == "NCHW") || (app_info.layout == "CHW"))
? (ch * width * height + h * width + w)
: (h * width * numChannels + w * numChannels + ch));
inputBlobData[offset] =
(static_cast<T>(vreader.at(imageId).get()[h * width * numChannels + w * numChannels + ch]) -
static_cast<T>(app_info.mean[ch])) /
static_cast<T>(app_info.scale[ch]);
}
}
}
}
}
template <typename T>
void fillBlobBinary(Blob::Ptr& inputBlob,
const std::vector<std::string>& filePaths,
const size_t& batchSize,
const size_t& requestId,
const size_t& inputId,
const size_t& inputSize) {
MemoryBlob::Ptr minput = as<MemoryBlob>(inputBlob);
auto adjBatchSize = batchSize;
// Check layout
std::stringstream ss;
auto tensorDesc = inputBlob->getTensorDesc();
ss << tensorDesc.getLayout();
auto layout = ss.str();
std::size_t batchIndex = layout.find("N");
if (batchIndex == std::string::npos) {
adjBatchSize = 1;
} else if (tensorDesc.getDims().at(batchIndex) != batchSize) {
adjBatchSize = tensorDesc.getDims().at(batchIndex);
}
if (!minput) {
IE_THROW() << "We expect inputBlob to be inherited from MemoryBlob in "
"fillBlobBinary, "
<< "but by fact we were not able to cast inputBlob to MemoryBlob";
}
// locked memory holder should be alive all time while access to its buffer
// happens
auto minputHolder = minput->wmap();
auto inputBlobData = minputHolder.as<char*>();
for (size_t i = 0ULL, inputIndex = requestId * adjBatchSize * inputSize + inputId; i < adjBatchSize;
i++, inputIndex += inputSize) {
inputIndex %= filePaths.size();
slog::info << "Prepare binary file " << filePaths[inputIndex] << slog::endl;
std::ifstream binaryFile(filePaths[inputIndex], std::ios_base::binary | std::ios_base::ate);
if (!binaryFile) {
IE_THROW() << "Cannot open " << filePaths[inputIndex];
}
auto fileSize = static_cast<std::size_t>(binaryFile.tellg());
binaryFile.seekg(0, std::ios_base::beg);
if (!binaryFile.good()) {
IE_THROW() << "Can not read " << filePaths[inputIndex];
}
auto inputSize = inputBlob->size() * sizeof(T) / adjBatchSize;
if (fileSize != inputSize) {
IE_THROW() << "File " << filePaths[inputIndex] << " contains " << std::to_string(fileSize)
<< " bytes "
"but the network expects "
<< std::to_string(inputSize);
}
binaryFile.read(&inputBlobData[i * inputSize], inputSize);
}
}
template <typename T>
using uniformDistribution = typename std::conditional<
std::is_floating_point<T>::value,
std::uniform_real_distribution<T>,
typename std::conditional<std::is_integral<T>::value, std::uniform_int_distribution<T>, void>::type>::type;
template <typename T, typename T2>
void fillBlobRandom(Blob::Ptr& inputBlob,
T rand_min = std::numeric_limits<uint8_t>::min(),
T rand_max = std::numeric_limits<uint8_t>::max()) {
MemoryBlob::Ptr minput = as<MemoryBlob>(inputBlob);
if (!minput) {
IE_THROW() << "We expect inputBlob to be inherited from MemoryBlob in "
"fillBlobRandom, "
<< "but by fact we were not able to cast inputBlob to MemoryBlob";
}
// locked memory holder should be alive all time while access to its buffer
// happens
auto minputHolder = minput->wmap();
template <typename T>
InferenceEngine::Blob::Ptr createBlobFromImage(const std::vector<std::string>& files,
size_t inputId,
size_t batchSize,
const benchmark_app::InputInfo& inputInfo,
std::string* filenames_used = nullptr) {
size_t blob_size =
std::accumulate(inputInfo.dataShape.begin(), inputInfo.dataShape.end(), 1, std::multiplies<int>());
T* data = new T[blob_size];
auto inputBlobData = minputHolder.as<T*>();
std::mt19937 gen(0);
uniformDistribution<T2> distribution(rand_min, rand_max);
for (size_t i = 0; i < inputBlob->size(); i++) {
inputBlobData[i] = static_cast<T>(distribution(gen));
/** Collect images data ptrs **/
std::vector<std::shared_ptr<uint8_t>> vreader;
vreader.reserve(batchSize);
for (size_t b = 0; b < batchSize; ++b) {
auto inputIndex = (inputId + b) % files.size();
if (filenames_used) {
*filenames_used += (filenames_used->empty() ? "" : ", ") + files[inputIndex];
}
FormatReader::ReaderPtr reader(files[inputIndex].c_str());
if (reader.get() == nullptr) {
slog::warn << "Image " << files[inputIndex] << " cannot be read!" << slog::endl << slog::endl;
continue;
}
/** Getting image data **/
std::shared_ptr<uint8_t> imageData(reader->getData(inputInfo.width(), inputInfo.height()));
if (imageData) {
vreader.push_back(imageData);
}
}
/** Fill input tensor with image. First b channel, then g and r channels **/
const size_t numChannels = inputInfo.channels();
const size_t width = inputInfo.width();
const size_t height = inputInfo.height();
/** Iterate over all input images **/
for (size_t b = 0; b < batchSize; ++b) {
/** Iterate over all width **/
for (size_t w = 0; w < width; ++w) {
/** Iterate over all height **/
for (size_t h = 0; h < height; ++h) {
/** Iterate over all channels **/
for (size_t ch = 0; ch < numChannels; ++ch) {
/** [images stride + channels stride + pixel id ] all in
* bytes **/
size_t offset = b * numChannels * width * height +
(((inputInfo.layout == "NCHW") || (inputInfo.layout == "CHW"))
? (ch * width * height + h * width + w)
: (h * width * numChannels + w * numChannels + ch));
data[offset] =
(static_cast<T>(vreader.at(b).get()[h * width * numChannels + w * numChannels + ch]) -
static_cast<T>(inputInfo.mean[ch])) /
static_cast<T>(inputInfo.scale[ch]);
}
}
}
}
InferenceEngine::TensorDesc tDesc(inputInfo.precision, inputInfo.dataShape, inputInfo.originalLayout);
auto blob =
InferenceEngine::make_shared_blob<T>(tDesc,
std::make_shared<SharedBlobAllocator<T>>(data, blob_size * sizeof(T)));
blob->allocate();
return blob;
}
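A standalone sketch (illustration only, not part of this file) of the per-pixel offset rule used in the nested loops of createBlobFromImage above: planar layouts (NCHW/CHW) store whole channel planes back to back, while the interleaved path stores all channels of a pixel together.

```
#include <cstddef>
#include <string>

// Offset of pixel (h, w), channel ch, image b for a C x H x W image.
static std::size_t pixel_offset(const std::string& layout,
                                std::size_t b, std::size_t ch,
                                std::size_t h, std::size_t w,
                                std::size_t C, std::size_t H, std::size_t W) {
    const std::size_t image_stride = b * C * H * W;
    if (layout == "NCHW" || layout == "CHW")
        return image_stride + ch * H * W + h * W + w;  // planar: channel-major
    return image_stride + (h * W + w) * C + ch;        // interleaved: channel-minor
}

// Example: C = 3, H = W = 224, b = 0, ch = 1, h = 1, w = 2:
//   planar      -> 1*224*224 + 1*224 + 2 = 50402
//   interleaved -> (1*224 + 2)*3 + 1     = 679
```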
template <typename T>
void fillBlobImInfo(Blob::Ptr& inputBlob, const size_t& batchSize, std::pair<size_t, size_t> image_size) {
MemoryBlob::Ptr minput = as<MemoryBlob>(inputBlob);
if (!minput) {
IE_THROW() << "We expect inputBlob to be inherited from MemoryBlob in "
"fillBlobImInfo, "
<< "but by fact we were not able to cast inputBlob to MemoryBlob";
}
// locked memory holder should be alive all time while access to its buffer
// happens
auto minputHolder = minput->wmap();
InferenceEngine::Blob::Ptr createBlobImInfo(const std::pair<size_t, size_t>& image_size,
size_t batchSize,
const benchmark_app::InputInfo& inputInfo) {
size_t blob_size =
std::accumulate(inputInfo.dataShape.begin(), inputInfo.dataShape.end(), 1, std::multiplies<int>());
T* data = new T[blob_size];
auto inputBlobData = minputHolder.as<T*>();
for (size_t b = 0; b < batchSize; b++) {
size_t iminfoSize = inputBlob->size() / batchSize;
size_t iminfoSize = blob_size / batchSize;
for (size_t i = 0; i < iminfoSize; i++) {
size_t index = b * iminfoSize + i;
if (0 == i)
inputBlobData[index] = static_cast<T>(image_size.first);
data[index] = static_cast<T>(image_size.first);
else if (1 == i)
inputBlobData[index] = static_cast<T>(image_size.second);
data[index] = static_cast<T>(image_size.second);
else
inputBlobData[index] = 1;
data[index] = 1;
}
}
InferenceEngine::TensorDesc tDesc(inputInfo.precision, inputInfo.dataShape, inputInfo.originalLayout);
InferenceEngine::Blob::Ptr blob =
InferenceEngine::make_shared_blob<T>(tDesc,
std::make_shared<SharedBlobAllocator<T>>(data, blob_size * sizeof(T)));
blob->allocate();
return blob;
}
template <typename T>
InferenceEngine::Blob::Ptr createBlobFromBinary(const std::vector<std::string>& files,
size_t inputId,
size_t batchSize,
const benchmark_app::InputInfo& inputInfo,
std::string* filenames_used = nullptr) {
size_t blob_size =
std::accumulate(inputInfo.dataShape.begin(), inputInfo.dataShape.end(), 1, std::multiplies<int>());
char* data = new char[blob_size * sizeof(T)];
// adjust batch size
std::stringstream ss;
ss << inputInfo.originalLayout;
std::string layout = ss.str();
if (layout.find("N") == std::string::npos) {
batchSize = 1;
} else if (inputInfo.batch() != batchSize) {
batchSize = inputInfo.batch();
}
for (size_t b = 0; b < batchSize; ++b) {
size_t inputIndex = (inputId + b) % files.size();
std::ifstream binaryFile(files[inputIndex], std::ios_base::binary | std::ios_base::ate);
if (!binaryFile) {
IE_THROW() << "Cannot open " << files[inputIndex];
}
auto fileSize = static_cast<std::size_t>(binaryFile.tellg());
binaryFile.seekg(0, std::ios_base::beg);
if (!binaryFile.good()) {
IE_THROW() << "Can not read " << files[inputIndex];
}
auto inputSize = blob_size * sizeof(T) / batchSize;
if (fileSize != inputSize) {
IE_THROW() << "File " << files[inputIndex] << " contains " << std::to_string(fileSize)
<< " bytes "
"but the network expects "
<< std::to_string(inputSize);
}
if (inputInfo.layout != "CN") {
binaryFile.read(&data[b * inputSize], inputSize);
} else {
for (int i = 0; i < inputInfo.channels(); i++) {
binaryFile.read(&data[(i * batchSize + b) * sizeof(T)], sizeof(T));
}
}
if (filenames_used) {
*filenames_used += (filenames_used->empty() ? "" : ", ") + files[inputIndex];
}
}
InferenceEngine::TensorDesc tDesc(inputInfo.precision, inputInfo.dataShape, inputInfo.originalLayout);
InferenceEngine::Blob::Ptr blob =
InferenceEngine::make_shared_blob<T>(tDesc,
std::make_shared<SharedBlobAllocator<T>>((T*)data, blob_size * sizeof(T)));
blob->allocate();
return blob;
}
template <typename T, typename T2>
InferenceEngine::Blob::Ptr createBlobRandom(const benchmark_app::InputInfo& inputInfo,
T rand_min = std::numeric_limits<uint8_t>::min(),
T rand_max = std::numeric_limits<uint8_t>::max()) {
size_t blob_size =
std::accumulate(inputInfo.dataShape.begin(), inputInfo.dataShape.end(), 1, std::multiplies<int>());
T* data = new T[blob_size];
std::mt19937 gen(0);
uniformDistribution<T2> distribution(rand_min, rand_max);
for (size_t i = 0; i < blob_size; i++) {
data[i] = static_cast<T>(distribution(gen));
}
InferenceEngine::TensorDesc tDesc(inputInfo.precision, inputInfo.dataShape, inputInfo.originalLayout);
InferenceEngine::Blob::Ptr blob =
InferenceEngine::make_shared_blob<T>(tDesc,
std::make_shared<SharedBlobAllocator<T>>(data, blob_size * sizeof(T)));
blob->allocate();
return blob;
}
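// Note: the generator above is seeded with a constant (std::mt19937 gen(0)), so the "random"
// blobs are reproducible across runs; for 8-bit precisions the callers below pass a wider
// distribution type because uniform_int_distribution over char types is not permitted by the
// standard.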
InferenceEngine::Blob::Ptr getImageBlob(const std::vector<std::string>& files,
size_t inputId,
size_t batchSize,
const std::pair<std::string, benchmark_app::InputInfo>& inputInfo,
std::string* filenames_used = nullptr) {
auto precision = inputInfo.second.precision;
if (precision == InferenceEngine::Precision::FP32) {
return createBlobFromImage<float>(files, inputId, batchSize, inputInfo.second, filenames_used);
} else if (precision == InferenceEngine::Precision::FP16) {
return createBlobFromImage<short>(files, inputId, batchSize, inputInfo.second, filenames_used);
} else if (precision == InferenceEngine::Precision::I32) {
return createBlobFromImage<int32_t>(files, inputId, batchSize, inputInfo.second, filenames_used);
} else if (precision == InferenceEngine::Precision::I64) {
return createBlobFromImage<int64_t>(files, inputId, batchSize, inputInfo.second, filenames_used);
} else if (precision == InferenceEngine::Precision::U8) {
return createBlobFromImage<uint8_t>(files, inputId, batchSize, inputInfo.second, filenames_used);
} else {
IE_THROW() << "Input precision is not supported for " << inputInfo.first;
}
}
void fillBlobs(const std::vector<std::string>& inputFiles,
const size_t& batchSize,
benchmark_app::InputsInfo& app_inputs_info,
std::vector<InferReqWrap::Ptr> requests) {
std::vector<std::pair<size_t, size_t>> input_image_sizes;
for (auto& item : app_inputs_info) {
if (item.second.isImage()) {
input_image_sizes.push_back(std::make_pair(item.second.width(), item.second.height()));
}
slog::info << "Network input '" << item.first << "' precision " << item.second.precision << ", dimensions ("
<< item.second.layout << "): ";
for (const auto& i : item.second.shape) {
slog::info << i << " ";
}
slog::info << slog::endl;
InferenceEngine::Blob::Ptr getImInfoBlob(const std::pair<size_t, size_t>& image_size,
size_t batchSize,
const std::pair<std::string, benchmark_app::InputInfo>& inputInfo) {
auto precision = inputInfo.second.precision;
if (precision == InferenceEngine::Precision::FP32) {
return createBlobImInfo<float>(image_size, batchSize, inputInfo.second);
} else if (precision == InferenceEngine::Precision::FP16) {
return createBlobImInfo<short>(image_size, batchSize, inputInfo.second);
} else if (precision == InferenceEngine::Precision::I32) {
return createBlobImInfo<int32_t>(image_size, batchSize, inputInfo.second);
} else if (precision == InferenceEngine::Precision::I64) {
return createBlobImInfo<int64_t>(image_size, batchSize, inputInfo.second);
} else {
IE_THROW() << "Input precision is not supported for " << inputInfo.first;
}
}
InferenceEngine::Blob::Ptr getBinaryBlob(const std::vector<std::string>& files,
size_t inputId,
size_t batchSize,
const std::pair<std::string, benchmark_app::InputInfo>& inputInfo,
std::string* filenames_used = nullptr) {
auto precision = inputInfo.second.precision;
if (precision == InferenceEngine::Precision::FP32) {
return createBlobFromBinary<float>(files, inputId, batchSize, inputInfo.second, filenames_used);
} else if (precision == InferenceEngine::Precision::FP16) {
return createBlobFromBinary<short>(files, inputId, batchSize, inputInfo.second, filenames_used);
} else if (precision == InferenceEngine::Precision::I32) {
return createBlobFromBinary<int32_t>(files, inputId, batchSize, inputInfo.second, filenames_used);
} else if (precision == InferenceEngine::Precision::I64) {
return createBlobFromBinary<int64_t>(files, inputId, batchSize, inputInfo.second, filenames_used);
} else if ((precision == InferenceEngine::Precision::U8) || (precision == InferenceEngine::Precision::BOOL)) {
return createBlobFromBinary<uint8_t>(files, inputId, batchSize, inputInfo.second, filenames_used);
} else {
IE_THROW() << "Input precision is not supported for " << inputInfo.first;
}
}
InferenceEngine::Blob::Ptr getRandomBlob(const std::pair<std::string, benchmark_app::InputInfo>& inputInfo) {
auto precision = inputInfo.second.precision;
if (precision == InferenceEngine::Precision::FP32) {
return createBlobRandom<float, float>(inputInfo.second);
} else if (precision == InferenceEngine::Precision::FP16) {
return createBlobRandom<short, short>(inputInfo.second);
} else if (precision == InferenceEngine::Precision::I32) {
return createBlobRandom<int32_t, int32_t>(inputInfo.second);
} else if (precision == InferenceEngine::Precision::I64) {
return createBlobRandom<int64_t, int64_t>(inputInfo.second);
} else if (precision == InferenceEngine::Precision::U8) {
        // uniform_int_distribution<uint8_t> is not allowed by the C++17
        // standard and VS2017/19
return createBlobRandom<uint8_t, uint32_t>(inputInfo.second);
} else if (precision == InferenceEngine::Precision::I8) {
        // uniform_int_distribution<int8_t> is not allowed by the C++17 standard
        // and VS2017/19
return createBlobRandom<int8_t, int32_t>(inputInfo.second);
} else if (precision == InferenceEngine::Precision::U16) {
return createBlobRandom<uint16_t, uint16_t>(inputInfo.second);
} else if (precision == InferenceEngine::Precision::I16) {
return createBlobRandom<int16_t, int16_t>(inputInfo.second);
} else if (precision == InferenceEngine::Precision::BOOL) {
return createBlobRandom<uint8_t, uint32_t>(inputInfo.second, 0, 1);
} else {
IE_THROW() << "Input precision is not supported for " << inputInfo.first;
}
}
std::string getTestInfoStreamHeader(benchmark_app::InputInfo& inputInfo) {
std::stringstream strOut;
strOut << "(" << inputInfo.layout << ", " << inputInfo.precision << ", " << getShapeString(inputInfo.dataShape)
<< ", ";
if (inputInfo.partialShape.is_dynamic()) {
strOut << std::string("dyn:") << inputInfo.partialShape << "):\t";
} else {
strOut << "static):\t";
}
return strOut.str();
}
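// Example header produced by the helper above (a sketch for a static NCHW/FP32 input):
//   "(NCHW, FP32, [1, 3, 224, 224], static):\t"
// For a dynamic input the partial shape is printed after "dyn:" instead of "static".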
std::map<std::string, std::vector<InferenceEngine::Blob::Ptr>> getBlobs(
std::map<std::string, std::vector<std::string>>& inputFiles,
std::vector<benchmark_app::InputsInfo>& app_inputs_info) {
std::map<std::string, std::vector<InferenceEngine::Blob::Ptr>> blobs;
if (app_inputs_info.empty()) {
throw std::logic_error("Inputs Info for network is empty!");
}
size_t imageInputCount = input_image_sizes.size();
size_t binaryInputCount = app_inputs_info.size() - imageInputCount;
if (!inputFiles.empty() && inputFiles.size() != app_inputs_info[0].size()) {
throw std::logic_error("Number of inputs specified in -i must be equal number of network inputs!");
}
// count image type inputs of network
std::vector<std::pair<size_t, size_t>> net_input_im_sizes;
for (auto& inputs_info : app_inputs_info) {
for (auto& input : inputs_info) {
if (input.second.isImage()) {
net_input_im_sizes.push_back(std::make_pair(input.second.width(), input.second.height()));
}
}
}
for (auto& files : inputFiles) {
if (!files.first.empty() && app_inputs_info[0].find(files.first) == app_inputs_info[0].end()) {
throw std::logic_error("Input name \"" + files.first +
"\" used in -i parameter doesn't match any network's input");
}
std::string input_name = files.first.empty() ? app_inputs_info[0].begin()->first : files.first;
auto input = app_inputs_info[0].at(input_name);
if (!files.second.empty() && files.second[0] != "random" && files.second[0] != "image_info") {
if (input.isImage()) {
files.second = filterFilesByExtensions(files.second, supported_image_extensions);
} else if (input.isImageInfo() && net_input_im_sizes.size() == app_inputs_info.size()) {
slog::info << "Input '" << input_name
<< "' probably is image info. All files for this input will"
" be ignored."
<< slog::endl;
files.second = {"image_info"};
continue;
} else {
files.second = filterFilesByExtensions(files.second, supported_binary_extensions);
}
}
if (files.second.empty()) {
slog::warn << "No suitable files for input found! Random data will be used for input " << input_name
<< slog::endl;
files.second = {"random"};
}
size_t filesToBeUsed = 0;
size_t shapesToBeUsed = 0;
if (files.second.size() > app_inputs_info.size()) {
shapesToBeUsed = app_inputs_info.size();
filesToBeUsed = files.second.size() - files.second.size() % app_inputs_info.size();
if (filesToBeUsed != files.second.size()) {
slog::warn << "Number of files must be a multiple of the number of shapes for certain input. Only " +
std::to_string(filesToBeUsed) + " files will be added."
<< slog::endl;
}
while (files.second.size() != filesToBeUsed) {
files.second.pop_back();
}
} else {
shapesToBeUsed = app_inputs_info.size() - app_inputs_info.size() % files.second.size();
filesToBeUsed = files.second.size();
if (shapesToBeUsed != app_inputs_info.size()) {
slog::warn << "Number of data shapes must be a multiple of the number of files. For input "
<< files.first << " only " + std::to_string(shapesToBeUsed) + " files will be added."
<< slog::endl;
}
while (app_inputs_info.size() != shapesToBeUsed) {
app_inputs_info.pop_back();
net_input_im_sizes.pop_back();
}
}
}
std::vector<std::map<std::string, std::string>> logOutput;
    // All inputs should process an equal number of files, so for the case of N, 1, N files,
    // the second input should also get N blobs cloned from its single file
size_t filesNum = 0;
if (!inputFiles.empty()) {
filesNum = std::max_element(inputFiles.begin(),
inputFiles.end(),
[](const std::pair<std::string, std::vector<std::string>>& a,
const std::pair<std::string, std::vector<std::string>>& b) {
return a.second.size() < b.second.size();
})
->second.size();
} else {
std::vector<std::pair<size_t, size_t>> net_input_im_sizes;
for (auto& input_info : app_inputs_info[0]) {
inputFiles[input_info.first] = {"random"};
}
}
for (const auto& files : inputFiles) {
std::string input_name = files.first.empty() ? app_inputs_info[0].begin()->first : files.first;
size_t n_shape = 0, m_file = 0;
while (n_shape < app_inputs_info.size() || m_file < filesNum) {
size_t batchSize = getBatchSize(app_inputs_info[n_shape % app_inputs_info.size()]);
size_t inputId = m_file % files.second.size();
auto input_info = app_inputs_info[n_shape % app_inputs_info.size()].at(input_name);
std::string blob_src_info;
if (files.second[0] == "random") {
// Fill random
blob_src_info =
"random (" + std::string((input_info.isImage() ? "image" : "binary data")) + " is expected)";
blobs[input_name].push_back(getRandomBlob({input_name, input_info}));
} else if (files.second[0] == "image_info") {
// Most likely it is image info: fill with image information
auto image_size = net_input_im_sizes.at(n_shape % app_inputs_info.size());
blob_src_info =
"Image size blob " + std::to_string(image_size.first) + " x " + std::to_string(image_size.second);
blobs[input_name].push_back(getImInfoBlob(image_size, batchSize, {input_name, input_info}));
} else if (input_info.isImage()) {
// Fill with Images
blobs[input_name].push_back(
getImageBlob(files.second, inputId, batchSize, {input_name, input_info}, &blob_src_info));
} else {
// Fill with binary files
blobs[input_name].push_back(
getBinaryBlob(files.second, inputId, batchSize, {input_name, input_info}, &blob_src_info));
}
// Preparing info
std::string strOut = getTestInfoStreamHeader(input_info) + blob_src_info;
if (n_shape >= logOutput.size()) {
logOutput.resize(n_shape + 1);
}
logOutput[n_shape][input_name] += strOut;
++n_shape;
m_file += batchSize;
}
}
for (int i = 0; i < logOutput.size(); i++) {
slog::info << "Test Config " << i << slog::endl;
auto maxNameWidth = std::max_element(logOutput[i].begin(),
logOutput[i].end(),
[](const std::pair<std::string, std::string>& a,
const std::pair<std::string, std::string>& b) {
return a.first.size() < b.first.size();
})
->first.size();
for (auto inputLog : logOutput[i]) {
slog::info << std::left << std::setw(maxNameWidth + 2) << inputLog.first << inputLog.second << slog::endl;
}
}
return blobs;
}
std::map<std::string, std::vector<InferenceEngine::Blob::Ptr>> getBlobsStaticCase(
const std::vector<std::string>& inputFiles,
const size_t& batchSize,
benchmark_app::InputsInfo& app_inputs_info,
size_t requestsNum) {
std::map<std::string, std::vector<InferenceEngine::Blob::Ptr>> blobs;
std::vector<std::pair<size_t, size_t>> net_input_im_sizes;
for (auto& item : app_inputs_info) {
if (item.second.isImage()) {
net_input_im_sizes.push_back(std::make_pair(item.second.width(), item.second.height()));
}
}
size_t imageInputsNum = net_input_im_sizes.size();
size_t binaryInputsNum = app_inputs_info.size() - imageInputsNum;
std::vector<std::string> binaryFiles;
std::vector<std::string> imageFiles;
@ -255,7 +517,7 @@ void fillBlobs(const std::vector<std::string>& inputFiles,
binaryFiles = filterFilesByExtensions(inputFiles, supported_binary_extensions);
std::sort(std::begin(binaryFiles), std::end(binaryFiles));
auto binaryToBeUsed = binaryInputCount * batchSize * requests.size();
auto binaryToBeUsed = binaryInputsNum * batchSize * requestsNum;
if (binaryToBeUsed > 0 && binaryFiles.empty()) {
std::stringstream ss;
for (auto& ext : supported_binary_extensions) {
@ -278,7 +540,7 @@ void fillBlobs(const std::vector<std::string>& inputFiles,
imageFiles = filterFilesByExtensions(inputFiles, supported_image_extensions);
std::sort(std::begin(imageFiles), std::end(imageFiles));
auto imagesToBeUsed = imageInputCount * batchSize * requests.size();
auto imagesToBeUsed = imageInputsNum * batchSize * requestsNum;
if (imagesToBeUsed > 0 && imageFiles.empty()) {
std::stringstream ss;
for (auto& ext : supported_image_extensions) {
@ -299,156 +561,129 @@ void fillBlobs(const std::vector<std::string>& inputFiles,
}
}
for (size_t requestId = 0; requestId < requests.size(); requestId++) {
slog::info << "Infer Request " << requestId << " filling" << slog::endl;
std::map<std::string, std::vector<std::string>> mappedFiles;
size_t imageInputsCount = 0;
size_t binaryInputsCount = 0;
for (auto& input : app_inputs_info) {
if (input.second.isImage()) {
mappedFiles[input.first] = {};
for (size_t i = 0; i < imageFiles.size(); i += imageInputsNum) {
mappedFiles[input.first].push_back(
imageFiles[(imageInputsCount + i) * imageInputsNum % imageFiles.size()]);
}
++imageInputsCount;
} else {
mappedFiles[input.first] = {};
if (!binaryFiles.empty()) {
for (size_t i = 0; i < binaryFiles.size(); i += binaryInputsNum) {
mappedFiles[input.first].push_back(binaryFiles[(binaryInputsCount + i) % binaryFiles.size()]);
}
}
++binaryInputsCount;
}
}
size_t filesNum = 0;
if (!inputFiles.empty()) {
filesNum = std::max_element(mappedFiles.begin(),
mappedFiles.end(),
[](const std::pair<std::string, std::vector<std::string>>& a,
const std::pair<std::string, std::vector<std::string>>& b) {
return a.second.size() < b.second.size();
})
->second.size();
}
size_t test_configs_num = filesNum / batchSize == 0 ? 1 : filesNum / batchSize;
std::vector<std::map<std::string, std::string>> logOutput(test_configs_num);
for (const auto& files : mappedFiles) {
size_t imageInputId = 0;
size_t binaryInputId = 0;
for (auto& item : app_inputs_info) {
Blob::Ptr inputBlob = requests.at(requestId)->getBlob(item.first);
auto app_info = app_inputs_info.at(item.first);
auto precision = app_info.precision;
if (app_info.isImage()) {
auto input_name = files.first;
auto input_info = app_inputs_info.at(files.first);
for (size_t i = 0; i < test_configs_num; ++i) {
std::string blob_src_info;
if (input_info.isImage()) {
if (!imageFiles.empty()) {
// Fill with Images
if (precision == InferenceEngine::Precision::FP32) {
fillBlobImage<float>(inputBlob,
imageFiles,
batchSize,
app_info,
requestId,
imageInputId++,
imageInputCount);
} else if (precision == InferenceEngine::Precision::FP16) {
fillBlobImage<short>(inputBlob,
imageFiles,
batchSize,
app_info,
requestId,
imageInputId++,
imageInputCount);
} else if (precision == InferenceEngine::Precision::I32) {
fillBlobImage<int32_t>(inputBlob,
imageFiles,
batchSize,
app_info,
requestId,
imageInputId++,
imageInputCount);
} else if (precision == InferenceEngine::Precision::I64) {
fillBlobImage<int64_t>(inputBlob,
imageFiles,
batchSize,
app_info,
requestId,
imageInputId++,
imageInputCount);
} else if (precision == InferenceEngine::Precision::U8) {
fillBlobImage<uint8_t>(inputBlob,
imageFiles,
batchSize,
app_info,
requestId,
imageInputId++,
imageInputCount);
} else {
IE_THROW() << "Input precision is not supported for " << item.first;
}
blobs[input_name].push_back(
getImageBlob(files.second, imageInputId, batchSize, {input_name, input_info}, &blob_src_info));
imageInputId = (imageInputId + batchSize) % files.second.size();
logOutput[i][input_name] += getTestInfoStreamHeader(input_info) + blob_src_info;
continue;
}
} else {
if (!binaryFiles.empty()) {
// Fill with binary files
if (precision == InferenceEngine::Precision::FP32) {
fillBlobBinary<float>(inputBlob,
binaryFiles,
batchSize,
requestId,
binaryInputId++,
binaryInputCount);
} else if (precision == InferenceEngine::Precision::FP16) {
fillBlobBinary<short>(inputBlob,
binaryFiles,
batchSize,
requestId,
binaryInputId++,
binaryInputCount);
} else if (precision == InferenceEngine::Precision::I32) {
fillBlobBinary<int32_t>(inputBlob,
binaryFiles,
batchSize,
requestId,
binaryInputId++,
binaryInputCount);
} else if (precision == InferenceEngine::Precision::I64) {
fillBlobBinary<int64_t>(inputBlob,
binaryFiles,
batchSize,
requestId,
binaryInputId++,
binaryInputCount);
} else if ((precision == InferenceEngine::Precision::U8) ||
(precision == InferenceEngine::Precision::BOOL)) {
fillBlobBinary<uint8_t>(inputBlob,
binaryFiles,
batchSize,
requestId,
binaryInputId++,
binaryInputCount);
} else {
IE_THROW() << "Input precision is not supported for " << item.first;
}
blobs[input_name].push_back(getBinaryBlob(files.second,
binaryInputId,
batchSize,
{input_name, input_info},
&blob_src_info));
binaryInputId = (binaryInputId + batchSize) % files.second.size();
logOutput[i][input_name] += getTestInfoStreamHeader(input_info) + blob_src_info;
continue;
}
if (app_info.isImageInfo() && (input_image_sizes.size() == 1)) {
if (input_info.isImageInfo() && (net_input_im_sizes.size() == 1)) {
// Most likely it is image info: fill with image information
auto image_size = input_image_sizes.at(0);
slog::info << "Fill input '" << item.first << "' with image size " << image_size.first << "x"
<< image_size.second << slog::endl;
if (precision == InferenceEngine::Precision::FP32) {
fillBlobImInfo<float>(inputBlob, batchSize, image_size);
} else if (precision == InferenceEngine::Precision::FP16) {
fillBlobImInfo<short>(inputBlob, batchSize, image_size);
} else if (precision == InferenceEngine::Precision::I32) {
fillBlobImInfo<int32_t>(inputBlob, batchSize, image_size);
} else if (precision == InferenceEngine::Precision::I64) {
fillBlobImInfo<int64_t>(inputBlob, batchSize, image_size);
} else {
IE_THROW() << "Input precision is not supported for image info!";
}
auto image_size = net_input_im_sizes.at(0);
blob_src_info = "Image size blob " + std::to_string(image_size.first) + " x " +
std::to_string(image_size.second);
blobs[input_name].push_back(getImInfoBlob(image_size, batchSize, {input_name, input_info}));
logOutput[i][input_name] += getTestInfoStreamHeader(input_info) + blob_src_info;
continue;
}
}
// Fill random
slog::info << "Fill input '" << item.first << "' with random values ("
<< std::string((app_info.isImage() ? "image" : "some binary data")) << " is expected)"
<< slog::endl;
if (precision == InferenceEngine::Precision::FP32) {
fillBlobRandom<float, float>(inputBlob);
} else if (precision == InferenceEngine::Precision::FP16) {
fillBlobRandom<short, short>(inputBlob);
} else if (precision == InferenceEngine::Precision::I32) {
fillBlobRandom<int32_t, int32_t>(inputBlob);
} else if (precision == InferenceEngine::Precision::I64) {
fillBlobRandom<int64_t, int64_t>(inputBlob);
} else if (precision == InferenceEngine::Precision::U8) {
// uniform_int_distribution<uint8_t> is not allowed in the C++17
// standard and vs2017/19
fillBlobRandom<uint8_t, uint32_t>(inputBlob);
} else if (precision == InferenceEngine::Precision::I8) {
// uniform_int_distribution<int8_t> is not allowed in the C++17 standard
// and vs2017/19
fillBlobRandom<int8_t, int32_t>(inputBlob);
} else if (precision == InferenceEngine::Precision::U16) {
fillBlobRandom<uint16_t, uint16_t>(inputBlob);
} else if (precision == InferenceEngine::Precision::I16) {
fillBlobRandom<int16_t, int16_t>(inputBlob);
} else if (precision == InferenceEngine::Precision::BOOL) {
fillBlobRandom<uint8_t, uint32_t>(inputBlob, 0, 1);
} else {
IE_THROW() << "Input precision is not supported for " << item.first;
}
blob_src_info =
"random (" + std::string((input_info.isImage() ? "image" : "binary data")) + " is expected)";
blobs[input_name].push_back(getRandomBlob({input_name, input_info}));
logOutput[i][input_name] += getTestInfoStreamHeader(input_info) + blob_src_info;
}
}
for (int i = 0; i < logOutput.size(); i++) {
slog::info << "Test Config " << i << slog::endl;
auto maxNameWidth = std::max_element(logOutput[i].begin(),
logOutput[i].end(),
[](const std::pair<std::string, std::string>& a,
const std::pair<std::string, std::string>& b) {
return a.first.size() < b.first.size();
})
->first.size();
for (auto inputLog : logOutput[i]) {
slog::info << std::left << std::setw(maxNameWidth + 2) << inputLog.first << inputLog.second << slog::endl;
}
}
return blobs;
}
void copyBlobData(InferenceEngine::Blob::Ptr& dst, const InferenceEngine::Blob::Ptr& src) {
if (src->getTensorDesc() != dst->getTensorDesc()) {
throw std::runtime_error(
"Source and destination blobs tensor descriptions are expected to be equal for data copying.");
}
InferenceEngine::MemoryBlob::Ptr srcMinput = as<InferenceEngine::MemoryBlob>(src);
if (!srcMinput) {
IE_THROW() << "We expect source blob to be inherited from MemoryBlob in "
"fillBlobImage, "
<< "but by fact we were not able to cast source blob to MemoryBlob";
}
// locked memory holder should be alive all time while access to its buffer
// happens
auto srcMinputHolder = srcMinput->wmap();
auto srcBlobData = srcMinputHolder.as<void*>();
InferenceEngine::MemoryBlob::Ptr dstMinput = as<InferenceEngine::MemoryBlob>(dst);
if (!dstMinput) {
IE_THROW() << "We expect destination blob to be inherited from MemoryBlob in "
"fillBlobImage, "
<< "but by fact we were not able to cast destination blob to MemoryBlob";
}
auto dstMinputHolder = dstMinput->wmap();
auto dstBlobData = dstMinputHolder.as<void*>();
std::memcpy(dstBlobData, srcBlobData, src->byteSize());
}

View File

@ -14,7 +14,14 @@
#include "utils.hpp"
// clang-format on
void fillBlobs(const std::vector<std::string>& inputFiles,
const size_t& batchSize,
benchmark_app::InputsInfo& app_inputs_info,
std::vector<InferReqWrap::Ptr> requests);
std::map<std::string, std::vector<InferenceEngine::Blob::Ptr>> getBlobs(
std::map<std::string, std::vector<std::string>>& inputFiles,
std::vector<benchmark_app::InputsInfo>& app_inputs_info);
std::map<std::string, std::vector<InferenceEngine::Blob::Ptr>> getBlobsStaticCase(
const std::vector<std::string>& inputFiles,
const size_t& batchSize,
benchmark_app::InputsInfo& app_inputs_info,
size_t requestsNum);
void copyBlobData(InferenceEngine::Blob::Ptr& dst, const InferenceEngine::Blob::Ptr& src);

View File

@ -34,14 +34,6 @@ using namespace InferenceEngine;
static const size_t progressBarDefaultTotalCount = 1000;
uint64_t getDurationInMilliseconds(uint32_t duration) {
return duration * 1000LL;
}
uint64_t getDurationInNanoseconds(uint32_t duration) {
return duration * 1000000000LL;
}
bool ParseAndCheckCommandLine(int argc, char* argv[]) {
// ---------------------------Parsing and validating input
// arguments--------------------------------------
@ -104,7 +96,7 @@ static void next_step(const std::string additional_info = "") {
{6, "Configuring input of the model"},
{7, "Loading the model to the device"},
{8, "Setting optimal runtime parameters"},
{9, "Creating infer requests and filling input blobs with images"},
{9, "Creating infer requests and preparing input blobs with data"},
{10, "Measuring performance"},
{11, "Dumping statistics report"}};
@ -116,13 +108,6 @@ static void next_step(const std::string additional_info = "") {
<< (additional_info.empty() ? "" : " (" + additional_info + ")") << std::endl;
}
template <typename T>
T getMedianValue(const std::vector<T>& vec, std::size_t percentile) {
std::vector<T> sortedVec(vec);
std::sort(sortedVec.begin(), sortedVec.end());
return sortedVec[(sortedVec.size() / 100) * percentile];
}
/**
* @brief The entry point of the benchmark application
*/
@ -180,15 +165,15 @@ int main(int argc, char* argv[]) {
load_config(FLAGS_load_config, config);
}
#endif
/** This vector stores paths to the processed images **/
std::vector<std::string> inputFiles;
parseInputFilesArguments(inputFiles);
/** This vector stores paths to the processed images with input names**/
auto inputFiles = parseInputArguments(gflags::GetArgvs());
// ----------------- 2. Loading the Inference Engine
// -----------------------------------------------------------
next_step();
Core ie;
if (FLAGS_d.find("CPU") != std::string::npos && !FLAGS_l.empty()) {
// CPU (MKLDNN) extensions is loaded as a shared library and passed as a
// pointer to base extension
@ -378,19 +363,10 @@ int main(int argc, char* argv[]) {
ie.SetConfig(item.second, item.first);
}
auto double_to_string = [](const double number) {
std::stringstream ss;
ss << std::fixed << std::setprecision(2) << number;
return ss.str();
};
auto get_total_ms_time = [](Time::time_point& startTime) {
return std::chrono::duration_cast<ns>(Time::now() - startTime).count() * 0.000001;
};
size_t batchSize = FLAGS_b;
Precision precision = Precision::UNSPECIFIED;
std::string topology_name = "";
benchmark_app::InputsInfo app_inputs_info;
std::vector<benchmark_app::InputsInfo> app_inputs_info;
std::string output_name;
// Takes priority over config from file
@ -398,6 +374,7 @@ int main(int argc, char* argv[]) {
ie.SetConfig({{CONFIG_KEY(CACHE_DIR), FLAGS_cache_dir}});
}
bool isDynamicNetwork = false;
if (FLAGS_load_from_file && !isNetworkCompiled) {
next_step();
slog::info << "Skipping the step for loading network from file" << slog::endl;
@ -407,14 +384,15 @@ int main(int argc, char* argv[]) {
slog::info << "Skipping the step for loading network from file" << slog::endl;
auto startTime = Time::now();
exeNetwork = ie.LoadNetwork(FLAGS_m, device_name);
auto duration_ms = double_to_string(get_total_ms_time(startTime));
auto duration_ms = double_to_string(get_duration_ms_till_now(startTime));
slog::info << "Load network took " << duration_ms << " ms" << slog::endl;
if (statistics)
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
{{"load network time (ms)", duration_ms}});
app_inputs_info = getInputsInfo<InputInfo::CPtr>(FLAGS_shape,
FLAGS_layout,
FLAGS_b,
batchSize,
FLAGS_data_shape,
FLAGS_iscale,
FLAGS_imean,
exeNetwork.GetInputsInfo());
@ -430,7 +408,7 @@ int main(int argc, char* argv[]) {
auto startTime = Time::now();
CNNNetwork cnnNetwork = ie.ReadNetwork(FLAGS_m);
auto duration_ms = double_to_string(get_total_ms_time(startTime));
auto duration_ms = double_to_string(get_duration_ms_till_now(startTime));
slog::info << "Read network took " << duration_ms << " ms" << slog::endl;
if (statistics)
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
@ -444,34 +422,45 @@ int main(int argc, char* argv[]) {
// ----------------- 5. Resizing network to match image sizes and given
// batch ----------------------------------
next_step();
batchSize = cnnNetwork.getBatchSize();
// Parse input shapes if specified
bool reshape = false;
app_inputs_info = getInputsInfo<InputInfo::Ptr>(FLAGS_shape,
FLAGS_layout,
FLAGS_b,
FLAGS_data_shape,
FLAGS_iscale,
FLAGS_imean,
inputInfo,
reshape);
if (reshape) {
InferenceEngine::ICNNNetwork::InputShapes shapes = {};
for (auto& item : app_inputs_info)
shapes[item.first] = item.second.shape;
benchmark_app::PartialShapes shapes = {};
for (auto& item : app_inputs_info[0])
shapes[item.first] = item.second.partialShape;
slog::info << "Reshaping network: " << getShapesString(shapes) << slog::endl;
startTime = Time::now();
cnnNetwork.reshape(shapes);
duration_ms = double_to_string(get_total_ms_time(startTime));
duration_ms = double_to_string(get_duration_ms_till_now(startTime));
slog::info << "Reshape network took " << duration_ms << " ms" << slog::endl;
if (statistics)
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
{{"reshape network time (ms)", duration_ms}});
}
// use batch size according to provided layout and shapes
batchSize = (!FLAGS_layout.empty()) ? getBatchSize(app_inputs_info) : cnnNetwork.getBatchSize();
topology_name = cnnNetwork.getName();
slog::info << (FLAGS_b != 0 ? "Network batch size was changed to: " : "Network batch size: ") << batchSize
// Check if network has dynamic shapes
auto input_info = app_inputs_info[0];
isDynamicNetwork = std::any_of(input_info.begin(),
input_info.end(),
[](const std::pair<std::string, benchmark_app::InputInfo>& i) {
return i.second.partialShape.is_dynamic();
});
// use batch size according to provided layout and shapes (static case)
if (batchSize == 0 || !isDynamicNetwork) {
batchSize = (!FLAGS_layout.empty()) ? getBatchSize(app_inputs_info[0]) : cnnNetwork.getBatchSize();
}
slog::info << (batchSize != 0 ? "Network batch size was changed to: " : "Network batch size: ") << batchSize
<< slog::endl;
// ----------------- 6. Configuring inputs and outputs
@ -482,11 +471,12 @@ int main(int argc, char* argv[]) {
for (auto& item : cnnNetwork.getInputsInfo()) {
// if precision for input set by user, then set it to app_inputs
// if it an image, set U8
if (!FLAGS_ip.empty() || FLAGS_iop.find(item.first) != std::string::npos) {
app_inputs_info.at(item.first).precision = item.second->getPrecision();
} else if (app_inputs_info.at(item.first).isImage()) {
app_inputs_info.at(item.first).precision = Precision::U8;
item.second->setPrecision(app_inputs_info.at(item.first).precision);
if (!FLAGS_ip.empty() || FLAGS_iop.find(item.first) != std::string::npos ||
item.second->getPartialShape().is_dynamic()) {
app_inputs_info[0].at(item.first).precision = item.second->getPrecision();
} else if (app_inputs_info[0].at(item.first).isImage()) {
app_inputs_info[0].at(item.first).precision = Precision::U8;
item.second->setPrecision(app_inputs_info[0].at(item.first).precision);
}
}
@ -496,7 +486,7 @@ int main(int argc, char* argv[]) {
next_step();
startTime = Time::now();
exeNetwork = ie.LoadNetwork(cnnNetwork, device_name);
duration_ms = double_to_string(get_total_ms_time(startTime));
duration_ms = double_to_string(get_duration_ms_till_now(startTime));
slog::info << "Load network took " << duration_ms << " ms" << slog::endl;
if (statistics)
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
@ -513,7 +503,7 @@ int main(int argc, char* argv[]) {
next_step();
auto startTime = Time::now();
exeNetwork = ie.ImportNetwork(FLAGS_m, device_name, {});
auto duration_ms = double_to_string(get_total_ms_time(startTime));
auto duration_ms = double_to_string(get_duration_ms_till_now(startTime));
slog::info << "Import network took " << duration_ms << " ms" << slog::endl;
if (statistics)
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
@ -521,6 +511,7 @@ int main(int argc, char* argv[]) {
app_inputs_info = getInputsInfo<InputInfo::CPtr>(FLAGS_shape,
FLAGS_layout,
FLAGS_b,
FLAGS_data_shape,
FLAGS_iscale,
FLAGS_imean,
exeNetwork.GetInputsInfo());
@ -528,6 +519,23 @@ int main(int argc, char* argv[]) {
batchSize = 1;
}
}
if (isDynamicNetwork && FLAGS_api == "sync") {
throw std::logic_error("Benchmarking of the model with dynamic shapes is available for async API only."
"Please use -api async -nstreams 1 -nireq 1 to emulate sync behavior");
}
// Defining of benchmark mode
// for static models inference only mode is used as default one
bool inferenceOnly = FLAGS_inference_only;
if (isDynamicNetwork) {
if (isFlagSetInCommandLine("inference_only") && inferenceOnly && app_inputs_info.size() != 1) {
throw std::logic_error(
"Dynamic models with different input data shapes must be benchmarked only in full mode.");
}
inferenceOnly = isFlagSetInCommandLine("inference_only") && inferenceOnly && app_inputs_info.size() == 1;
}
// ----------------- 8. Querying optimal runtime parameters
// -----------------------------------------------------
next_step();
@ -573,11 +581,21 @@ int main(int argc, char* argv[]) {
// Iteration limit
uint32_t niter = FLAGS_niter;
size_t shape_groups_num = app_inputs_info.size();
if ((niter > 0) && (FLAGS_api == "async")) {
niter = ((niter + nireq - 1) / nireq) * nireq;
if (FLAGS_niter != niter) {
slog::warn << "Number of iterations was aligned by request number from " << FLAGS_niter << " to "
<< niter << " using number of requests " << nireq << slog::endl;
if (shape_groups_num > nireq) {
niter = ((niter + shape_groups_num - 1) / shape_groups_num) * shape_groups_num;
if (FLAGS_niter != niter) {
slog::warn << "Number of iterations was aligned by data shape groups number from " << FLAGS_niter
<< " to " << niter << " using number of possible input shapes " << shape_groups_num
<< slog::endl;
}
} else {
niter = ((niter + nireq - 1) / nireq) * nireq;
if (FLAGS_niter != niter) {
slog::warn << "Number of iterations was aligned by request number from " << FLAGS_niter << " to "
<< niter << " using number of requests " << nireq << slog::endl;
}
}
}
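    // Alignment example (sketch): with FLAGS_niter = 10, nireq = 4 and a single shape group,
    // niter is rounded up to ((10 + 4 - 1) / 4) * 4 = 12 so every request runs the same number
    // of iterations; with 3 shape groups and nireq = 2 it is rounded to a multiple of 3 instead.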
@ -596,6 +614,7 @@ int main(int argc, char* argv[]) {
statistics->addParameters(
StatisticsReport::Category::RUNTIME_CONFIG,
{
{"benchmark mode", inferenceOnly ? "inference only" : "full"},
{"topology", topology_name},
{"target device", device_name},
{"API", FLAGS_api},
@ -619,18 +638,46 @@ int main(int argc, char* argv[]) {
// ----------------------------------------
next_step();
InferRequestsQueue inferRequestsQueue(exeNetwork, nireq);
if (isFlagSetInCommandLine("use_device_mem")) {
if (device_name.find("GPU") == 0)
::gpu::fillRemoteBlobs(inputFiles, batchSize, app_inputs_info, inferRequestsQueue.requests, exeNetwork);
else if (device_name.find("CPU") == 0)
fillBlobs(inputFiles, batchSize, app_inputs_info, inferRequestsQueue.requests);
else
IE_THROW() << "Requested device doesn't support `use_device_mem` option.";
} else {
fillBlobs(inputFiles, batchSize, app_inputs_info, inferRequestsQueue.requests);
}
InferRequestsQueue inferRequestsQueue(exeNetwork, nireq, app_inputs_info.size(), FLAGS_pcseq);
bool inputHasName = false;
if (inputFiles.size() > 0) {
inputHasName = inputFiles.begin()->first != "";
}
bool newInputType = isDynamicNetwork || inputHasName;
// create vector to store remote input blobs buffer
std::vector<::gpu::BufferType> clInputsBuffer;
bool useGpuMem = false;
std::map<std::string, std::vector<InferenceEngine::Blob::Ptr>> inputsData;
if (isFlagSetInCommandLine("use_device_mem")) {
if (device_name.find("GPU") == 0) {
inputsData = ::gpu::getRemoteInputBlobs(inputFiles, app_inputs_info, exeNetwork, clInputsBuffer);
useGpuMem = true;
} else if (device_name.find("CPU") == 0) {
if (newInputType) {
inputsData = getBlobs(inputFiles, app_inputs_info);
} else {
inputsData =
getBlobsStaticCase(inputFiles.empty() ? std::vector<std::string>{} : inputFiles.begin()->second,
batchSize,
app_inputs_info[0],
nireq);
}
} else {
IE_THROW() << "Requested device doesn't support `use_device_mem` option.";
}
} else {
if (newInputType) {
inputsData = getBlobs(inputFiles, app_inputs_info);
} else {
inputsData =
getBlobsStaticCase(inputFiles.empty() ? std::vector<std::string>{} : inputFiles.begin()->second,
batchSize,
app_inputs_info[0],
nireq);
}
}
// ----------------- 10. Measuring performance
// ------------------------------------------------------------------
size_t progressCnt = 0;
@ -668,26 +715,91 @@ int main(int argc, char* argv[]) {
}
ss << niter << " iterations";
}
next_step(ss.str());
if (inferenceOnly) {
slog::info << "BENCHMARK IS IN INFERENCE ONLY MODE." << slog::endl;
slog::info << "Input blobs will be filled once before performance measurements." << slog::endl;
} else {
slog::info << "BENCHMARK IS IN FULL MODE." << slog::endl;
slog::info << "Inputs setup stage will be included in performance measurements." << slog::endl;
}
// copy prepared data straight into inferRequest->getBlob()
// for inference only mode
if (inferenceOnly) {
if (nireq < inputsData.begin()->second.size())
slog::warn << "Only " << nireq << " test configs will be used." << slog::endl;
size_t i = 0;
for (auto& inferRequest : inferRequestsQueue.requests) {
auto inputs = app_inputs_info[i % app_inputs_info.size()];
for (auto& item : inputs) {
auto inputName = item.first;
const auto& inputBlob = inputsData.at(inputName)[i % inputsData.at(inputName).size()];
// for remote blobs setBlob is used, they are already allocated on the device
if (useGpuMem) {
inferRequest->setBlob(inputName, inputBlob);
} else {
InferenceEngine::Blob::Ptr requestBlob = inferRequest->getBlob(inputName);
if (isDynamicNetwork) {
requestBlob->setShape(inputBlob->getTensorDesc().getDims());
}
copyBlobData(requestBlob, inputBlob);
}
}
if (useGpuMem) {
auto outputBlobs = ::gpu::getRemoteOutputBlobs(exeNetwork, inferRequest->getOutputClBuffer());
for (auto& output : exeNetwork.GetOutputsInfo()) {
inferRequest->setBlob(output.first, outputBlobs[output.first]);
}
}
++i;
}
}
// warming up - out of scope
auto inferRequest = inferRequestsQueue.getIdleRequest();
if (!inferRequest) {
IE_THROW() << "No idle Infer Requests!";
}
if (!inferenceOnly) {
auto inputs = app_inputs_info[0];
for (auto& item : inputs) {
auto inputName = item.first;
const auto& data = inputsData.at(inputName)[0];
inferRequest->setBlob(inputName, data);
}
if (useGpuMem) {
auto outputBlobs = ::gpu::getRemoteOutputBlobs(exeNetwork, inferRequest->getOutputClBuffer());
for (auto& output : exeNetwork.GetOutputsInfo()) {
inferRequest->setBlob(output.first, outputBlobs[output.first]);
}
}
}
if (FLAGS_api == "sync") {
inferRequest->infer();
} else {
inferRequest->startAsync();
}
inferRequestsQueue.waitAll();
auto duration_ms = double_to_string(inferRequestsQueue.getLatencies()[0]);
slog::info << "First inference took " << duration_ms << " ms" << slog::endl;
if (statistics)
if (statistics) {
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
{{"first inference time (ms)", duration_ms}});
}
inferRequestsQueue.resetTimes();
size_t processedFramesN = 0;
auto startTime = Time::now();
auto execTime = std::chrono::duration_cast<ns>(Time::now() - startTime).count();
@ -695,7 +807,6 @@ int main(int argc, char* argv[]) {
/** to align number if iterations to guarantee that last infer requests are
* executed in the same conditions **/
ProgressBar progressBar(progressBarTotalCount, FLAGS_stream_output, FLAGS_progress);
while ((niter != 0LL && iteration < niter) ||
(duration_nanoseconds != 0LL && (uint64_t)execTime < duration_nanoseconds) ||
(FLAGS_api == "async" && iteration % nireq != 0)) {
@ -704,6 +815,31 @@ int main(int argc, char* argv[]) {
IE_THROW() << "No idle Infer Requests!";
}
if (!inferenceOnly) {
auto inputs = app_inputs_info[iteration % app_inputs_info.size()];
if (FLAGS_pcseq) {
inferRequest->setLatencyGroupId(iteration % app_inputs_info.size());
}
if (isDynamicNetwork) {
batchSize = getBatchSize(inputs);
}
for (auto& item : inputs) {
auto inputName = item.first;
const auto& data = inputsData.at(inputName)[iteration % inputsData.at(inputName).size()];
inferRequest->setBlob(inputName, data);
}
if (useGpuMem) {
auto outputBlobs = ::gpu::getRemoteOutputBlobs(exeNetwork, inferRequest->getOutputClBuffer());
for (auto& output : exeNetwork.GetOutputsInfo()) {
inferRequest->setBlob(output.first, outputBlobs[output.first]);
}
}
}
if (FLAGS_api == "sync") {
inferRequest->infer();
} else {
@ -716,9 +852,10 @@ int main(int argc, char* argv[]) {
inferRequest->wait();
inferRequest->startAsync();
}
iteration++;
++iteration;
execTime = std::chrono::duration_cast<ns>(Time::now() - startTime).count();
processedFramesN += batchSize;
if (niter > 0) {
progressBar.addProgress(1);
@ -737,10 +874,17 @@ int main(int argc, char* argv[]) {
// wait the latest inference executions
inferRequestsQueue.waitAll();
double latency = getMedianValue<double>(inferRequestsQueue.getLatencies(), FLAGS_latency_percentile);
LatencyMetrics generalLatency(inferRequestsQueue.getLatencies());
std::vector<LatencyMetrics> groupLatencies = {};
if (FLAGS_pcseq && app_inputs_info.size() > 1) {
for (auto lats : inferRequestsQueue.getLatencyGroups()) {
groupLatencies.push_back(LatencyMetrics(lats));
}
}
double totalDuration = inferRequestsQueue.getDurationInMilliseconds();
double fps =
(FLAGS_api == "sync") ? batchSize * 1000.0 / latency : batchSize * 1000.0 * iteration / totalDuration;
double fps = (FLAGS_api == "sync") ? batchSize * 1000.0 / generalLatency.percentile(FLAGS_latency_percentile)
: 1000.0 * processedFramesN / totalDuration;
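            // Throughput example (sketch): in async mode 100 iterations with batch size 2 give
            // processedFramesN = 200, so with totalDuration = 1000 ms the reported fps is
            // 1000.0 * 200 / 1000 = 200 FPS.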
if (statistics) {
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
@ -751,19 +895,67 @@ int main(int argc, char* argv[]) {
if (device_name.find("MULTI") == std::string::npos) {
std::string latency_label;
if (FLAGS_latency_percentile == 50) {
latency_label = "latency (ms)";
latency_label = "Median latency (ms)";
} else {
latency_label = "latency (" + std::to_string(FLAGS_latency_percentile) + " percentile) (ms)";
}
statistics->addParameters(
StatisticsReport::Category::EXECUTION_RESULTS,
{
{latency_label, double_to_string(generalLatency.percentile(FLAGS_latency_percentile))},
});
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
{
{latency_label, double_to_string(latency)},
{"Average latency (ms)", double_to_string(generalLatency.average())},
});
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
{
{"Min latency (ms)", double_to_string(generalLatency.min())},
});
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
{
{"Max latency (ms)", double_to_string(generalLatency.max())},
});
if (FLAGS_pcseq && app_inputs_info.size() > 1) {
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
{
{"Latency for each data shape group:", ""},
});
for (size_t i = 0; i < app_inputs_info.size(); ++i) {
std::string data_shapes_string = "";
data_shapes_string += std::to_string(i + 1) + ". ";
for (auto& item : app_inputs_info[i]) {
data_shapes_string += item.first + " : " + getShapeString(item.second.dataShape) + " ";
}
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
{
{data_shapes_string, ""},
});
statistics->addParameters(
StatisticsReport::Category::EXECUTION_RESULTS,
{
{latency_label,
double_to_string(groupLatencies[i].percentile(FLAGS_latency_percentile))},
});
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
{
{"Average (ms)", double_to_string(groupLatencies[i].average())},
});
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
{
{"Min (ms)", double_to_string(groupLatencies[i].min())},
});
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
{
{"Max (ms)", double_to_string(groupLatencies[i].max())},
});
}
}
}
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
{{"throughput", double_to_string(fps)}});
}
progressBar.finish();
// ----------------- 11. Dumping statistics report
@ -805,18 +997,32 @@ int main(int argc, char* argv[]) {
if (statistics)
statistics->dump();
std::cout << "Count: " << iteration << " iterations" << std::endl;
std::cout << "Duration: " << double_to_string(totalDuration) << " ms" << std::endl;
// Performance metrics report
slog::info << "Count: " << iteration << " iterations" << slog::endl;
slog::info << "Duration: " << double_to_string(totalDuration) << " ms" << slog::endl;
if (device_name.find("MULTI") == std::string::npos) {
std::cout << "Latency";
if (FLAGS_latency_percentile == 50) {
std::cout << ": ";
} else {
std::cout << " (" << FLAGS_latency_percentile << " percentile): ";
slog::info << "Latency: " << slog::endl;
generalLatency.logTotal(FLAGS_latency_percentile);
if (FLAGS_pcseq && app_inputs_info.size() > 1) {
slog::info << "Latency for each data shape group:" << slog::endl;
for (size_t i = 0; i < app_inputs_info.size(); ++i) {
slog::info << (i + 1) << ".";
for (auto& item : app_inputs_info[i]) {
std::stringstream input_shape;
auto shape = item.second.dataShape;
std::copy(shape.begin(), shape.end() - 1, std::ostream_iterator<int>(input_shape, ","));
input_shape << shape.back();
slog::info << " " << item.first << " : " << getShapeString(item.second.dataShape);
}
slog::info << slog::endl;
groupLatencies[i].logTotal(FLAGS_latency_percentile);
}
}
std::cout << double_to_string(latency) << " ms" << std::endl;
}
std::cout << "Throughput: " << double_to_string(fps) << " FPS" << std::endl;
slog::info << "Throughput: " << double_to_string(fps) << " FPS" << slog::endl;
} catch (const std::exception& ex) {
slog::err << ex.what() << slog::endl;

View File

@ -2,12 +2,15 @@
// SPDX-License-Identifier: Apache-2.0
//
// clang-format off
#include <memory>
#include <random>
#include <string>
#include <utility>
#include <vector>
// clang-format off
#include <samples/slog.hpp>
#include "remote_blobs_filling.hpp"
// clang-format on
@ -85,58 +88,98 @@ size_t getBytesPerElement(InferenceEngine::Precision precision) {
}
}
void fillRemoteBlobs(const std::vector<std::string>& inputFiles,
const size_t& batchSize,
benchmark_app::InputsInfo& app_inputs_info,
std::vector<InferReqWrap::Ptr> requests,
const InferenceEngine::ExecutableNetwork& exeNetwork) {
std::map<std::string, std::vector<InferenceEngine::Blob::Ptr>> getRemoteInputBlobs(
const std::map<std::string, std::vector<std::string>>& inputFiles,
const std::vector<benchmark_app::InputsInfo>& app_inputs_info,
const InferenceEngine::ExecutableNetwork& exeNetwork,
std::vector<BufferType>& clBuffer) {
#ifdef HAVE_DEVICE_MEM_SUPPORT
slog::info << "Device memory will be used for input and output blobs" << slog::endl;
if (inputFiles.size()) {
slog::warn << "Device memory supports only random data at this moment, input images will be ignored"
<< slog::endl;
}
std::map<std::string, std::vector<InferenceEngine::Blob::Ptr>> remoteBlobs;
auto context = exeNetwork.GetContext();
auto oclContext = std::dynamic_pointer_cast<InferenceEngine::gpu::ClContext>(context)->get();
auto oclInstance = std::make_shared<OpenCL>(oclContext);
auto setShared = [&](size_t requestId,
const std::string name,
const InferenceEngine::TensorDesc& desc,
bool fillRandom = false) {
auto setShared = [&](const std::string name, const InferenceEngine::TensorDesc& desc, bool fillRandom = false) {
cl_int err;
auto inputDims = desc.getDims();
auto elementsNum = std::accumulate(begin(inputDims), end(inputDims), 1, std::multiplies<size_t>());
auto inputSize = elementsNum * getBytesPerElement(desc.getPrecision());
cl::Buffer sharedBuffer =
cl::Buffer(oclInstance->_context, CL_MEM_READ_WRITE, (cl::size_type)inputSize, NULL, &err);
clBuffer.push_back(cl::Buffer(oclInstance->_context, CL_MEM_READ_WRITE, (cl::size_type)inputSize, NULL, &err));
if (fillRandom) {
void* mappedPtr = oclInstance->_queue.enqueueMapBuffer(sharedBuffer,
void* mappedPtr = oclInstance->_queue.enqueueMapBuffer(clBuffer.back(),
CL_TRUE,
CL_MEM_READ_WRITE,
0,
(cl::size_type)inputSize);
fillBuffer(mappedPtr, elementsNum, desc.getPrecision());
oclInstance->_queue.enqueueUnmapMemObject(sharedBuffer, mappedPtr);
oclInstance->_queue.enqueueUnmapMemObject(clBuffer.back(), mappedPtr);
}
InferenceEngine::Blob::Ptr sharedBlob = InferenceEngine::gpu::make_shared_blob(desc, context, sharedBuffer);
requests.at(requestId)->setBlob(name, sharedBlob);
auto blob = InferenceEngine::gpu::make_shared_blob(desc, context, clBuffer.back());
remoteBlobs[name].push_back(blob);
};
for (size_t requestId = 0; requestId < requests.size(); requestId++) {
for (auto& item : exeNetwork.GetInputsInfo())
setShared(requestId, item.first, item.second->getTensorDesc(), true);
for (auto& item : exeNetwork.GetOutputsInfo())
setShared(requestId, item.first, item.second->getTensorDesc());
for (auto& inputs_info : app_inputs_info) {
for (auto& input : inputs_info) {
// Fill random
slog::info << "Prepare remote blob for input '" << input.first << "' with random values ("
<< std::string((input.second.isImage() ? "image" : "some binary data")) << " is expected)"
<< slog::endl;
setShared(input.first,
InferenceEngine::TensorDesc(input.second.precision,
input.second.dataShape,
getLayoutFromString(input.second.layout)),
true);
}
}
return remoteBlobs;
#else
IE_THROW() << "Device memory requested for GPU device, but OpenCL was not linked";
#endif
}
std::map<std::string, InferenceEngine::Blob::Ptr> getRemoteOutputBlobs(
const InferenceEngine::ExecutableNetwork& exeNetwork,
std::map<std::string, ::gpu::BufferType>& clBuffer) {
#ifdef HAVE_DEVICE_MEM_SUPPORT
std::map<std::string, InferenceEngine::Blob::Ptr> outputBlobs;
for (auto& output : exeNetwork.GetOutputsInfo()) {
cl_int err;
auto context = exeNetwork.GetContext();
auto oclContext = std::dynamic_pointer_cast<InferenceEngine::gpu::ClContext>(context)->get();
auto oclInstance = std::make_shared<OpenCL>(oclContext);
auto desc = output.second->getTensorDesc();
auto inputDims = desc.getDims();
auto elementsNum = std::accumulate(begin(inputDims), end(inputDims), 1, std::multiplies<size_t>());
auto inputSize = elementsNum * getBytesPerElement(desc.getPrecision());
cl::size_type bufferSize = 0;
if (clBuffer.find(output.first) == clBuffer.end()) {
clBuffer[output.first] =
cl::Buffer(oclInstance->_context, CL_MEM_READ_WRITE, (cl::size_type)inputSize, NULL, &err);
} else {
auto& buff = clBuffer[output.first];
buff.getInfo(CL_MEM_SIZE, &bufferSize);
if (inputSize != bufferSize) {
buff = cl::Buffer(oclInstance->_context, CL_MEM_READ_WRITE, (cl::size_type)inputSize, NULL, &err);
}
}
outputBlobs[output.first] = InferenceEngine::gpu::make_shared_blob(desc, context, clBuffer[output.first]);
}
return outputBlobs;
#else
IE_THROW() << "Device memory requested for GPU device, but OpenCL was not linked";
#endif
}
} // namespace gpu

View File

@ -2,6 +2,8 @@
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#if defined(HAVE_GPU_DEVICE_MEM_SUPPORT)
# define HAVE_DEVICE_MEM_SUPPORT
# include "gpu/gpu_context_api_ocl.hpp"
@ -10,13 +12,14 @@
// clang-format off
#include "inference_engine.hpp"
#include "infer_request_wrap.hpp"
#include "utils.hpp"
// clang-format on
namespace gpu {
#ifdef HAVE_DEVICE_MEM_SUPPORT
using BufferType = cl::Buffer;
struct OpenCL {
cl::Context _context;
cl::Device _device;
@ -55,12 +58,18 @@ struct OpenCL {
_queue = cl::CommandQueue(_context, _device, props);
}
};
#else
using BufferType = void*;
#endif
void fillRemoteBlobs(const std::vector<std::string>& inputFiles,
const size_t& batchSize,
benchmark_app::InputsInfo& app_inputs_info,
std::vector<InferReqWrap::Ptr> requests,
const InferenceEngine::ExecutableNetwork& exeNetwork);
std::map<std::string, std::vector<InferenceEngine::Blob::Ptr>> getRemoteInputBlobs(
const std::map<std::string, std::vector<std::string>>& inputFiles,
const std::vector<benchmark_app::InputsInfo>& app_inputs_info,
const InferenceEngine::ExecutableNetwork& exeNetwork,
std::vector<BufferType>& clBuffer);
std::map<std::string, InferenceEngine::Blob::Ptr> getRemoteOutputBlobs(
const InferenceEngine::ExecutableNetwork& exeNetwork,
std::map<std::string, ::gpu::BufferType>& clBuffer);
} // namespace gpu

View File

@ -0,0 +1,43 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "ie_allocator.hpp"
template <class T>
class SharedBlobAllocator : public InferenceEngine::IAllocator {
public:
SharedBlobAllocator(const T* data, size_t size) : data(data), size(size){};
~SharedBlobAllocator() {
free((void*)data);
};
void* lock(void* handle, InferenceEngine::LockOp op = InferenceEngine::LOCK_FOR_WRITE) noexcept override {
if (handle == data) {
return (void*)data;
}
return nullptr;
}
void unlock(void* handle) noexcept override{};
void* alloc(size_t size) noexcept override {
return size <= this->size ? (void*)data : nullptr;
};
bool free(void* handle) noexcept override {
if (handle == data) {
delete[] data;
data = nullptr;
return true;
}
return false;
};
private:
const T* data;
size_t size;
};
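// Usage sketch (mirrors how benchmark_app builds input blobs; sizes and precision here are
// illustrative only): wrap a raw buffer that the allocator takes ownership of, then hand it
// to make_shared_blob.
//
//   float* data = new float[150528];
//   // ... fill data ...
//   InferenceEngine::TensorDesc desc(InferenceEngine::Precision::FP32,
//                                    {1, 3, 224, 224},
//                                    InferenceEngine::Layout::NCHW);
//   auto blob = InferenceEngine::make_shared_blob<float>(
//       desc, std::make_shared<SharedBlobAllocator<float>>(data, 150528 * sizeof(float)));
//   blob->allocate();  // alloc() hands back the pre-filled buffer instead of allocating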

View File

@ -11,9 +11,12 @@
// clang-format off
#include "inference_engine.hpp"
#include "samples/common.hpp"
#include "samples/csv_dumper.hpp"
#include "samples/slog.hpp"
#include "utils.hpp"
// clang-format on
// @brief statistics reports types
@ -21,6 +24,53 @@ static constexpr char noCntReport[] = "no_counters";
static constexpr char averageCntReport[] = "average_counters";
static constexpr char detailedCntReport[] = "detailed_counters";
/// @brief Responsible for calculating different latency metrics
class LatencyMetrics {
public:
LatencyMetrics() = delete;
LatencyMetrics(const std::vector<double>& latencies) : latencies(latencies) {
if (latencies.empty()) {
throw std::logic_error("Latency metrics class expects non-empty vector of latencies at consturction.");
}
std::sort(this->latencies.begin(), this->latencies.end());
}
LatencyMetrics(std::vector<double>&& latencies) : latencies(latencies) {
if (latencies.empty()) {
throw std::logic_error("Latency metrics class expects non-empty vector of latencies at consturction.");
}
std::sort(this->latencies.begin(), this->latencies.end());
}
double min() {
return latencies[0];
}
double average() {
return std::accumulate(latencies.begin(), latencies.end(), 0.0) / latencies.size();
}
double percentile(std::size_t p) {
return latencies[size_t(latencies.size() / 100.0 * p)];
}
double max() {
return latencies.back();
}
void logTotal(size_t p) {
std::string percentileStr = (p == 50) ? "\tMedian: " : "\t" + std::to_string(p) + " percentile: ";
slog::info << percentileStr << double_to_string(percentile(p)) << " ms" << slog::endl;
slog::info << "\tAvg: " << double_to_string(average()) << " ms" << slog::endl;
slog::info << "\tMin: " << double_to_string(min()) << " ms" << slog::endl;
slog::info << "\tMax: " << double_to_string(max()) << " ms" << slog::endl;
}
private:
std::vector<double> latencies;
};
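// Usage sketch (latency values are illustrative): given per-iteration latencies such as
// {12.1, 10.4, 11.7, 25.3} ms collected by the requests queue,
//   LatencyMetrics m(latencies);
//   m.min();           // 10.4
//   m.average();       // (10.4 + 11.7 + 12.1 + 25.3) / 4 = 14.875
//   m.percentile(50);  // sorted index size_t(4 / 100.0 * 50) = 2 -> 12.1
//   m.max();           // 25.3
// Note that percentile() indexes the sorted vector at floor(size * p / 100), so for small
// sample counts it behaves like an upper median rather than an interpolated percentile.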
/// @brief Responsible for collecting of statistics and dumping to .csv file
class StatisticsReport {
public:

View File

@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
// clang-format off
#include <algorithm>
#include <map>
#include <regex>
@ -10,8 +9,10 @@
#include <utility>
#include <vector>
#include "samples/common.hpp"
#include "samples/slog.hpp"
// clang-format off
#include <samples/args_helper.hpp>
#include <samples/common.hpp>
#include <samples/slog.hpp>
#include "utils.hpp"
// clang-format on
@ -35,7 +36,7 @@ size_t InputInfo::getDimentionByLayout(char character) const {
size_t pos = layout.find(character);
if (pos == std::string::npos)
throw std::runtime_error("Error: Can't get " + std::string(character, 1) + " from layout " + layout);
return shape.at(pos);
return dataShape.at(pos);
}
size_t InputInfo::width() const {
return getDimentionByLayout('W');
@ -152,8 +153,8 @@ size_t getBatchSize(const benchmark_app::InputsInfo& inputs_info) {
std::size_t batch_index = info.second.layout.find("N");
if (batch_index != std::string::npos) {
if (batch_size == 0)
batch_size = info.second.shape[batch_index];
else if (batch_size != info.second.shape[batch_index])
batch_size = info.second.dataShape[batch_index];
else if (batch_size != info.second.dataShape[batch_index])
throw std::logic_error("Can't deterimine batch size: batch is "
"different for different inputs!");
}
@ -163,6 +164,47 @@ size_t getBatchSize(const benchmark_app::InputsInfo& inputs_info) {
return batch_size;
}
InferenceEngine::Layout getLayoutFromString(const std::string& string_layout) {
static const std::unordered_map<std::string, InferenceEngine::Layout> layouts = {
{"NCHW", InferenceEngine::Layout::NCHW},
{"NHWC", InferenceEngine::Layout::NHWC},
{"NCDHW", InferenceEngine::Layout::NCDHW},
{"NDHWC", InferenceEngine::Layout::NDHWC},
{"C", InferenceEngine::Layout::C},
{"CHW", InferenceEngine::Layout::CHW},
{"HWC", InferenceEngine::Layout::HWC},
{"HW", InferenceEngine::Layout::HW},
{"NC", InferenceEngine::Layout::NC},
{"CN", InferenceEngine::Layout::CN}};
auto it = layouts.find(string_layout);
if (it != layouts.end()) {
return it->second;
}
IE_THROW() << "Unknown layout with name '" << string_layout << "'.";
}
std::string getShapeString(const InferenceEngine::SizeVector& shape) {
std::stringstream ss;
ss << "[";
for (size_t i = 0; i < shape.size(); ++i) {
if (i > 0)
ss << ", ";
ss << shape.at(i);
}
ss << "]";
return ss.str();
}
std::string getShapesString(const benchmark_app::PartialShapes& shapes) {
std::stringstream ss;
for (auto& shape : shapes) {
if (!ss.str().empty())
ss << ", ";
ss << "\'" << shape.first << "': " << shape.second;
}
return ss.str();
}
std::string getShapesString(const InferenceEngine::ICNNNetwork::InputShapes& shapes) {
std::stringstream ss;
for (auto& shape : shapes) {
@ -218,6 +260,120 @@ std::map<std::string, std::vector<float>> parseScaleOrMean(const std::string& sc
return return_value;
}
std::vector<ngraph::Dimension> parsePartialShape(const std::string& partial_shape) {
std::vector<ngraph::Dimension> shape;
for (auto& dim : split(partial_shape, ',')) {
if (dim == "?" || dim == "-1") {
shape.push_back(ngraph::Dimension::dynamic());
} else {
const std::string range_divider = "..";
size_t range_index = dim.find(range_divider);
if (range_index != std::string::npos) {
std::string min = dim.substr(0, range_index);
std::string max = dim.substr(range_index + range_divider.length());
shape.push_back(ngraph::Dimension(min.empty() ? 0 : std::stoi(min),
max.empty() ? ngraph::Interval::s_max : std::stoi(max)));
} else {
shape.push_back(std::stoi(dim));
}
}
}
return shape;
}
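To illustrate the dimension syntax accepted above (a sketch only, assuming the declarations from the sample's utils.hpp are visible): fixed values, "?"/"-1" for fully dynamic dimensions, and "min..max" ranges can be mixed per dimension.
// Illustrative usage of parsePartialShape(); comments show the resulting dimensions.
void demo_parse_partial_shape() {
    auto fully_static = parsePartialShape("1,3,224,224");      // {1, 3, 224, 224}
    auto dynamic_hw = parsePartialShape("1,3,?,?");             // H and W are fully dynamic
    auto bounded_batch = parsePartialShape("1..8,3,224,224");   // batch constrained to the range [1, 8]
    (void)fully_static; (void)dynamic_hw; (void)bounded_batch;
}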
InferenceEngine::SizeVector parseTensorShape(const std::string& dataShape) {
std::vector<size_t> shape;
for (auto& dim : split(dataShape, ',')) {
shape.push_back(std::stoi(dim));
}
return shape;
}
std::pair<std::string, std::vector<std::string>> parseInputFiles(const std::string& file_paths_string) {
auto search_string = file_paths_string;
std::string input_name = "";
std::vector<std::string> file_paths;
// parse strings like <input1>:file1,file2,file3 and extract the input name from them
size_t semicolon_pos = search_string.find_first_of(":");
size_t quote_pos = search_string.find_first_of("\"");
if (semicolon_pos != std::string::npos && quote_pos != std::string::npos && semicolon_pos > quote_pos) {
// if : is found after opening " symbol - this means that " belongs to pathname
semicolon_pos = std::string::npos;
}
if (search_string.length() > 2 && semicolon_pos == 1 && search_string[2] == '\\') {
// Special case like C:\ denotes drive name, not an input name
semicolon_pos = std::string::npos;
}
if (semicolon_pos != std::string::npos) {
input_name = search_string.substr(0, semicolon_pos);
search_string = search_string.substr(semicolon_pos + 1);
}
// parse file1,file2,file3 and get vector of paths
size_t coma_pos = 0;
do {
coma_pos = search_string.find_first_of(',');
file_paths.push_back(search_string.substr(0, coma_pos));
if (coma_pos == std::string::npos) {
search_string = "";
break;
}
search_string = search_string.substr(coma_pos + 1);
} while (coma_pos != std::string::npos);
if (!search_string.empty())
throw std::logic_error("Can't parse file paths for input " + input_name +
" in input parameter string: " + file_paths_string);
return {input_name, file_paths};
}
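As an illustration of the -i value format handled above (a sketch, assuming the helper is visible through the sample's headers): the optional "<input_name>:" prefix is split off unless it looks like a Windows drive letter or sits inside quotes.
// Illustrative usage of parseInputFiles(); comments show the expected results.
void demo_parse_input_files() {
    auto named = parseInputFiles("data:img1.bmp,img2.bmp,img3.bmp");
    // named.first == "data", named.second == {"img1.bmp", "img2.bmp", "img3.bmp"}
    auto drive = parseInputFiles("C:\\images\\img1.bmp");
    // drive.first is empty: "C:" is treated as a drive letter, not an input name
    (void)named; (void)drive;
}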
std::map<std::string, std::vector<std::string>> parseInputArguments(const std::vector<std::string>& args) {
std::map<std::string, std::vector<std::string>> mapped_files = {};
auto args_it = begin(args);
const auto is_image_arg = [](const std::string& s) {
return s == "-i";
};
const auto is_arg = [](const std::string& s) {
return s.front() == '-';
};
while (args_it != args.end()) {
const auto files_start = std::find_if(args_it, end(args), is_image_arg);
if (files_start == end(args)) {
break;
}
const auto files_begin = std::next(files_start);
const auto files_end = std::find_if(files_begin, end(args), is_arg);
for (auto f = files_begin; f != files_end; ++f) {
auto files = parseInputFiles(*f);
if (mapped_files.find(files.first) == mapped_files.end()) {
mapped_files[files.first] = {};
}
for (auto& file : files.second) {
readInputFilesArguments(mapped_files[files.first], file);
}
}
args_it = files_end;
}
size_t max_files = 20;
for (auto& files : mapped_files) {
if (files.second.size() <= max_files) {
slog::info << "For input " << files.first << " " << files.second.size() << " files were added. "
<< slog::endl;
} else {
slog::info << "For input " << files.first << " " << files.second.size() << " files were added. "
<< " The number of files will be limited to " << max_files << "." << slog::endl;
files.second.resize(max_files);
}
}
return mapped_files;
}
#ifdef USE_OPENCV
void dump_config(const std::string& filename, const std::map<std::string, std::map<std::string, std::string>>& config) {
auto plugin_to_opencv_format = [](const std::string& str) -> std::string {

View File

@ -4,15 +4,43 @@
#pragma once
#include <chrono>
#include <iomanip>
#include <map>
#include <samples/slog.hpp>
#include <string>
#include <vector>
#include "ngraph/partial_shape.hpp"
typedef std::chrono::high_resolution_clock Time;
typedef std::chrono::nanoseconds ns;
inline uint64_t getDurationInMilliseconds(uint32_t duration) {
return duration * 1000LL;
}
inline uint64_t getDurationInNanoseconds(uint32_t duration) {
return duration * 1000000000LL;
}
inline double get_duration_ms_till_now(Time::time_point& startTime) {
return std::chrono::duration_cast<ns>(Time::now() - startTime).count() * 0.000001;
};
inline std::string double_to_string(const double number) {
std::stringstream ss;
ss << std::fixed << std::setprecision(2) << number;
return ss.str();
};
namespace benchmark_app {
struct InputInfo {
InferenceEngine::Precision precision;
InferenceEngine::SizeVector shape;
ngraph::PartialShape partialShape;
InferenceEngine::SizeVector dataShape;
std::string layout;
InferenceEngine::Layout originalLayout;
std::vector<float> scale;
std::vector<float> mean;
bool isImage() const;
@ -25,43 +53,56 @@ struct InputInfo {
size_t depth() const;
};
using InputsInfo = std::map<std::string, InputInfo>;
using PartialShapes = std::map<std::string, ngraph::PartialShape>;
} // namespace benchmark_app
std::vector<std::string> parseDevices(const std::string& device_string);
uint32_t deviceDefaultDeviceDurationInSeconds(const std::string& device);
std::map<std::string, std::string> parseNStreamsValuePerDevice(const std::vector<std::string>& devices,
const std::string& values_string);
InferenceEngine::Layout getLayoutFromString(const std::string& string_layout);
std::string getShapeString(const InferenceEngine::SizeVector& shape);
std::string getShapesString(const benchmark_app::PartialShapes& shapes);
std::string getShapesString(const InferenceEngine::ICNNNetwork::InputShapes& shapes);
size_t getBatchSize(const benchmark_app::InputsInfo& inputs_info);
std::vector<std::string> split(const std::string& s, char delim);
std::map<std::string, std::vector<float>> parseScaleOrMean(const std::string& scale_mean,
const benchmark_app::InputsInfo& inputs_info);
std::vector<ngraph::Dimension> parsePartialShape(const std::string& partial_shape);
InferenceEngine::SizeVector parseTensorShape(const std::string& data_shape);
std::pair<std::string, std::vector<std::string>> parseInputFiles(const std::string& file_paths_string);
std::map<std::string, std::vector<std::string>> parseInputArguments(const std::vector<std::string>& args);
template <typename T>
std::map<std::string, std::string> parseInputParameters(const std::string parameter_string,
const std::map<std::string, T>& input_info) {
// Parse parameter string like "input0[value0],input1[value1]" or "[value]" (applied to all
// inputs)
std::map<std::string, std::string> return_value;
std::map<std::string, std::vector<std::string>> parseInputParameters(const std::string parameter_string,
const std::map<std::string, T>& input_info) {
// Parse parameter string like "[value0]", "[value0][value1]" or "input0[value0][value1],input1[value2][value3]"
// (applied to all inputs)
std::map<std::string, std::vector<std::string>> return_value;
std::string search_string = parameter_string;
auto start_pos = search_string.find_first_of('[');
auto input_name = search_string.substr(0, start_pos);
while (start_pos != std::string::npos) {
auto end_pos = search_string.find_first_of(']');
if (end_pos == std::string::npos)
break;
auto input_name = search_string.substr(0, start_pos);
if (start_pos)
input_name = search_string.substr(0, start_pos);
auto input_value = search_string.substr(start_pos + 1, end_pos - start_pos - 1);
if (!input_name.empty()) {
return_value[input_name] = input_value;
return_value[input_name].push_back(input_value);
} else {
for (auto& item : input_info) {
return_value[item.first] = input_value;
return_value[item.first].push_back(input_value);
}
}
search_string = search_string.substr(end_pos + 1);
if (search_string.empty() || search_string.front() != ',')
if (search_string.empty() || (search_string.front() != ',' && search_string.front() != '['))
break;
search_string = search_string.substr(1);
if (search_string.front() == ',')
search_string = search_string.substr(1);
start_pos = search_string.find_first_of('[');
}
if (!search_string.empty())
@ -70,87 +111,156 @@ std::map<std::string, std::string> parseInputParameters(const std::string parame
}
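For clarity (a sketch, not part of the benchmark_app sources; assumes the sample's utils.hpp is included), the bracket syntax parsed above maps each input name to a list of values, and a name-less group is broadcast to every input:
// Illustrative usage of parseInputParameters(); any map value type works for input_info.
void demo_parse_input_parameters() {
    std::map<std::string, int> inputs = {{"input0", 0}, {"input1", 0}};
    auto per_input = parseInputParameters("input0[1,3,224,224],input1[1,10]", inputs);
    // per_input["input0"] == {"1,3,224,224"}, per_input["input1"] == {"1,10"}
    auto multi_value = parseInputParameters("input0[1,3,224,224][1,3,448,448]", inputs);
    // multi_value["input0"] == {"1,3,224,224", "1,3,448,448"}
    auto broadcast = parseInputParameters("[1,3,224,224]", inputs);
    // broadcast assigns the same value to both "input0" and "input1"
    (void)per_input; (void)multi_value; (void)broadcast;
}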
template <typename T>
benchmark_app::InputsInfo getInputsInfo(const std::string& shape_string,
const std::string& layout_string,
const size_t batch_size,
const std::string& scale_string,
const std::string& mean_string,
const std::map<std::string, T>& input_info,
bool& reshape_required) {
std::map<std::string, std::string> shape_map = parseInputParameters(shape_string, input_info);
std::map<std::string, std::string> layout_map = parseInputParameters(layout_string, input_info);
std::vector<benchmark_app::InputsInfo> getInputsInfo(const std::string& shape_string,
const std::string& layout_string,
const size_t batch_size,
const std::string& data_shapes_string,
const std::string& scale_string,
const std::string& mean_string,
const std::map<std::string, T>& input_info,
bool& reshape_required) {
std::map<std::string, std::vector<std::string>> shape_map = parseInputParameters(shape_string, input_info);
std::map<std::string, std::vector<std::string>> data_shapes_map =
parseInputParameters(data_shapes_string, input_info);
std::map<std::string, std::vector<std::string>> layout_map = parseInputParameters(layout_string, input_info);
size_t min_size = 1, max_size = 1;
if (!data_shapes_map.empty()) {
min_size = std::min_element(data_shapes_map.begin(),
data_shapes_map.end(),
[](std::pair<std::string, std::vector<std::string>> a,
std::pair<std::string, std::vector<std::string>> b) {
return a.second.size() < b.second.size() && a.second.size() != 1;
})
->second.size();
max_size = std::max_element(data_shapes_map.begin(),
data_shapes_map.end(),
[](std::pair<std::string, std::vector<std::string>> a,
std::pair<std::string, std::vector<std::string>> b) {
return a.second.size() < b.second.size();
})
->second.size();
if (min_size != max_size) {
throw std::logic_error(
"Shapes number for every input should be either 1 or should be equal to shapes number of other inputs");
}
}
reshape_required = false;
benchmark_app::InputsInfo info_map;
for (auto& item : input_info) {
benchmark_app::InputInfo info;
auto name = item.first;
auto descriptor = item.second->getTensorDesc();
// Precision
info.precision = descriptor.getPrecision();
// Shape
if (shape_map.count(name)) {
std::vector<size_t> parsed_shape;
for (auto& dim : split(shape_map.at(name), ',')) {
parsed_shape.push_back(std::stoi(dim));
}
info.shape = parsed_shape;
reshape_required = true;
} else {
info.shape = descriptor.getDims();
}
// Layout
if (layout_map.count(name)) {
info.layout = layout_map.at(name);
std::transform(info.layout.begin(), info.layout.end(), info.layout.begin(), ::toupper);
} else {
std::stringstream ss;
ss << descriptor.getLayout();
info.layout = ss.str();
}
// Update shape with batch if needed
if (batch_size != 0) {
std::size_t batch_index = info.layout.find("N");
if ((batch_index != std::string::npos) && (info.shape.at(batch_index) != batch_size)) {
info.shape[batch_index] = batch_size;
std::vector<benchmark_app::InputsInfo> info_maps;
for (size_t i = 0; i < min_size; ++i) {
benchmark_app::InputsInfo info_map;
for (auto& item : input_info) {
benchmark_app::InputInfo info;
auto name = item.first;
auto descriptor = item.second->getTensorDesc();
// Precision
info.precision = descriptor.getPrecision();
// Partial Shape
if (shape_map.count(name)) {
std::vector<ngraph::Dimension> parsed_shape;
if (shape_map.at(name).size() > 1) {
throw std::logic_error(
"shape command line parameter doesn't support multiple shapes for one input.");
}
info.partialShape = parsePartialShape(shape_map.at(name)[0]);
reshape_required = true;
} else {
info.partialShape = item.second->getPartialShape();
}
if (info.partialShape.is_dynamic() && info.isImage()) {
throw std::logic_error(
"benchmark_app supports only binary and random data as input for dynamic models at this moment.");
}
// Tensor Shape
if (info.partialShape.is_dynamic() && data_shapes_map.count(name)) {
info.dataShape = parseTensorShape(data_shapes_map.at(name)[i % data_shapes_map.at(name).size()]);
} else if (info.partialShape.is_static()) {
info.dataShape = info.partialShape.get_shape();
if (data_shapes_map.find(name) != data_shapes_map.end()) {
throw std::logic_error(
"Network's input \"" + name +
"\" is static. Use -shape argument for static inputs instead of -data_shape.");
}
} else if (!data_shapes_map.empty()) {
throw std::logic_error("Can't find network input name \"" + name + "\" in \"-data_shape " +
data_shapes_string + "\" command line parameter");
} else {
throw std::logic_error(
"data_shape command line parameter should be set in case of network with dynamic shapes.");
}
// Layout
info.originalLayout = descriptor.getLayout();
if (layout_map.count(name)) {
if (layout_map.at(name).size() > 1) {
throw std::logic_error(
"layout command line parameter doesn't support multiple layouts for one input.");
}
info.layout = layout_map.at(name)[0];
std::transform(info.layout.begin(), info.layout.end(), info.layout.begin(), ::toupper);
} else {
std::stringstream ss;
ss << descriptor.getLayout();
info.layout = ss.str();
}
// Update shape with batch if needed (only in static shape case)
// Update only the blob shape, leaving the network shape untouched, to trigger the dynamic batch size case
if (batch_size != 0) {
std::size_t batch_index = info.layout.find("N");
if ((batch_index != std::string::npos) && (info.dataShape.at(batch_index) != batch_size)) {
if (info.partialShape.is_static()) {
info.partialShape[batch_index] = batch_size;
}
info.dataShape[batch_index] = batch_size;
reshape_required = true;
}
}
info_map[name] = info;
}
// Update scale and mean
std::map<std::string, std::vector<float>> scale_map = parseScaleOrMean(scale_string, info_map);
std::map<std::string, std::vector<float>> mean_map = parseScaleOrMean(mean_string, info_map);
for (auto& item : info_map) {
if (item.second.isImage()) {
item.second.scale.assign({1, 1, 1});
item.second.mean.assign({0, 0, 0});
if (scale_map.count(item.first)) {
item.second.scale = scale_map.at(item.first);
}
if (mean_map.count(item.first)) {
item.second.mean = mean_map.at(item.first);
}
}
}
info_map[name] = info;
info_maps.push_back(info_map);
}
// Update scale and mean
std::map<std::string, std::vector<float>> scale_map = parseScaleOrMean(scale_string, info_map);
std::map<std::string, std::vector<float>> mean_map = parseScaleOrMean(mean_string, info_map);
for (auto& item : info_map) {
if (item.second.isImage()) {
item.second.scale.assign({1, 1, 1});
item.second.mean.assign({0, 0, 0});
if (scale_map.count(item.first)) {
item.second.scale = scale_map.at(item.first);
}
if (mean_map.count(item.first)) {
item.second.mean = mean_map.at(item.first);
}
}
}
return info_map;
return info_maps;
}
template <typename T>
benchmark_app::InputsInfo getInputsInfo(const std::string& shape_string,
const std::string& layout_string,
const size_t batch_size,
const std::string& scale_string,
const std::string& mean_string,
const std::map<std::string, T>& input_info) {
std::vector<benchmark_app::InputsInfo> getInputsInfo(const std::string& shape_string,
const std::string& layout_string,
const size_t batch_size,
const std::string& data_shapes_string,
const std::string& scale_string,
const std::string& mean_string,
const std::map<std::string, T>& input_info) {
bool reshape_required = false;
return getInputsInfo<T>(shape_string,
layout_string,
batch_size,
data_shapes_string,
scale_string,
mean_string,
input_info,

View File

@ -679,6 +679,15 @@ inline std::string getFullDeviceName(InferenceEngine::Core& ie, std::string devi
}
}
inline std::string getFullDeviceName(ov::runtime::Core& ie, std::string device) {
InferenceEngine::Parameter p;
try {
p = ie.get_metric(device, METRIC_KEY(FULL_DEVICE_NAME));
return p.as<std::string>();
} catch (InferenceEngine::Exception&) {
return "";
}
}
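A quick usage sketch for the overload above (illustrative only; "CPU" is just an example device name): it returns the FULL_DEVICE_NAME metric, or an empty string when the device does not report it.
// Illustrative usage of getFullDeviceName() with an ov::runtime::Core instance.
void demo_full_device_name() {
    ov::runtime::Core core;
    const std::string name = getFullDeviceName(core, "CPU");
    // e.g. a human-readable processor name; empty if the metric is unsupported
    (void)name;
}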
/**
* @brief This class represents an object that is found by an object detection net
*/

File diff suppressed because it is too large

View File

@ -1,7 +1,3 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <gflags/gflags.h>
@ -228,3 +224,95 @@ static void showUsage() {
std::cout << " -iname \"<string>\" " << input_layer_names_message << std::endl;
std::cout << " -pwl_me \"<double>\" " << pwl_max_error_percent_message << std::endl;
}
/**
* @brief Checks input arguments
* @param argc number of args
* @param argv list of input arguments
* @return bool status true(Success) or false(Fail)
*/
bool ParseAndCheckCommandLine(int argc, char* argv[]) {
slog::info << "Parsing input parameters" << slog::endl;
gflags::ParseCommandLineNonHelpFlags(&argc, &argv, true);
if (FLAGS_h) {
showUsage();
showAvailableDevices();
return false;
}
bool isDumpMode = !FLAGS_wg.empty() || !FLAGS_we.empty();
// input is not required only in dump mode and only when an external scale factor is provided
if (FLAGS_i.empty() && (!isDumpMode || FLAGS_q.compare("user") != 0)) {
showUsage();
if (isDumpMode) {
throw std::logic_error("In model dump mode either static quantization is used (-i) or user scale"
" factor need to be provided. See -q user option");
}
throw std::logic_error("Input file not set. Please use -i.");
}
if (FLAGS_m.empty() && FLAGS_rg.empty()) {
showUsage();
throw std::logic_error("Either IR file (-m) or GNAModel file (-rg) need to be set.");
}
if ((!FLAGS_m.empty() && !FLAGS_rg.empty())) {
throw std::logic_error("Only one of -m and -rg is allowed.");
}
std::vector<std::string> supportedDevices = {"CPU",
"GPU",
"GNA_AUTO",
"GNA_HW",
"GNA_HW_WITH_SW_FBACK",
"GNA_SW_EXACT",
"GNA_SW",
"GNA_SW_FP32",
"HETERO:GNA,CPU",
"HETERO:GNA_HW,CPU",
"HETERO:GNA_SW_EXACT,CPU",
"HETERO:GNA_SW,CPU",
"HETERO:GNA_SW_FP32,CPU",
"MYRIAD"};
if (std::find(supportedDevices.begin(), supportedDevices.end(), FLAGS_d) == supportedDevices.end()) {
throw std::logic_error("Specified device is not supported.");
}
uint32_t batchSize = (uint32_t)FLAGS_bs;
if ((batchSize < 1) || (batchSize > 8)) {
throw std::logic_error("Batch size out of range (1..8).");
}
/** default is static quantization **/
if ((FLAGS_q.compare("static") != 0) && (FLAGS_q.compare("dynamic") != 0) && (FLAGS_q.compare("user") != 0)) {
throw std::logic_error("Quantization mode not supported (static, dynamic, user).");
}
if (FLAGS_q.compare("dynamic") == 0) {
throw std::logic_error("Dynamic quantization not yet supported.");
}
if (FLAGS_qb != 16 && FLAGS_qb != 8) {
throw std::logic_error("Only 8 or 16 bits supported.");
}
if (FLAGS_nthreads <= 0) {
throw std::logic_error("Invalid value for 'nthreads' argument. It must be greater that or equal to 0");
}
if (FLAGS_cw_r < 0) {
throw std::logic_error("Invalid value for 'cw_r' argument. It must be greater than or equal to 0");
}
if (FLAGS_cw_l < 0) {
throw std::logic_error("Invalid value for 'cw_l' argument. It must be greater than or equal to 0");
}
if (FLAGS_pwl_me < 0.0 || FLAGS_pwl_me > 100.0) {
throw std::logic_error("Invalid value for 'pwl_me' argument. It must be greater than 0.0 and less than 100.0");
}
return true;
}

View File

@ -0,0 +1,406 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <cnpy.h>
#include <samples/common.hpp>
#define MAX_SCORE_DIFFERENCE 0.0001f // max score difference for frame error threshold
#define MAX_VAL_2B_FEAT 16384 // max to find scale factor
typedef std::chrono::high_resolution_clock Time;
typedef std::chrono::duration<double, std::ratio<1, 1000>> ms;
typedef std::chrono::duration<float> fsec;
/**
* @brief struct to store score error
*/
typedef struct {
uint32_t numScores;
uint32_t numErrors;
float threshold;
float maxError;
float rmsError;
float sumError;
float sumRmsError;
float sumSquaredError;
float maxRelError;
float sumRelError;
float sumSquaredRelError;
} score_error_t;
/**
* @brief struct to store infer request data per frame
*/
struct InferRequestStruct {
ov::runtime::InferRequest inferRequest;
int frameIndex;
uint32_t numFramesThisBatch;
};
/**
* @brief Check number of input files and model network inputs
* @param numInputs number of model inputs
* @param numInputFiles number of input files
* @return none.
*/
void CheckNumberOfInputs(size_t numInputs, size_t numInputFiles) {
if (numInputs != numInputFiles) {
throw std::logic_error("Number of network inputs (" + std::to_string(numInputs) +
")"
" is not equal to number of input files (" +
std::to_string(numInputFiles) + ")");
}
}
/**
* @brief Get scale factor for quantization
* @param ptrFloatMemory pointer to float memory with speech feature vector
* @param targetMax max scale factor
* @param numElements number of elements in speech feature vector
* @return scale factor
*/
float ScaleFactorForQuantization(void* ptrFloatMemory, float targetMax, uint32_t numElements) {
float* ptrFloatFeat = reinterpret_cast<float*>(ptrFloatMemory);
float max = 0.0;
float scaleFactor;
for (uint32_t i = 0; i < numElements; i++) {
if (fabs(ptrFloatFeat[i]) > max) {
max = fabs(ptrFloatFeat[i]);
}
}
if (max == 0) {
scaleFactor = 1.0;
} else {
scaleFactor = targetMax / max;
}
return (scaleFactor);
}
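A small worked example of the formula above (illustrative only): the factor is targetMax divided by the largest absolute feature value, so features peaking at 2.0 with the 16384 target give 8192.
// Illustrative usage of ScaleFactorForQuantization()
void demo_scale_factor() {
    float features[] = {0.5f, -2.0f, 1.25f};  // max |x| == 2.0
    float sf = ScaleFactorForQuantization(features, MAX_VAL_2B_FEAT, 3);
    // sf == 16384 / 2.0 == 8192
    (void)sf;
}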
/**
* @brief Clean score error
* @param error pointer to score error struct
* @return none.
*/
void ClearScoreError(score_error_t* error) {
error->numScores = 0;
error->numErrors = 0;
error->maxError = 0.0;
error->rmsError = 0.0;
error->sumError = 0.0;
error->sumRmsError = 0.0;
error->sumSquaredError = 0.0;
error->maxRelError = 0.0;
error->sumRelError = 0.0;
error->sumSquaredRelError = 0.0;
}
/**
* @brief Update total score error
* @param error pointer to score error struct
* @param totalError pointer to total score error struct
* @return none.
*/
void UpdateScoreError(score_error_t* error, score_error_t* totalError) {
totalError->numErrors += error->numErrors;
totalError->numScores += error->numScores;
totalError->sumRmsError += error->rmsError;
totalError->sumError += error->sumError;
totalError->sumSquaredError += error->sumSquaredError;
if (error->maxError > totalError->maxError) {
totalError->maxError = error->maxError;
}
totalError->sumRelError += error->sumRelError;
totalError->sumSquaredRelError += error->sumSquaredRelError;
if (error->maxRelError > totalError->maxRelError) {
totalError->maxRelError = error->maxRelError;
}
}
/**
* @brief Compare score errors; the arrays should be of the same length
* @param ptrScoreArray - pointer to score error struct array
* @param ptrRefScoreArray - pointer to score error struct array to compare
* @param scoreError - pointer to score error struct to save a new error
* @param numRows - number of rows in score error arrays
* @param numColumns - number of columns in score error arrays
* @return none.
*/
void CompareScores(float* ptrScoreArray,
void* ptrRefScoreArray,
score_error_t* scoreError,
uint32_t numRows,
uint32_t numColumns) {
uint32_t numErrors = 0;
ClearScoreError(scoreError);
float* A = ptrScoreArray;
float* B = reinterpret_cast<float*>(ptrRefScoreArray);
for (uint32_t i = 0; i < numRows; i++) {
for (uint32_t j = 0; j < numColumns; j++) {
float score = A[i * numColumns + j];
// std::cout << "score" << score << std::endl;
float refscore = B[i * numColumns + j];
float error = fabs(refscore - score);
float rel_error = error / (static_cast<float>(fabs(refscore)) + 1e-20f);
float squared_error = error * error;
float squared_rel_error = rel_error * rel_error;
scoreError->numScores++;
scoreError->sumError += error;
scoreError->sumSquaredError += squared_error;
if (error > scoreError->maxError) {
scoreError->maxError = error;
}
scoreError->sumRelError += rel_error;
scoreError->sumSquaredRelError += squared_rel_error;
if (rel_error > scoreError->maxRelError) {
scoreError->maxRelError = rel_error;
}
if (error > scoreError->threshold) {
numErrors++;
}
}
}
scoreError->rmsError = sqrt(scoreError->sumSquaredError / (numRows * numColumns));
scoreError->sumRmsError += scoreError->rmsError;
scoreError->numErrors = numErrors;
// std::cout << "rmsError=" << scoreError->rmsError << "sumRmsError="<<scoreError->sumRmsError;
}
/**
* @brief Get total stdev error
* @param error pointer to score error struct
* @return error
*/
float StdDevError(score_error_t error) {
return (sqrt(error.sumSquaredError / error.numScores -
(error.sumError / error.numScores) * (error.sumError / error.numScores)));
}
#if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(_M_ARM64)
# ifdef _WIN32
# include <intrin.h>
# include <windows.h>
# else
# include <cpuid.h>
# endif
inline void native_cpuid(unsigned int* eax, unsigned int* ebx, unsigned int* ecx, unsigned int* edx) {
size_t level = *eax;
# ifdef _WIN32
int regs[4] = {static_cast<int>(*eax), static_cast<int>(*ebx), static_cast<int>(*ecx), static_cast<int>(*edx)};
__cpuid(regs, level);
*eax = static_cast<uint32_t>(regs[0]);
*ebx = static_cast<uint32_t>(regs[1]);
*ecx = static_cast<uint32_t>(regs[2]);
*edx = static_cast<uint32_t>(regs[3]);
# else
__get_cpuid(level, eax, ebx, ecx, edx);
# endif
}
/**
* @brief Get GNA module frequency
* @return GNA module frequency in MHz
*/
float getGnaFrequencyMHz() {
uint32_t eax = 1;
uint32_t ebx = 0;
uint32_t ecx = 0;
uint32_t edx = 0;
uint32_t family = 0;
uint32_t model = 0;
const uint8_t sixth_family = 6;
const uint8_t cannon_lake_model = 102;
const uint8_t gemini_lake_model = 122;
const uint8_t ice_lake_model = 126;
const uint8_t tgl_model = 140;
const uint8_t next_model = 151;
native_cpuid(&eax, &ebx, &ecx, &edx);
family = (eax >> 8) & 0xF;
// model is the concatenation of two fields
// | extended model | model |
// copy extended model data
model = (eax >> 16) & 0xF;
// shift
model <<= 4;
// copy model data
model += (eax >> 4) & 0xF;
if (family == sixth_family) {
switch (model) {
case cannon_lake_model:
case ice_lake_model:
case tgl_model:
case next_model:
return 400;
case gemini_lake_model:
return 200;
default:
return 1;
}
} else {
// counters are not supported, so we just return the default value
return 1;
}
}
#endif // if not ARM
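As a worked example of the family/model decoding above (illustrative; the EAX value is an assumed Tiger Lake CPUID signature, and <cstdint> is assumed from the surrounding headers): family comes from bits 11:8, and the model byte is the extended-model nibble concatenated with the model nibble.
// Illustrative only: decoding eax = 0x000806C1 with the same arithmetic as getGnaFrequencyMHz().
void demo_decode_family_model() {
    uint32_t eax = 0x000806C1;
    uint32_t family = (eax >> 8) & 0xF;          // 6 (sixth_family)
    uint32_t model = ((eax >> 16) & 0xF) << 4;   // extended model nibble -> 0x80
    model += (eax >> 4) & 0xF;                   // + model nibble -> 0x8C == 140 (tgl_model)
    (void)family; (void)model;
}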
/**
* @brief Print a report on the statistical score error
* @param totalError reference to a total score error struct
* @param framesNum number of frames in utterance
* @param stream output stream
* @return none.
*/
void printReferenceCompareResults(score_error_t const& totalError, size_t framesNum, std::ostream& stream) {
stream << " max error: " << totalError.maxError << std::endl;
stream << " avg error: " << totalError.sumError / totalError.numScores << std::endl;
stream << " avg rms error: " << totalError.sumRmsError / framesNum << std::endl;
stream << " stdev error: " << StdDevError(totalError) << std::endl << std::endl;
stream << std::endl;
}
/**
* @brief Print a report on the performance counts
* @param utterancePerfMap reference to a map to store performance counters
* @param numberOfFrames number of frames
* @param stream output stream
* @param fullDeviceName full device name string
* @param numberOfFramesOnHw number of frames delivered to GNA HW
* @param FLAGS_d flag of device
* @return none.
*/
void printPerformanceCounters(std::map<std::string, ov::runtime::ProfilingInfo> const& utterancePerfMap,
size_t numberOfFrames,
std::ostream& stream,
std::string fullDeviceName,
const uint64_t numberOfFramesOnHw,
std::string FLAGS_d) {
#if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(_M_ARM64)
stream << std::endl << "Performance counts:" << std::endl;
stream << std::setw(10) << std::right << ""
<< "Counter descriptions";
stream << std::setw(22) << "Utt scoring time";
stream << std::setw(18) << "Avg infer time";
stream << std::endl;
stream << std::setw(46) << "(ms)";
stream << std::setw(24) << "(us per call)";
stream << std::endl;
// if GNA HW counters
// get frequency of GNA module
float freq = getGnaFrequencyMHz();
for (const auto& it : utterancePerfMap) {
std::string const& counter_name = it.first;
float current_units_us = static_cast<float>(it.second.real_time.count()) / freq;
float call_units_us = current_units_us / numberOfFrames;
if (FLAGS_d.find("GNA") != std::string::npos) {
stream << std::setw(30) << std::left << counter_name.substr(4, counter_name.size() - 1);
} else {
stream << std::setw(30) << std::left << counter_name;
}
stream << std::setw(16) << std::right << current_units_us / 1000;
stream << std::setw(21) << std::right << call_units_us;
stream << std::endl;
}
stream << std::endl;
std::cout << std::endl;
std::cout << "Full device name: " << fullDeviceName << std::endl;
std::cout << std::endl;
stream << "Number of frames delivered to GNA HW: " << numberOfFramesOnHw;
stream << "/" << numberOfFrames;
stream << std::endl;
#endif
}
/**
* @brief Get performance counts
* @param request reference to infer request
* @param perfCounters reference to a map to save performance counters
* @return none.
*/
void getPerformanceCounters(ov::runtime::InferRequest& request,
std::map<std::string, ov::runtime::ProfilingInfo>& perfCounters) {
auto retPerfCounters = request.get_profiling_info();
for (const auto& element : retPerfCounters) {
perfCounters[element.node_name] = element;
}
}
/**
* @brief Summarize performance counts and total number of frames executed on the GNA HW device
* @param perfCounters reference to a map to get performance counters
* @param totalPerfCounters reference to a map to save total performance counters
* @param totalRunsOnHw reference to a total number of frames computed on GNA HW
* @return none.
*/
void sumPerformanceCounters(std::map<std::string, ov::runtime::ProfilingInfo> const& perfCounters,
std::map<std::string, ov::runtime::ProfilingInfo>& totalPerfCounters,
uint64_t& totalRunsOnHw) {
auto runOnHw = false;
for (const auto& pair : perfCounters) {
totalPerfCounters[pair.first].real_time += pair.second.real_time;
runOnHw |= pair.second.real_time > std::chrono::microseconds(0); // if realTime is above zero, that means that
// a primitive was executed on the device
}
totalRunsOnHw += runOnHw;
}
/**
* @brief Parse scale factors
* @param str reference to a user-specified string of input scale factors for quantization; values can be separated by commas
* @return vector of scale factors
*/
std::vector<std::string> ParseScaleFactors(const std::string& str) {
std::vector<std::string> scaleFactorInput;
if (!str.empty()) {
std::string outStr;
std::istringstream stream(str);
int i = 0;
while (getline(stream, outStr, ',')) {
auto floatScaleFactor = std::stof(outStr);
if (floatScaleFactor <= 0.0f) {
throw std::logic_error("Scale factor for input #" + std::to_string(i) +
" (counting from zero) is out of range (must be positive).");
}
scaleFactorInput.push_back(outStr);
i++;
}
} else {
throw std::logic_error("Scale factor need to be specified via -sf option if you are using -q user");
}
return scaleFactorInput;
}
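For illustration (a sketch, assuming this header is included), the -sf value is a comma-separated list with one strictly positive factor per input:
// Illustrative usage of ParseScaleFactors(); comments show the expected behaviour.
void demo_parse_scale_factors() {
    auto factors = ParseScaleFactors("2048.5,1024");
    // factors == {"2048.5", "1024"}
    // ParseScaleFactors("0,1024")  -> throws: factors must be positive
    // ParseScaleFactors("")        -> throws: -sf is mandatory with -q user
    (void)factors;
}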
/**
* @brief Parse a comma-separated string of file names into a vector of file names
* @param str file names separated by comma
* @return vector of file names
*/
std::vector<std::string> ConvertStrToVector(std::string str) {
std::vector<std::string> blobName;
if (!str.empty()) {
size_t pos_last = 0;
size_t pos_next = 0;
while ((pos_next = str.find(",", pos_last)) != std::string::npos) {
blobName.push_back(str.substr(pos_last, pos_next - pos_last));
pos_last = pos_next + 1;
}
blobName.push_back(str.substr(pos_last));
}
return blobName;
}

View File

@ -31,19 +31,19 @@ endif()
if(ENABLE_OV_IR_FRONTEND)
if(BUILD_SHARED_LIBS)
add_dependencies(ov_runtime_libraries ir_ov_frontend)
add_dependencies(ov_runtime_libraries ov_ir_frontend)
endif()
# use this one once CVS-69781 is fixed
# add_dependencies(inference_engine ir_ov_frontend)
# add_dependencies(inference_engine ov_ir_frontend)
endif()
if(ENABLE_OV_ONNX_FRONTEND)
add_dependencies(inference_engine onnx_ov_frontend)
add_dependencies(inference_engine ov_onnx_frontend)
endif()
if(ENABLE_OV_PDPD_FRONTEND)
add_dependencies(inference_engine paddlepaddle_ov_frontend)
add_dependencies(inference_engine ov_paddlepaddle_frontend)
endif()
if(ENABLE_OV_TF_FRONTEND)
add_dependencies(inference_engine tensorflow_ov_frontend)
add_dependencies(inference_engine ov_tensorflow_frontend)
endif()

View File

@ -17,9 +17,9 @@ set(LIBRARY_OUTPUT_DIRECTORY_BIN ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
add_subdirectory(src)
if(ENABLE_OV_CORE_UNIT_TESTS)
add_subdirectory(tests/mock/mock_py_ov_frontend)
add_dependencies(pyopenvino mock_py_ov_frontend)
set_target_properties(mock_py_ov_frontend PROPERTIES
add_subdirectory(tests/mock/ov_mock_py_frontend)
add_dependencies(pyopenvino ov_mock_py_frontend)
set_target_properties(ov_mock_py_frontend PROPERTIES
LIBRARY_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_DIRECTORY_BIN}
ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_DIRECTORY_BIN}
COMPILE_PDB_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_DIRECTORY_BIN}

View File

@ -41,6 +41,8 @@ from openvino.pyopenvino import FrontEnd
from openvino.pyopenvino import InputModel
from openvino.pyopenvino import Place
from openvino.pyopenvino import TelemetryExtension
from openvino.pyopenvino import DecoderTransformationExtension
from openvino.pyopenvino import JsonConfigExtension
# exceptions
from openvino.pyopenvino import NotImplementedFailure

View File

@ -321,68 +321,64 @@ py::dict outputs_to_dict(const std::vector<ov::Output<const ov::Node>>& outputs,
ov::runtime::Tensor t{request.get_tensor(out)};
switch (t.get_element_type()) {
case ov::element::Type_t::i8: {
py::array arr(t.get_shape(), t.data<int8_t>());
res[py::cast(out)] = arr;
res[py::cast(out)] = py::array_t<int8_t>(t.get_shape(), t.data<int8_t>());
;
break;
}
case ov::element::Type_t::i16: {
py::array arr(t.get_shape(), t.data<int16_t>());
res[py::cast(out)] = arr;
res[py::cast(out)] = py::array_t<int16_t>(t.get_shape(), t.data<int16_t>());
;
break;
}
case ov::element::Type_t::i32: {
py::array arr(t.get_shape(), t.data<int32_t>());
res[py::cast(out)] = arr;
res[py::cast(out)] = py::array_t<int32_t>(t.get_shape(), t.data<int32_t>());
;
break;
}
case ov::element::Type_t::i64: {
py::array arr(t.get_shape(), t.data<int64_t>());
res[py::cast(out)] = arr;
res[py::cast(out)] = py::array_t<int64_t>(t.get_shape(), t.data<int64_t>());
;
break;
}
case ov::element::Type_t::u8: {
py::array arr(t.get_shape(), t.data<uint8_t>());
res[py::cast(out)] = arr;
res[py::cast(out)] = py::array_t<uint8_t>(t.get_shape(), t.data<uint8_t>());
;
break;
}
case ov::element::Type_t::u16: {
py::array arr(t.get_shape(), t.data<uint16_t>());
res[py::cast(out)] = arr;
res[py::cast(out)] = py::array_t<uint16_t>(t.get_shape(), t.data<uint16_t>());
break;
}
case ov::element::Type_t::u32: {
py::array arr(t.get_shape(), t.data<uint32_t>());
res[py::cast(out)] = arr;
res[py::cast(out)] = py::array_t<uint32_t>(t.get_shape(), t.data<uint32_t>());
;
break;
}
case ov::element::Type_t::u64: {
py::array arr(t.get_shape(), t.data<uint64_t>());
res[py::cast(out)] = arr;
res[py::cast(out)] = py::array_t<uint64_t>(t.get_shape(), t.data<uint64_t>());
break;
}
case ov::element::Type_t::bf16: {
py::array arr(t.get_shape(), t.data<ov::bfloat16>());
res[py::cast(out)] = arr.view("int16");
res[py::cast(out)] = py::array(py::dtype("float16"), t.get_shape(), t.data<ov::bfloat16>());
break;
}
case ov::element::Type_t::f16: {
py::array arr(t.get_shape(), t.data<ov::float16>());
res[py::cast(out)] = arr.view("int16");
res[py::cast(out)] = py::array(py::dtype("float16"), t.get_shape(), t.data<ov::float16>());
break;
}
case ov::element::Type_t::f32: {
py::array arr(t.get_shape(), t.data<float>());
res[py::cast(out)] = arr;
res[py::cast(out)] = py::array_t<float>(t.get_shape(), t.data<float>());
;
break;
}
case ov::element::Type_t::f64: {
py::array arr(t.get_shape(), t.data<double>());
res[py::cast(out)] = arr;
res[py::cast(out)] = py::array_t<double>(t.get_shape(), t.data<double>());
;
break;
}
case ov::element::Type_t::boolean: {
py::array arr(t.get_shape(), t.data<bool*>());
res[py::cast(out)] = arr;
res[py::cast(out)] = py::array_t<bool>(t.get_shape(), t.data<bool>());
;
break;
}
default: {

View File

@ -0,0 +1,16 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <pybind11/functional.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/stl_bind.h>
#include "openvino/frontend/manager.hpp"
namespace py = pybind11;
void regclass_Extension(py::module m) {
py::class_<ov::Extension, std::shared_ptr<ov::Extension>> ext(m, "Extension", py::dynamic_attr());
}

View File

@ -0,0 +1,11 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <pybind11/pybind11.h>
namespace py = pybind11;
void regclass_Extension(py::module m);

View File

@ -65,7 +65,6 @@ void regclass_InferRequest(py::module m) {
self._start_time = Time::now();
self._request.infer();
self._end_time = Time::now();
return Common::outputs_to_dict(self._outputs, self._request);
},
py::arg("inputs"));

View File

@ -0,0 +1,56 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <pybind11/functional.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/stl_bind.h>
#include "extension/json_config.hpp"
#include "manager.hpp"
#include "openvino/frontend/exception.hpp"
#include "openvino/frontend/extension/decoder_transformation.hpp"
#include "openvino/frontend/extension/telemetry.hpp"
#include "pyopenvino/graph/function.hpp"
namespace py = pybind11;
using namespace ov::frontend;
void regclass_frontend_TelemetryExtension(py::module m) {
py::class_<TelemetryExtension, std::shared_ptr<TelemetryExtension>, ov::Extension> ext(m,
"TelemetryExtension",
py::dynamic_attr());
ext.def(py::init([](const std::string& event_category,
const TelemetryExtension::event_callback& send_event,
const TelemetryExtension::error_callback& send_error,
const TelemetryExtension::error_callback& send_stack_trace) {
return std::make_shared<TelemetryExtension>(event_category, send_event, send_error, send_stack_trace);
}));
ext.def("send_event", &TelemetryExtension::send_event);
ext.def("send_error", &TelemetryExtension::send_error);
ext.def("send_stack_trace", &TelemetryExtension::send_stack_trace);
}
void regclass_frontend_DecoderTransformationExtension(py::module m) {
py::class_<ov::frontend::DecoderTransformationExtension,
std::shared_ptr<ov::frontend::DecoderTransformationExtension>,
ov::Extension>
ext(m, "DecoderTransformationExtension", py::dynamic_attr());
}
void regclass_frontend_JsonConfigExtension(py::module m) {
py::class_<ov::frontend::JsonConfigExtension,
std::shared_ptr<ov::frontend::JsonConfigExtension>,
ov::frontend::DecoderTransformationExtension>
ext(m, "JsonConfigExtension", py::dynamic_attr());
ext.doc() = "Extension class to load and process ModelOptimizer JSON config file";
ext.def(py::init([](const std::string& path) {
return std::make_shared<ov::frontend::JsonConfigExtension>(path);
}));
}

View File

@ -0,0 +1,13 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <pybind11/pybind11.h>
namespace py = pybind11;
void regclass_frontend_TelemetryExtension(py::module m);
void regclass_frontend_DecoderTransformationExtension(py::module m);
void regclass_frontend_JsonConfigExtension(py::module m);

View File

@ -8,8 +8,8 @@
#include <pybind11/stl_bind.h>
#include "openvino/frontend/exception.hpp"
#include "openvino/frontend/extension/telemetry.hpp"
#include "openvino/frontend/manager.hpp"
#include "openvino/frontend/telemetry_extension.hpp"
#include "pyopenvino/graph/function.hpp"
namespace py = pybind11;
@ -41,7 +41,7 @@ void regclass_frontend_FrontEnd(py::module m) {
)");
fem.def("convert",
static_cast<std::shared_ptr<ov::Model> (FrontEnd::*)(InputModel::Ptr) const>(&FrontEnd::convert),
static_cast<std::shared_ptr<ov::Model> (FrontEnd::*)(const InputModel::Ptr&) const>(&FrontEnd::convert),
py::arg("model"),
R"(
Completely convert and normalize entire function, throws if it is not possible.
@ -58,7 +58,7 @@ void regclass_frontend_FrontEnd(py::module m) {
)");
fem.def("convert",
static_cast<void (FrontEnd::*)(std::shared_ptr<ov::Model>) const>(&FrontEnd::convert),
static_cast<void (FrontEnd::*)(const std::shared_ptr<ov::Model>&) const>(&FrontEnd::convert),
py::arg("function"),
R"(
Completely convert the remaining, not converted part of a function.
@ -143,26 +143,3 @@ void regclass_frontend_FrontEnd(py::module m) {
return "<FrontEnd '" + self.get_name() + "'>";
});
}
void regclass_frontend_Extension(py::module m) {
py::class_<ov::Extension, std::shared_ptr<ov::Extension>> ext(m, "Extension", py::dynamic_attr());
}
void regclass_frontend_TelemetryExtension(py::module m) {
{
py::class_<TelemetryExtension, std::shared_ptr<TelemetryExtension>, ov::Extension> ext(m,
"TelemetryExtension",
py::dynamic_attr());
ext.def(py::init([](const std::string& event_category,
const TelemetryExtension::event_callback& send_event,
const TelemetryExtension::error_callback& send_error,
const TelemetryExtension::error_callback& send_stack_trace) {
return std::make_shared<TelemetryExtension>(event_category, send_event, send_error, send_stack_trace);
}));
ext.def("send_event", &TelemetryExtension::send_event);
ext.def("send_error", &TelemetryExtension::send_error);
ext.def("send_stack_trace", &TelemetryExtension::send_stack_trace);
}
}

View File

@ -9,5 +9,3 @@
namespace py = pybind11;
void regclass_frontend_FrontEnd(py::module m);
void regclass_frontend_Extension(py::module m);
void regclass_frontend_TelemetryExtension(py::module m);

View File

@ -14,4 +14,3 @@ void regclass_frontend_InitializationFailureFrontEnd(py::module m);
void regclass_frontend_OpConversionFailureFrontEnd(py::module m);
void regclass_frontend_OpValidationFailureFrontEnd(py::module m);
void regclass_frontend_GeneralFailureFrontEnd(py::module m);

View File

@ -24,6 +24,7 @@
#include "pyopenvino/core/compiled_model.hpp"
#include "pyopenvino/core/containers.hpp"
#include "pyopenvino/core/core.hpp"
#include "pyopenvino/core/extension.hpp"
#include "pyopenvino/core/ie_parameter.hpp"
#include "pyopenvino/core/infer_request.hpp"
#include "pyopenvino/core/offline_transformations.hpp"
@ -31,6 +32,7 @@
#include "pyopenvino/core/tensor.hpp"
#include "pyopenvino/core/variable_state.hpp"
#include "pyopenvino/core/version.hpp"
#include "pyopenvino/frontend/extensions.hpp"
#include "pyopenvino/frontend/frontend.hpp"
#include "pyopenvino/frontend/inputmodel.hpp"
#include "pyopenvino/frontend/manager.hpp"
@ -124,6 +126,7 @@ PYBIND11_MODULE(pyopenvino, m) {
regclass_Parameter(m);
regclass_AsyncInferQueue(m);
regclass_ProfilingInfo(m);
regclass_Extension(m);
regclass_frontend_Place(m);
regclass_frontend_InitializationFailureFrontEnd(m);
@ -131,11 +134,12 @@ PYBIND11_MODULE(pyopenvino, m) {
regclass_frontend_OpConversionFailureFrontEnd(m);
regclass_frontend_OpValidationFailureFrontEnd(m);
regclass_frontend_NotImplementedFailureFrontEnd(m);
regclass_frontend_Extension(m);
regclass_frontend_FrontEndManager(m);
regclass_frontend_FrontEnd(m);
regclass_frontend_InputModel(m);
regclass_frontend_TelemetryExtension(m);
regclass_frontend_DecoderTransformationExtension(m);
regclass_frontend_JsonConfigExtension(m);
regmodule_offline_transformations(m);
}

View File

@ -2,7 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
#
set(TARGET_FE_NAME "mock_py_ov_frontend")
set(TARGET_FE_NAME "ov_mock_py_frontend")
file(GLOB_RECURSE LIBRARY_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
file(GLOB_RECURSE LIBRARY_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp)

View File

@ -9,11 +9,11 @@
#include "openvino/frontend/visibility.hpp"
// Defined if we are building the plugin DLL (instead of using it)
#ifdef mock_py_ov_frontend_EXPORTS
#ifdef ov_mock_py_frontend_EXPORTS
# define MOCK_API OPENVINO_CORE_EXPORTS
#else
# define MOCK_API OPENVINO_CORE_IMPORTS
#endif // mock_py_ov_frontend_EXPORTS
#endif // ov_mock_py_frontend_EXPORTS
// OK to have 'using' in mock header
@ -257,13 +257,13 @@ public:
return false;
}
bool is_equal(Ptr another) const override {
bool is_equal(const Ptr& another) const override {
m_stat.m_is_equal++;
m_stat.m_lastArgPlace = another;
return false;
}
bool is_equal_data(Ptr another) const override {
bool is_equal_data(const Ptr& another) const override {
m_stat.m_is_equal_data++;
m_stat.m_lastArgPlace = another;
return false;
@ -471,19 +471,19 @@ public:
return std::make_shared<PlaceMockPy>();
}
void set_name_for_tensor(Place::Ptr tensor, const std::string& newName) override {
void set_name_for_tensor(const Place::Ptr& tensor, const std::string& newName) override {
m_stat.m_set_name_for_tensor++;
m_stat.m_lastArgPlace = tensor;
m_stat.m_lastArgString = newName;
}
void add_name_for_tensor(Place::Ptr tensor, const std::string& newName) override {
void add_name_for_tensor(const Place::Ptr& tensor, const std::string& newName) override {
m_stat.m_add_name_for_tensor++;
m_stat.m_lastArgPlace = tensor;
m_stat.m_lastArgString = newName;
}
void set_name_for_operation(Place::Ptr operation, const std::string& newName) override {
void set_name_for_operation(const Place::Ptr& operation, const std::string& newName) override {
m_stat.m_set_name_for_operation++;
m_stat.m_lastArgPlace = operation;
m_stat.m_lastArgString = newName;
@ -499,32 +499,32 @@ public:
m_stat.m_lastArgString = name;
}
void set_name_for_dimension(Place::Ptr place, size_t shapeDimIndex, const std::string& dimName) override {
void set_name_for_dimension(const Place::Ptr& place, size_t shapeDimIndex, const std::string& dimName) override {
m_stat.m_set_name_for_dimension++;
m_stat.m_lastArgPlace = place;
m_stat.m_lastArgInt = static_cast<int>(shapeDimIndex);
m_stat.m_lastArgString = dimName;
}
void cut_and_add_new_input(Place::Ptr place, const std::string& newNameOptional) override {
void cut_and_add_new_input(const Place::Ptr& place, const std::string& newNameOptional) override {
m_stat.m_cut_and_add_new_input++;
m_stat.m_lastArgPlace = place;
m_stat.m_lastArgString = newNameOptional;
}
void cut_and_add_new_output(Place::Ptr place, const std::string& newNameOptional) override {
void cut_and_add_new_output(const Place::Ptr& place, const std::string& newNameOptional) override {
m_stat.m_cut_and_add_new_output++;
m_stat.m_lastArgPlace = place;
m_stat.m_lastArgString = newNameOptional;
}
Place::Ptr add_output(Place::Ptr place) override {
Place::Ptr add_output(const Place::Ptr& place) override {
m_stat.m_add_output++;
m_stat.m_lastArgPlace = place;
return std::make_shared<PlaceMockPy>();
}
void remove_output(Place::Ptr place) override {
void remove_output(const Place::Ptr& place) override {
m_stat.m_remove_output++;
m_stat.m_lastArgPlace = place;
}
@ -546,19 +546,19 @@ public:
}
// Setting tensor properties
void set_partial_shape(Place::Ptr place, const ngraph::PartialShape& shape) override {
void set_partial_shape(const Place::Ptr& place, const ngraph::PartialShape& shape) override {
m_stat.m_set_partial_shape++;
m_stat.m_lastArgPlace = place;
m_stat.m_lastArgPartialShape = shape;
}
ngraph::PartialShape get_partial_shape(Place::Ptr place) const override {
ngraph::PartialShape get_partial_shape(const Place::Ptr& place) const override {
m_stat.m_get_partial_shape++;
m_stat.m_lastArgPlace = place;
return {};
}
void set_element_type(Place::Ptr place, const ngraph::element::Type& type) override {
void set_element_type(const Place::Ptr& place, const ngraph::element::Type& type) override {
m_stat.m_set_element_type++;
m_stat.m_lastArgPlace = place;
m_stat.m_lastArgElementType = type;
@ -631,26 +631,26 @@ public:
return false;
}
std::shared_ptr<ov::Model> convert(InputModel::Ptr model) const override {
std::shared_ptr<ov::Model> convert(const InputModel::Ptr& model) const override {
m_stat.m_convert_model++;
return std::make_shared<ov::Model>(ov::NodeVector{}, ov::ParameterVector{});
}
void convert(std::shared_ptr<ov::Model> func) const override {
void convert(const std::shared_ptr<ov::Model>& func) const override {
m_stat.m_convert++;
}
std::shared_ptr<ov::Model> convert_partially(InputModel::Ptr model) const override {
std::shared_ptr<ov::Model> convert_partially(const InputModel::Ptr& model) const override {
m_stat.m_convert_partially++;
return std::make_shared<ov::Model>(ov::NodeVector{}, ov::ParameterVector{});
}
std::shared_ptr<ov::Model> decode(InputModel::Ptr model) const override {
std::shared_ptr<ov::Model> decode(const InputModel::Ptr& model) const override {
m_stat.m_decode++;
return std::make_shared<ov::Model>(ov::NodeVector{}, ov::ParameterVector{});
}
void normalize(std::shared_ptr<ov::Model> function) const override {
void normalize(const std::shared_ptr<ov::Model>& function) const override {
m_stat.m_normalize++;
}

View File

@ -2,7 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
#
set(TARGET_FE_NAME "mock_py_ov_frontend")
set(TARGET_FE_NAME "ov_mock_py_frontend")
set(PYBIND_FE_NAME "pybind_mock_frontend")
set(PYBIND_FE_SRC ${CMAKE_CURRENT_SOURCE_DIR}/pyngraph_mock_frontend_api.cpp)

View File

@ -5,7 +5,7 @@
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include "../mock_py_ov_frontend/mock_py_frontend.hpp"
#include "../ov_mock_py_frontend/mock_py_frontend.hpp"
namespace py = pybind11;
using namespace ngraph;

View File

@ -51,7 +51,7 @@ from openvino.frontend import FrontEndManager
def create_test_onnx_models():
models = {}
# Input model 1
add = onnx.helper.make_node("Add", inputs=["in1", "in2"], outputs=["add_out"])
add = onnx.helper.make_node("Add", inputs=["in1", "in2"], outputs=["add_out"], name="onnx_add_op")
split = onnx.helper.make_node("Split", inputs=["add_out"],
outputs=["out1", "out2"], name="split1", axis=0)
relu = onnx.helper.make_node("Relu", inputs=["in3"], outputs=["out3"])
@ -1205,3 +1205,48 @@ def test_set_name_for_dimension():
with pytest.raises(Exception) as e:
model.set_name_for_dimension(one_const, 0, dim_name)
assert "ONNX initializer shape dimension cannot be dynamic." in str(e)
def test_set_input_partial_shape_using_input_edge():
skip_if_onnx_frontend_is_disabled()
fe = fem.load_by_framework(framework=ONNX_FRONTEND_NAME)
model = fe.load("input_model.onnx")
add_operator = model.get_place_by_operation_name("onnx_add_op")
add_input_edge = add_operator.get_input_port(inputPortIndex=0)
model.set_partial_shape(add_input_edge, PartialShape([10, 10]))
add_input_edge = add_operator.get_input_port(inputPortIndex=1)
model.set_partial_shape(add_input_edge, PartialShape([1]))
ov_model = fe.convert(model)
assert ov_model.input("in1").get_partial_shape() == PartialShape([10, 10])
assert ov_model.input("in2").get_partial_shape() == PartialShape([1])
assert ov_model.output("out4").get_partial_shape() == PartialShape([10, 10])
def test_get_partial_shape_using_input_edge():
skip_if_onnx_frontend_is_disabled()
fe = fem.load_by_framework(framework=ONNX_FRONTEND_NAME)
model = fe.load("input_model.onnx")
add_operator = model.get_place_by_operation_name("onnx_add_op")
add_input_edge = add_operator.get_input_port(inputPortIndex=0)
pshape = model.get_partial_shape(add_input_edge)
assert pshape == PartialShape([2, 2])
def test_get_partial_shape_using_output_edge():
skip_if_onnx_frontend_is_disabled()
fe = fem.load_by_framework(framework=ONNX_FRONTEND_NAME)
model = fe.load("input_model.onnx")
add_operator = model.get_place_by_operation_name("onnx_add_op")
add_output_edge = add_operator.get_output_port(outputPortIndex=0)
assert model.get_partial_shape(add_output_edge) == PartialShape([2, 2])
split_operator = model.get_place_by_tensor_name("out1").get_producing_operation()
out2_edge = split_operator.get_output_port(outputPortIndex=1)
assert model.get_partial_shape(out2_edge) == PartialShape([1, 2])

Some files were not shown because too many files have changed in this diff